In [2]:
!pip install transformers

In [3]:
# Standard libraries
import os
import random
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

# Deep learning and array processing
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Model-specific imports
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Utilities and metrics
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score


In [4]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
path = "/content/drive/MyDrive/cs688project2/data/"

In [6]:
test_data = pd.read_json(path+'dev_seen.jsonl',lines=True)
dev_unseen_data= pd.read_json(path+'dev_unseen.jsonl',lines=True)
final_test_data=pd.read_json(path+'test_seen.jsonl',lines=True)
test_unseen_data=pd.read_json(path+'test_unseen.jsonl',lines=True)
train_data=pd.read_json(path+'train.jsonl',lines=True)
test_unseen_data.head()

Unnamed: 0,id,img,label,text
0,15740,img/15740.png,1,when someone tells you how to bbq
1,38794,img/38794.png,1,when they say white folks don't know how to cook
2,60792,img/60792.png,1,the original derp-face
3,71824,img/71824.png,1,okay here you go! you piece of shit!
4,4796,img/04796.png,1,xboxone farming 1619 simulator


In [7]:
# Initialize the tokenizer from the 'bert-base-uncased' model checkpoint.
# The text will be converted to lowercase because 'do_lower_case' is set to True.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
def encode_data(tokenizer, texts, max_length=256):
    """Encode texts using the given tokenizer.

    Args:
        tokenizer: Tokenizer object used for encoding.
        texts (array-like): List of texts to be encoded.
        max_length (int): Maximum sequence length for the encoded text.

    Returns:
        dict: Dictionary containing input_ids and attention_mask.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )

# Encode the training and testing data
encoded_data_train = encode_data(tokenizer, train_data.text.values)
encoded_data_val = encode_data(tokenizer, test_data.text.values)

# Extract 'input_ids' and 'attention_mask' from the encoded data for training set
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_data.label.values)

# Extract 'input_ids' and 'attention_mask' from the encoded data for validation set
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(test_data.label.values)




In [9]:
# Create a TensorDataset from encoded training data.
# This will allow the DataLoader to efficiently manage batches from these tensors.
# - input_ids_train: Tensor of token ids for each sentence in the training set.
# - attention_masks_train: Tensor indicating the position of the tokens that should be attended to.
# - labels_train: Tensor of labels for each sentence in the training set.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

# Similarly, create a TensorDataset for the validation data.
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)


In [10]:

# Load the pre-trained BERT model for sequence classification.
# - 'bert-base-uncased': Specifies the BERT model to be used. Here, the base-sized BERT model with uncased vocab is used.
# - num_labels: Specifies the number of unique labels in the dataset. This defines the number of output units in the last layer.
# - output_attentions: If True, the model will return the attention weights, otherwise not.
# - output_hidden_states: If True, the model will return all hidden-states, otherwise not.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(train_data.label.unique()),
    output_attentions=False,
    output_hidden_states=False
)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Set batch size for data loading.
batch_size = 32

# Create a dataloader for the training dataset.
# - dataset_train: The training dataset constructed from input IDs, attention masks, and labels.
# - RandomSampler(dataset_train): A sampler that returns random indices from the dataset, ensuring each data point is used once per epoch.
# - batch_size: Number of samples per batch.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Create a dataloader for the validation dataset.
# Similar to the training dataloader but for the validation set.
dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=batch_size
)


In [12]:
# Initialize the AdamW optimizer from the transformers library.
# AdamW is a class of Adam optimizers which stands for 'Adam Weight Decay fix'.
# It's recommended for use with transformer models.

# Parameters:
# - model.parameters(): All the parameters of the BERT model.
# - lr=1e-5: Learning rate. This is the rate at which model adjusts based on errors it makes.
# - eps=1e-8: Epsilon. A small number to prevent division by zero.

optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8
)




In [13]:
# Define the number of epochs for training.
# An epoch is one complete forward and backward pass of all the training examples.
epochs = 30

# Initialize a learning rate scheduler.
# This scheduler adjusts the learning rate over time, starting with a higher rate and reducing it.
# A linear scheduler with warmup increases the learning rate linearly for a given number of warmup steps.
# After warmup, it then decreases the learning rate linearly.

# Parameters:
# - optimizer: The optimizer to adjust the learning rate for.
# - num_warmup_steps=0: Number of warm-up steps. During these steps, the learning rate increases.
# - num_training_steps: Total number of training steps, calculated as number of batches times epochs.

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train) * epochs
)


In [14]:
def auc(preds, labels):
    """
    Compute the Area Under the Receiver Operating Characteristic Curve (AUC ROC) from prediction scores.

    Parameters:
    - preds (array-like): Predicted values. For multi-class, this should be the probabilities of each class.
    - labels (array-like): True labels.

    Returns:
    - float: AUC ROC score.
    """

    # Flatten and get the class with the highest probability as the prediction for each sample
    preds_flat = np.argmax(preds, axis=1).flatten()

    # Flatten the true labels
    labels_flat = labels.flatten()

    # Calculate the AUC ROC score using the true labels and predicted probabilities
    return roc_auc_score(labels_flat, preds_flat)


In [15]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [16]:
# Determine the device (CUDA if available, else CPU) to which the model should be moved
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the appropriate device for computations (either GPU or CPU)
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
def evaluate(dataloader_val):
    """
    Evaluate the model on validation data.

    Parameters:
    - dataloader_val (DataLoader): DataLoader for the validation set.

    Returns:
    - tuple: Average validation loss, concatenated predictions and true values.
    """

    # Set the model to evaluation mode
    model.eval()

    # Initialize the total validation loss to 0
    loss_val_total = 0
    predictions, true_vals = [], []

    # Iterate over batches from the validation dataloader
    for batch in tqdm(dataloader_val):

        # Move the batch tensors to the same device as the model
        batch = tuple(b.to(device) for b in batch)

        # Create a dictionary of input tensors for the model
        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2]
        }

        # Disable gradient computations (since we are in evaluation mode)
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract loss and logits from the outputs
        loss = outputs[0]
        logits = outputs[1]

        # Accumulate the validation loss
        loss_val_total += loss.item()

        # Detach and move logits and labels to the CPU
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()

        # Append current batch's predictions and true labels to lists
        predictions.append(logits)
        true_vals.append(label_ids)

    # Compute the average validation loss
    loss_val_avg = loss_val_total / len(dataloader_val)

    # Concatenate all predictions and true values from each batch into single arrays
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


In [18]:
for epoch in tqdm(range(1, epochs+1)):
    """
    Main training loop.
    """

    # Put model in training mode
    model.train()

    # Initialize the total training loss to 0
    loss_train_total = 0

    # Define a progress bar for the training batches
    progress_bar = tqdm(dataloader_train,
                        desc=f'Epoch {epoch}',
                        leave=False,
                        disable=False)

    # Iterate over training batches
    for batch in progress_bar:

        # Reset the gradients before processing a new batch
        model.zero_grad()

        # Move the batch tensors to the same device as the model
        batch = tuple(b.to(device) for b in batch)

        # Create a dictionary of input tensors for the model
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        # Fetch outputs by passing inputs to the model
        outputs = model(**inputs)

        # Extract the loss from the outputs
        loss = outputs[0]

        # Accumulate the training loss
        loss_train_total += loss.item()

        # Backward pass to compute gradients
        loss.backward()

        # Clip the gradients to prevent them from exploding
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update model parameters
        optimizer.step()

        # Update the learning rate
        scheduler.step()

        # Update the progress bar with the current batch's loss
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    # Save the model after each epoch
    torch.save(model.state_dict(), f'/content/drive/MyDrive/cs688project2/data/models/BERT2_ft_epoch{epoch}.model')

    # Display the epoch number
    tqdm.write(f'\nEpoch {epoch}')

    # Compute average training loss for the epoch
    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Evaluate the model on validation data
    val_loss, predictions, true_value = evaluate(dataloader_val)

    # Compute AUC for the validation predictions
    val_AUC = auc(predictions, true_value)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'AUC Score: {val_AUC}')


  0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/266 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.5753253666978133


  0%|          | 0/16 [00:00<?, ?it/s]

Validation loss: 0.764732800424099
AUC Score: 0.5767230481189292


Epoch 2:   0%|          | 0/266 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# load a pretrained model
model.load_state_dict(
    torch.load('/content/drive/MyDrive/cs688project2/data/models/BERT2_ft_epoch22.model',
    map_location = torch.device('cpu'))
)

In [None]:
# Get the predictions for the hold out test set
loss_value_avg, predictions, true_vals = evaluate(dataloader_val)

In [None]:
# Get the AUC on the hold out test set
final_test_auc = auc(predictions, true_vals)
acc = accuracy_score(pred, true_vals)

print(final_test_auc)
print(acc)
