In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
! pip install transformers
! pip install datasets
! pip install --upgrade tqdm
! pip install -U accelerate
! pip install pytorch-lightning



In [49]:
# Standard libraries
import argparse
import logging
import os
import shutil
import time
from pathlib import Path
from string import punctuation

# Third-party libraries
import numpy as np
import pandas as pd
from PIL import Image
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils import class_weight

# Transformers and related
import accelerate
import transformers
from transformers import (
    AdamW,
    BertTokenizer,
    VisualBertForPreTraining,
    VisualBertConfig,
    VisualBertModel,
    ViTFeatureExtractor,
    ViTModel,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from datasets import load_metric

In [50]:
# Set the directory path for the model checkpoint
dirpath = '/content/drive/Mydrive/css688project2/model-checkpoint'

# Check if the specified path exists and is a directory
if os.path.exists(dirpath) and os.path.isdir(dirpath):
    # If so, remove the directory and its contents
    shutil.rmtree(dirpath)

In [51]:
#Link to the path of the data folder
path = "/content/drive/MyDrive/cs688project2/data/"

In [52]:
#Read the data file with all data and split into train and test
df = pd.read_csv(path + 'final_cleaned_with_all_tags.csv')
df_train = df[:8500]
df_val = df[8500:9540]

In [53]:
# Convert the 'id' column to a string type and pad it with zeros to a width of 5 characters for the training dataframe
df_train['idx'] = df_train['id'].astype(str).str.zfill(5)

# Convert the 'id' column to a string type and pad it with zeros to a width of 5 characters for the validation dataframe
df_val['idx'] = df_val['id'].astype(str).str.zfill(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['idx'] = df_train['id'].astype(str).str.zfill(5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['idx'] = df_val['id'].astype(str).str.zfill(5)


## Compute Class Weight

In [54]:
# Extract the "label" column from the training dataframe and convert it to a list.
y_train = df_train["label"].values.tolist()

# Compute class weights using the 'balanced' strategy. This is useful when dealing with imbalanced datasets.
# The 'balanced' mode uses the formula n_samples / (n_classes * np.bincount(y)) to compute weights.
# It adjusts weights inversely proportional to class frequencies in the input data.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',          # Mode for calculating weights
    classes=np.unique(y_train),       # Unique class labels in the training data
    y=y_train                         # Class labels for the entire training dataset
)

# Print the computed class weights.
print(class_weights)


[0.77540595 1.40775091]


## Load Visual Embedding features

In [55]:
# Initialize the tokenizer using the pretrained 'bert-base-uncased' model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Initialize the feature extractor for the Vision Transformer using the pretrained 'google/vit-base-patch16-224-in21k' model
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

# Initialize the Vision Transformer model using the pretrained 'google/vit-base-patch16-224-in21k' and move it to the GPU ('cuda')
feature_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to('cuda')




In [56]:
class HatefulMemesData(Dataset):
    def __init__(self, df, tokenizer, sequence_length, print_text=False):
        """Constructor for the custom dataset.

        Parameters:
        - df (DataFrame): DataFrame containing the dataset information.
        - tokenizer: Tokenizer object for text processing.
        - sequence_length (int): Maximum length of token sequences.
        - print_text (bool): If True, prints text during tokenization.
        """

        self.sequence_length = sequence_length
        self.tokenizer = tokenizer
        self.print_text = print_text

        # Extract columns from the DataFrame
        texts = df["text"].values.tolist()
        labels = df["label"].values.tolist()
        images = df["img"].values.tolist()
        ids = df["idx"].values.tolist()

        # Construct a list of dictionaries from the extracted columns
        self.dataset = []
        for i, inp in enumerate(texts):
            self.dataset.append({"text": inp, "label": labels[i], 'idx': ids[i], 'image': images[i]})

    def __len__(self):
        """Returns the number of items in the dataset."""
        return len(self.dataset)

    def tokenize_data(self, example):
        """Tokenize data for both text and image information.

        Parameters:
        - example (dict): A dictionary containing text, label, idx, and image data.

        Returns:
        - dict: A dictionary containing tokenized information.
        """

        idx = example['idx']
        idx = [idx] if isinstance(idx, str) else idx

        # Tokenize the text data
        encoded_dict = tokenizer(example['text'], padding='max_length', max_length=self.sequence_length, truncation=True, return_tensors='pt')
        tokens = encoded_dict['input_ids']
        token_type_ids = encoded_dict['token_type_ids']
        attn_mask = encoded_dict['attention_mask']
        targets = torch.tensor(example['label']).type(torch.int64)

        # Process the image data
        try:
            img = example['image']
            img = Image.open(os.path.join('hateful_memes', img))
            img = np.array(img)
            img = img[..., :3]
            inputs = feature_extractor(images=img, return_tensors="pt")
            outputs = feature_model(**inputs.to('cuda'))
            visual_embeds = outputs.last_hidden_state
            visual_embeds = visual_embeds.cpu()
        except:
            visual_embeds = np.zeros(shape=(197, 768), dtype=float)

        # Create masks for visual embeddings
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)

        # Construct the inputs dictionary for the model
        inputs = {
            "input_ids": tokens.squeeze(),
            "attention_mask": attn_mask.squeeze(),
            "token_type_ids": token_type_ids.squeeze(),
            "visual_embeds": visual_embeds.squeeze(),
            "visual_token_type_ids": visual_token_type_ids.squeeze(),
            "visual_attention_mask": visual_attention_mask.squeeze(),
            "label": targets.squeeze()
        }

        return inputs

    def __getitem__(self, index):
        """Returns the tokenized data for a given index.

        Parameters:
        - index (int): Index to retrieve data from.

        Returns:
        - dict: Tokenized data for the given index.
        """

        inputs = self.tokenize_data(self.dataset[index])

        # Print the tokenized data shapes and datatypes if print_text is True
        if self.print_text:
            for k in inputs.keys():
                print(k, inputs[k].shape, inputs[k].dtype)

        return inputs

In [57]:
#Create a dataset instance
dataset = HatefulMemesData(df_val, tokenizer, 50, True)

In [58]:

weights = [0.77540595, 1.40775091]
wt_tensor = torch.FloatTensor(weights).cuda()
print(wt_tensor)

tensor([0.7754, 1.4078], device='cuda:0')


In [59]:
class VisualBERTClassifier(torch.nn.Module):
    def __init__(self):
        """
        Initialize the necessary layers and configurations for the VisualBERT model.
        """
        super(VisualBERTClassifier, self).__init__()

        # Load the configuration for the VisualBert model with modified dropout rates
        configuration = VisualBertConfig.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre',
                                                hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)

        # Load the pretrained VisualBert model with the specified configuration
        self.visualbert = VisualBertModel.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', config=configuration)

        # Define a linear layer to transform visual embeddings
        self.embed_cls = nn.Linear(768, 1024)

        # Define the number of output labels
        self.num_labels = 2

        # Dropout layer for regularization
        self.dropout = nn.Dropout(0.3)

        # Final classifier layer to map to the number of labels
        self.cls = nn.Linear(768, self.num_labels)

        # Sample class weights
        self.weight = torch.FloatTensor([class_weights])

        # Calculate normalized weights based on sample counts for classes
        nSamples = [5178, 2849]
        normedWeights = [1 - (x / sum(nSamples)) for x in nSamples]
        # Loss function with weighted classes
        self.loss_fct = CrossEntropyLoss(weight=torch.FloatTensor(normedWeights))

    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask,
                visual_token_type_ids, labels):
        """
        In the forward pass, the model processes input data and returns the loss and logits.
        """
        # Apply a linear transformation to the visual embeddings
        visual_embeds_cls = self.embed_cls(visual_embeds)

        # Pass the input data through the VisualBert model
        outputs = self.visualbert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                visual_embeds=visual_embeds_cls,
                visual_attention_mask=visual_attention_mask,
                visual_token_type_ids=visual_token_type_ids,
            )

        # Extract the pooled output (representative embedding) from the model's output
        pooled_output = outputs[1]

        # Apply dropout for regularization
        pooled_output = self.dropout(pooled_output)

        # Obtain logits by passing the pooled output through the classifier layer
        logits = self.cls(pooled_output)

        # Reshape logits to match the expected shape
        reshaped_logits = logits.view(-1, self.num_labels)

        # Calculate the cross-entropy loss between the predicted logits and true labels
        loss = self.loss_fct(reshaped_logits, labels.view(-1))

        return loss, reshaped_logits

In [60]:
# Initialize the VisualBERTClassifier model, and move it to the GPU (CUDA) for faster computation.
model = VisualBERTClassifier().to('cuda')


## Using HuggingFace Trainer

In [61]:
# Define the metric name for evaluation. In this case, it's the Area Under the Receiver Operating Characteristic curve (AUROC).
metric_name = "auroc"

# Initialize the TrainingArguments for the HuggingFace Trainer. This defines various training hyperparameters and settings.
args = TrainingArguments(
    output_dir="model-checkpoint",  # Directory to save model checkpoints
    seed=42,  # Random seed for reproducibility
    evaluation_strategy="steps",  # Evaluation and logging are done at regular steps instead of at the end of the epoch
    learning_rate=1e-5,  # Initial learning rate
    per_device_train_batch_size=24,  # Batch size for training
    per_device_eval_batch_size=24,  # Batch size for evaluation
    num_train_epochs=30,  # Total number of training epochs to perform
    weight_decay=0.05,  # Weight decay (L2 penalty) to apply
    load_best_model_at_end=True,  # Load the best model found during training (in terms of evaluation metric)
    metric_for_best_model=metric_name,  # Metric to use to identify the best model
    eval_steps=50,  # Evaluate the model every 50 steps
    save_steps=500,  # Save a model checkpoint every 500 steps
    fp16=False,  # Whether to use 16-bit (mixed) precision instead of 32-bit
    gradient_accumulation_steps=2  # Number of updates steps to accumulate gradients before performing an optimization step
)


In [62]:
#Load acc metric
acc_metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """
    Compute accuracy and AUROC metrics based on model's predictions and true labels.

    Parameters:
    - eval_pred: tuple containing the predicted logits and true labels.

    Returns:
    - Dictionary containing the accuracy and AUROC scores.
    """

    # Split the eval_pred tuple into logits and true labels
    logits, labels = eval_pred

    # Get the class with the highest probability as the predicted class
    predictions = np.argmax(logits, axis=-1)

    # Compute accuracy using the provided accuracy metric function
    acc = acc_metric.compute(predictions=predictions, references=labels)

    # Compute the Area Under the Receiver Operating Characteristic curve (AUROC)
    auc_score = roc_auc_score(labels, predictions)

    # Return the computed metrics in a dictionary
    return {"accuracy": acc['accuracy'], "auroc": auc_score}

In [63]:
# Initialize the HuggingFace Trainer. The Trainer is a utility class provided by the Transformers library
# to simplify the training and evaluation of models.

trainer = Trainer(
    model,  # The instantiated model to be trained
    args,  # Training configuration (hyperparameters, evaluation strategy, etc.)

    # Train dataset: 'HatefulMemesData' is a custom Dataset class. We initialize it with training data,
    # a tokenizer, and a specified sequence length for tokenization.
    train_dataset = HatefulMemesData(df_train, tokenizer=tokenizer, sequence_length=50),

    # Evaluation dataset: Similar to the training dataset, but using validation data.
    eval_dataset =  HatefulMemesData(df_val, tokenizer=tokenizer, sequence_length=50),

    tokenizer=tokenizer,  # The tokenizer used for encoding the text data
    compute_metrics=compute_metrics  # Function to compute custom evaluation metrics
)


In [64]:
#Fine-tuning the model
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Auroc
50,No log,0.726029,0.490385,0.5
100,No log,0.699542,0.496154,0.504994
150,No log,0.707018,0.491346,0.499834


TrainOutput(global_step=177, training_loss=0.6976944228350106, metrics={'train_runtime': 788.7444, 'train_samples_per_second': 10.777, 'train_steps_per_second': 0.224, 'total_flos': 0.0, 'train_loss': 0.6976944228350106, 'epoch': 1.0})

In [65]:
#Evaluating the model on test set
trainer.evaluate()

{'eval_loss': 0.7078133821487427,
 'eval_accuracy': 0.49230769230769234,
 'eval_auroc': 0.5007769145394007,
 'eval_runtime': 46.5062,
 'eval_samples_per_second': 22.363,
 'eval_steps_per_second': 0.946,
 'epoch': 1.0}

In [66]:
#Save the model for inference
trainer.save_model('/content/drive/MyDrive/cs688project2/data/VisualBERT_classification_model_without_tags')