In [1]:
# Download Dataset
!git lfs install
!git clone https://huggingface.co/datasets/neuralcatcher/hateful_memes
!cd hateful_memes

Git LFS initialized.
Cloning into 'hateful_memes'...
remote: Enumerating objects: 9700, done.[K
remote: Counting objects: 100% (9700/9700), done.[K
remote: Compressing objects: 100% (9682/9682), done.[K
remote: Total 9700 (delta 17), reused 9700 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (9700/9700), 1.85 MiB | 4.23 MiB/s, done.
Resolving deltas: 100% (17/17), done.
Updating files: 100% (9672/9672), done.
Filtering content: 100% (9664/9664), 3.13 GiB | 35.98 MiB/s, done.


In [2]:
# 3 Part 1: Text-Only Classification
# 3.1 Step 1: Generating Text Embeddings

import torch
from transformers import AutoTokenizer, AutoModel

# Load BERT tokenizer and model from huggingface
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Set the model to evaluation mode since we won ’t train it
model.eval()

# Example text
texts = ["This is the first sentence.", "This is the second sentence."]

# Tokenize and encode the text inputs
inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Pass the inputs through the BERT model
with torch.no_grad():
    outputs = model(**inputs)

# Extract the sentence embeddings
sentence_embeddings = outputs.last_hidden_state[:, 0, :]

print(sentence_embeddings.shape)

# Save embeddings to a file
torch.save(sentence_embeddings, "sentence_embeddings.pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


torch.Size([2, 768])


  loaded_embeddings = torch.load("sentence_embeddings.pt")


In [7]:
# 3.2 Step 2: Building the Classifier

import torch
import torch.nn as nn

# Define a simple fully connected neural network
class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        # Define the layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Forward pass through the network
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Initialize the model
input_dim = 768
hidden_dim = 256
output_dim = 1

# Create the untrained model based on this architecture
model = TextClassifier(input_dim, hidden_dim, output_dim)

In [23]:
# 3.3 Step 3: Training and Evaluation
# Dev set results

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Number of epochs for training
num_epochs = 10

# Example data
sentence_embeddings = torch.randn(100, 768)
labels = torch.randint(0, 2, (100, 1), dtype=torch.float32)

# Set the model to training mode
model.train()

# Training loop
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass
    outputs = model(sentence_embeddings)

    # Compute the loss
    loss = criterion(outputs, labels)

    # Backward pass
    loss.backward()
    optimizer.step()

    # Print the loss after each epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Set the model to evaluation mode for evaluation
model.eval()

# Evaluation on the dev set (use dev set data here)
with torch.no_grad():
    dev_outputs = model(sentence_embeddings)

    # Apply sigmoid to convert logits to probabilities and round to get predictions (0 or 1)
    dev_preds = torch.sigmoid(dev_outputs).round()

# Compute evaluation metrics
accuracy = accuracy_score(labels, dev_preds)
precision = precision_score(labels, dev_preds)
recall = recall_score(labels, dev_preds)
f1 = f1_score(labels, dev_preds)
auc_roc = roc_auc_score(labels, torch.sigmoid(dev_outputs))

# Print evaluation results
print(f"Text Model - Dev Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")

Epoch 1/10, Loss: 0.7102136015892029
Epoch 2/10, Loss: 0.5504420399665833
Epoch 3/10, Loss: 0.42620375752449036
Epoch 4/10, Loss: 0.328424870967865
Epoch 5/10, Loss: 0.2517696022987366
Epoch 6/10, Loss: 0.1923561990261078
Epoch 7/10, Loss: 0.1465851217508316
Epoch 8/10, Loss: 0.11161170899868011
Epoch 9/10, Loss: 0.08500272780656815
Epoch 10/10, Loss: 0.06489279121160507
Text Model - Dev Set Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0
AUC-ROC: 1.0


In [5]:
# Save the model after training
torch.save(model.state_dict(), 'text_classifier.pth')

In [3]:
import torch

# Load text embeddings
text_embeddings_path = '/content/sentence_embeddings.pt'
test_text_embeddings = torch.load(text_embeddings_path)

  test_text_embeddings = torch.load(text_embeddings_path) # Load with torch.load instead of np.load


In [35]:
# Text-only / Test set results

import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


test_text_embeddings = torch.randn(100, 768)
test_labels = torch.randint(0, 2, (100, 1), dtype=torch.float32)

# Load the trained model
text_model = TextClassifier(input_dim=768, hidden_dim=256, output_dim=1)
text_model.load_state_dict(torch.load('/content/text_classifier.pth'))


# Set model to evaluation mode
text_model.eval()

# Get predictions
with torch.no_grad():
    test_text_outputs = text_model(test_text_embeddings)
    test_text_preds = torch.sigmoid(test_text_outputs).round().squeeze()

# Convert to numpy for sklearn metrics
test_text_preds = test_text_preds.cpu().numpy()
test_labels_np = test_labels.squeeze().cpu().numpy()

# Calculate and print evaluation metrics
accuracy = accuracy_score(test_labels_np, test_text_preds)
precision = precision_score(test_labels_np, test_text_preds)
recall = recall_score(test_labels_np, test_text_preds)
f1 = f1_score(test_labels_np, test_text_preds)
auc_roc = roc_auc_score(test_labels_np, torch.sigmoid(test_text_outputs).cpu().numpy())

print(f"Text Model - Test Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")

Text Model - Test Set Results:
Accuracy: 0.57
Precision: 0.525
Recall: 0.4666666666666667
F1-Score: 0.49411764705882355
AUC-ROC: 0.5595959595959596


  text_model.load_state_dict(torch.load('/content/text_classifier.pth'))


In [11]:
# 4 Part 2: Image-Only Classification
# 4.1 Step 1: Generating image embeddings

from torchvision import transforms
from PIL import Image
import os

# Define the preprocessing pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Normalize with the mean and std values used for ResNet training
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Path to the folder containing images
image_folder = '/content/hateful_memes/img'

# List to store the processed images
processed_images = []

# Iterate through all files in the folder
for filename in os.listdir(image_folder):
    if filename.endswith(('.jpg', '.png', '.jpeg')):
        # Construct the full path to the image
        image_path = os.path.join(image_folder, filename)

        # Open the image using PIL
        image = Image.open(image_path).convert('RGB')

        # Apply the transformations
        image = transform(image)

        # Add the processed image to the list
        processed_images.append(image)

In [12]:
import torch
import torchvision.models as models
import torch.nn as nn

# Add a batch dimension
image = image.unsqueeze(0)

# Load the pretrained ResNet18 model
model = models.resnet18(pretrained=True)

# Remove the final classification layer to get the raw feature vector (embedding)
resnet_feature_extractor = nn.Sequential(*list(model.children())[:-1])

# Pass the image through ResNet to get the feature representation
with torch.no_grad():
    image_representation = resnet_feature_extractor(image)

# Reshape the output from [1, 512, 1, 1] to [1, 512] to get a 512-dimensional feature vector
image_representation = image_representation.view(image_representation.size(0), -1)  # Shape: [1, 512]

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 53.0MB/s]


In [13]:
# Save the embeddings to a file
torch.save(image_representation, 'image_representation.pth')

In [10]:
# 4.2 Step 2: Building the Classifier

import torch
import torch.nn as nn

# Define a simple fully connected neural network
class ImageClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ImageClassifier, self).__init__()

        # First fully connected layer
        self.fc1 = nn.Linear(input_dim, hidden_dim)

        # Activation function
        self.relu = nn.ReLU()

        # Second fully connected layer
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Forward pass
    def forward(self, x):
        x = self.fc1(x)  # First layer
        x = self.relu(x)  # ReLU activation
        x = self.fc2(x)  # Second layer
        return x

# Initialize the model
input_dim = 512
hidden_dim = 256
output_dim = 1

# Create the untrained model
model = ImageClassifier(input_dim, hidden_dim, output_dim)

In [49]:
# 4.3 Step 3: Training and Evaluation
# Dev set results

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Number of epochs for training
num_epochs = 10

# Example data
image_embeddings = torch.randn(100, 768)
labels = torch.randint(0, 2, (100, 1), dtype=torch.float32)

# Adjust the input dimension of the model to match the image_embeddings shape
# Create a new model with the correct input dimension
input_dim = 768
hidden_dim = 256
output_dim = 1
model = ImageClassifier(input_dim, hidden_dim, output_dim)

# Set the model to training mode
model.train()

# Training loop
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass
    outputs = model(image_embeddings)

    # Compute the loss
    loss = criterion(outputs, labels)

    # Backward pass
    loss.backward()
    optimizer.step()

    # Print the loss after each epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Set the model to evaluation mode for evaluation
model.eval()

# Evaluation on the dev set
with torch.no_grad():
    dev_outputs = model(image_embeddings)

    # Apply sigmoid to convert logits to probabilities and round to get predictions (0 or 1)
    dev_preds = torch.sigmoid(dev_outputs).round()

# Compute evaluation metrics
accuracy = accuracy_score(labels, dev_preds)
precision = precision_score(labels, dev_preds)
recall = recall_score(labels, dev_preds)
f1 = f1_score(labels, dev_preds)
auc_roc = roc_auc_score(labels, torch.sigmoid(dev_outputs))

# Print evaluation results
print(f"Image Model - Dev Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")

Epoch 1/10, Loss: 0.7023346424102783
Epoch 2/10, Loss: 0.7023346424102783
Epoch 3/10, Loss: 0.7023346424102783
Epoch 4/10, Loss: 0.7023346424102783
Epoch 5/10, Loss: 0.7023346424102783
Epoch 6/10, Loss: 0.7023346424102783
Epoch 7/10, Loss: 0.7023346424102783
Epoch 8/10, Loss: 0.7023346424102783
Epoch 9/10, Loss: 0.7023346424102783
Epoch 10/10, Loss: 0.7023346424102783
Image Model - Dev Set Results:
Accuracy: 0.5
Precision: 0.4942528735632184
Recall: 0.8775510204081632
F1-Score: 0.6323529411764706
AUC-ROC: 0.5214085634253701


In [83]:
# Save the embeddings to a file
torch.save(model.state_dict(), 'image_classifier.pth')

In [51]:
# Load image embeddings
image_embeddings_path = '/content/image_representation.pth'
test_image_embeddings = torch.load(image_embeddings_path)

  test_image_embeddings = torch.load(image_embeddings_path)


In [55]:
# Image-only / Test set results

test_image_embeddings = torch.randn(100, 512)  # Replace with actual test image embeddings

# Set model to evaluation mode
image_model.eval()

# Get predictions
with torch.no_grad():
    test_image_outputs = image_model(test_image_embeddings)
    test_image_preds = torch.sigmoid(test_image_outputs).round().squeeze()

# Convert to numpy for sklearn metrics
test_image_preds = test_image_preds.cpu().numpy()

# Calculate and print evaluation metrics
accuracy = accuracy_score(test_labels_np, test_image_preds)
precision = precision_score(test_labels_np, test_image_preds)
recall = recall_score(test_labels_np, test_image_preds)
f1 = f1_score(test_labels_np, test_image_preds)
auc_roc = roc_auc_score(test_labels_np, torch.sigmoid(test_image_outputs).cpu().numpy())

print(f"Image Model - Test Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")


Image Model - Test Set Results:
Accuracy: 0.48
Precision: 0.4507042253521127
Recall: 0.7111111111111111
F1-Score: 0.5517241379310345
AUC-ROC: 0.534949494949495


In [84]:
# 5 Part 3: Multimodal Classification
# 5.1 Early Fusion (Feature-Level Fusion)
# 5.1.1 Step 1: Extract Text and Image Features

# Loading the saved text and image features
text_features = torch.load('/content/sentence_embeddings.pt')
image_features = torch.load('/content/image_representation.pth')


  text_features = torch.load('/content/sentence_embeddings.pt')  # Shape: [num_samples, text_feature_size]
  image_features = torch.load('/content/image_representation.pth')  # Shape: [num_samples, image_feature_size]


In [93]:
# 5.1.2 Step 2: Concatenate Features and Train

import torch
from torch.utils.data import DataLoader, TensorDataset

# Check if the number of samples is the same for all tensors
num_samples = min(text_features.shape[0], image_features.shape[0], labels.shape[0])

# Slice the tensors to ensure they have the same number of samples
text_features = text_features[:num_samples]
image_features = image_features[:num_samples]
labels = labels[:num_samples]

dataset = TensorDataset(text_features, image_features, labels)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Example training loop for the multimodal classifier
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    for text_batch, image_batch, labels_batch in data_loader:
        optimizer.zero_grad()

        # Get features from pre-extracted text and image features
        labels = labels_batch.long()

        # Forward pass
        logits = model(text_batch, image_batch)

        # Compute loss and backpropagate
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.47857218980789185
Epoch 2, Loss: 0.006389427464455366
Epoch 3, Loss: 0.00044764988706447184
Epoch 4, Loss: 5.054346183896996e-05
Epoch 5, Loss: 7.748573807475623e-06
Epoch 6, Loss: 1.4305104514278355e-06
Epoch 7, Loss: 3.576278118089249e-07
Epoch 8, Loss: 1.1920928244535389e-07
Epoch 9, Loss: 0.0
Epoch 10, Loss: 0.0


In [21]:
# Early fusion training and evaluation
# Dev set results

import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the multimodal model for early fusion (concatenation of text and image features)
class MultimodalClassifier(nn.Module):
    def __init__(self, text_input_size, image_input_size, hidden_size=256):
        super(MultimodalClassifier, self).__init__()
        self.fc1 = nn.Linear(text_input_size + image_input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, text_features, image_features):
        combined_features = torch.cat((text_features, image_features), dim=1)
        x = self.fc1(combined_features)
        x = self.relu(x)
        output = self.fc2(x)
        return output

# Define the model
text_input_size = 768
image_input_size = 512
model = MultimodalClassifier(text_input_size, image_input_size)

# Example batch of features
text_features = torch.randn(32, 768)
image_features = torch.randn(32, 512)
dev_labels = torch.randint(0, 2, (32,))

# Forward pass through the multimodal model
combined_output = model(text_features, image_features)

# Convert logits to binary predictions using sigmoid and rounding
dev_preds = torch.sigmoid(combined_output).round().squeeze().detach().cpu().numpy()

# Convert dev_labels to numpy array and flatten
dev_labels_np = dev_labels.squeeze().cpu().numpy()

# Check for shape mismatch and print the shapes
if dev_labels_np.shape != dev_preds.shape:
    print(f"Shape mismatch: dev_labels shape {dev_labels_np.shape}, dev_preds shape {dev_preds.shape}")
else:
    # Calculate evaluation metrics
    accuracy = accuracy_score(dev_labels_np, dev_preds)
    precision = precision_score(dev_labels_np, dev_preds)
    recall = recall_score(dev_labels_np, dev_preds)
    f1 = f1_score(dev_labels_np, dev_preds)
    auc_roc = roc_auc_score(dev_labels_np, torch.sigmoid(combined_output).detach().cpu().numpy().flatten())

print(f"Early Fusion Model - Dev Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")

Early Fusion Model - Dev Set Results:
Accuracy: 0.46875
Precision: 0.5
Recall: 0.8823529411764706
F1-Score: 0.6382978723404256
AUC-ROC: 0.5058823529411764


In [19]:
# Early fusion / Test set results

# Set model to evaluation mode
model.eval()

# Get predictions
with torch.no_grad():
    # Pass text and image features separately to the model
    test_combined_output = model(test_text_embeddings, test_image_embeddings)
    test_combined_preds = torch.sigmoid(test_combined_output).round().squeeze()

# Convert to numpy for sklearn metrics
test_combined_preds = test_combined_preds.cpu().numpy()

# Calculate and print evaluation metrics
accuracy = accuracy_score(test_labels_np, test_combined_preds)
precision = precision_score(test_labels_np, test_combined_preds)
recall = recall_score(test_labels_np, test_combined_preds)
f1 = f1_score(test_labels_np, test_combined_preds)
auc_roc = roc_auc_score(test_labels_np, torch.sigmoid(test_combined_output).cpu().numpy())

print(f"Early Fusion Model - Test Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")


Early Fusion Model - Test Set Results:
Accuracy: 0.56
Precision: 0.6078431372549019
Recall: 0.5636363636363636
F1-Score: 0.5849056603773585
AUC-ROC: 0.5931313131313132


In [26]:
# 5.2 Late Fusion (Decision-Level Fusion)
# Late fusion training and evaluation
# Dev set results

import torch

# Example outputs for batch size 32
batch_size = 32
text_logits = torch.randn(batch_size, 1)
image_logits = torch.randn(batch_size, 1)

# Combine the logits (late fusion)
combined_logits = (text_logits + image_logits) / 2

# Convert combined logits to predicted classes (for binary classification)
predictions = torch.sigmoid(combined_logits).round()

# Example ground truth labels (binary classification: 0 or 1)
dev_labels = torch.randint(0, 2, (batch_size, 1))

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert tensors to numpy arrays for evaluation
dev_labels_np = dev_labels.numpy()
predictions_np = predictions.numpy()

accuracy = accuracy_score(dev_labels_np, predictions_np)
precision = precision_score(dev_labels_np, predictions_np)
recall = recall_score(dev_labels_np, predictions_np)
f1 = f1_score(dev_labels_np, predictions_np)

# Print the results
print(f"Late Fusion Model - Dev Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")

Late Fusion Model - Dev Set Results:
Accuracy: 0.5625
Precision: 0.5294117647058824
Recall: 0.6
F1-Score: 0.5625
AUC-ROC: 1.0


In [33]:
# Late fusion / Test set results

# Assuming your ImageClassifier class definition is as follows:
class ImageClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ImageClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Instantiate the ImageClassifier with appropriate dimensions
image_model = ImageClassifier(input_dim=512, hidden_dim=256, output_dim=1)
image_representations = torch.load('/content/image_representation.pth')

# Set both models to evaluation mode
text_model.eval()
image_model.eval()

# Get predictions from both models
with torch.no_grad():
    test_text_logits = text_model(test_text_embeddings)
    test_image_logits = image_model(test_image_embeddings)

    # Average the logits for late fusion
    test_combined_logits = (test_text_logits + test_image_logits) / 2
    test_combined_preds = torch.sigmoid(test_combined_logits).round().squeeze()

# Convert to numpy for sklearn metrics
test_combined_preds = test_combined_preds.cpu().numpy()

# Calculate and print evaluation metrics
accuracy = accuracy_score(test_labels_np, test_combined_preds)
precision = precision_score(test_labels_np, test_combined_preds)
recall = recall_score(test_labels_np, test_combined_preds)
f1 = f1_score(test_labels_np, test_combined_preds)
auc_roc = roc_auc_score(test_labels_np, torch.sigmoid(test_combined_logits).cpu().numpy())

print(f"Late Fusion Model - Test Set Results:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}\nAUC-ROC: {auc_roc}")


Late Fusion Model - Test Set Results:
Accuracy: 0.54
Precision: 0.5849056603773585
Recall: 0.5636363636363636
F1-Score: 0.5740740740740741
AUC-ROC: 0.5208080808080807


  image_representations = torch.load('/content/image_representation.pth')
