In [2]:
# Mount Google Drive into the Colab filesystem; the dataset CSV and the model
# checkpoints used further down are all read from / written to paths under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the package that the import cell below pulls BertTokenizer and
# VisualBertForPreTraining from.
!pip install transformers

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import models, transforms
from transformers import BertTokenizer, VisualBertForPreTraining
from PIL import Image
from tqdm import tqdm
import requests
from io import BytesIO
import json
import os

# Seed handed to random.seed() before the dataset is shuffled and split.
seed = 42

def request_image(url, transform, timeout=10):
    """Download one image and return it after applying ``transform``.

    Args:
        url: Address of the image to fetch.
        transform: Callable applied to the decoded RGB ``PIL.Image``; whatever
            it returns is returned to the caller unchanged.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, a single unresponsive host would block the caller forever.

    Returns:
        The result of ``transform(img)``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (via ``raise_for_status``).
        OSError: If PIL cannot decode the downloaded bytes as an image.
    """
    # The context manager releases the connection even if the status check fails.
    with requests.get(url, timeout=timeout) as rsp:
        # Fail on 4xx/5xx here instead of handing an HTML error page to PIL.
        rsp.raise_for_status()
        payload = rsp.content

    img = Image.open(BytesIO(payload)).convert('RGB')
    return transform(img)

def _parse_url_list(raw):
    """Turn the CSV's stringified list of image URLs back into a Python list."""
    import ast  # local import so this cell does not depend on an edit to the import cell

    try:
        # Evaluate the cell as a Python list literal. Rewriting quotes and
        # parsing as JSON corrupts any URL that itself contains an apostrophe
        # or a double quote, which used to abort the whole load.
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Fall back to the previous quote-swapping JSON parse so every value
        # that used to load (e.g. JSON-only literals) still does.
        return json.loads(raw.replace("'", '"'))


def load_csv_dataset(file_path, image_size=(224, 224), nrows=1400):
    """Read story-pair rows from a CSV and download the images they reference.

    The CSV must provide the columns ``sent1``, ``sent2``, ``label`` and
    ``url``; ``url`` holds a string representation of a list of image URLs.

    Args:
        file_path: Path of the CSV file.
        image_size: ``(height, width)`` every image is resized to.
        nrows: Maximum number of CSV rows to read (``None`` reads them all).

    Returns:
        A list of dicts with the keys ``'image'`` (the row's successfully
        downloaded images stacked into one tensor), ``'story1'``, ``'story2'``
        and ``'label'``. Rows for which no image could be downloaded are
        dropped, so the list may be shorter than the CSV.
    """
    df = pd.read_csv(file_path, nrows=nrows)
    transform = transforms.Compose([transforms.Resize(image_size), transforms.ToTensor()])
    processed_data = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        story1 = row['sent1']
        story2 = row['sent2']
        label = row['label']

        urls = _parse_url_list(row['url'])

        img_tensors = []
        for url in urls:
            try:
                img_tensors.append(request_image(url, transform))
            except Exception:
                # Deliberately best-effort: dead links and undecodable files
                # are skipped so one bad URL does not abort the whole load.
                continue

        # A row without any usable image cannot be fed to the visual model.
        if len(img_tensors) == 0:
            continue

        img_tensor = torch.stack(img_tensors)

        processed_data.append({'image': img_tensor, 'story1': story1, 'story2': story2, 'label': label})

    return processed_data


# Custom Dataset class
class RankingDataset(Dataset):
    """Items for the pairwise story ranker: image features plus two tokenized stories.

    Every element of ``data`` is a dict with the keys ``'image'`` (an iterable
    of image tensors), ``'story1'``, ``'story2'`` and ``'label'``.
    """

    def __init__(self, data, tokenizer, resnet_model, max_length=512, mask_probability=0.15):
        """Store the raw items and the components used to encode them.

        Args:
            data: Sequence of item dicts as described on the class.
            tokenizer: Object exposing ``encode_plus``.
            resnet_model: Callable mapping a one-image batch to a feature tensor.
            max_length: Length every story is padded / truncated to.
            mask_probability: Stored on the instance; not read anywhere in this class.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.resnet_model = resnet_model
        self.max_length = max_length
        self.mask_probability = mask_probability

    def __len__(self):
        """Number of items in the dataset."""
        return len(self.data)

    def _encode_images(self, images):
        """Run each image through the encoder and return one flattened feature row per image."""
        rows = []
        for img in images:
            # Feature extraction only: no gradients flow into the encoder.
            with torch.no_grad():
                feats = self.resnet_model(img.unsqueeze(0))
                rows.append(feats.view(feats.size(0), -1))
        return torch.cat(rows, dim=0)

    def _tokenize(self, story):
        """Tokenize one story, padded / truncated to ``self.max_length``."""
        return self.tokenizer.encode_plus(
            story,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Encode item ``idx``.

        Returns:
            A dict with ``input_ids_1`` / ``input_ids_2`` (the tokenizer's ids
            with size-1 dimensions squeezed away), ``attention_mask_1`` /
            ``attention_mask_2`` (exactly as returned by the tokenizer, i.e.
            NOT squeezed), ``visual_embeds`` (one row per image),
            ``visual_attention_mask`` (ones, shape ``(1, num_images)``) and
            ``ranker_gap`` (the item's label).
        """
        record = self.data[idx]

        visual_embeds = self._encode_images(record['image'])

        encoded_1 = self._tokenize(record['story1'])
        encoded_2 = self._tokenize(record['story2'])

        # Every extracted image feature is a real (non-padding) visual token.
        visual_attention_mask = torch.ones((1, visual_embeds.size(0)), dtype=torch.long)

        return {
            'input_ids_1': encoded_1['input_ids'].squeeze(),
            'input_ids_2': encoded_2['input_ids'].squeeze(),
            'attention_mask_1': encoded_1['attention_mask'],
            'attention_mask_2': encoded_2['attention_mask'],
            'visual_embeds': visual_embeds,
            'visual_attention_mask': visual_attention_mask,
            'ranker_gap': record['label']
        }

# Shared components: image preprocessing, text tokenizer, frozen image encoder.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Keep every child module of the pretrained ResNet-50 except the last one, so
# the network emits features rather than class scores.
resnet_model = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1])
resnet_model.eval()

In [None]:
# Slow cell: load_csv_dataset downloads every image referenced by the CSV.
ranking_data = load_csv_dataset('/content/drive/MyDrive/Colab Notebooks/project/VHED_url.csv')

In [10]:
import random

# Seed both RNGs in play: `random` drives the shuffle below, torch drives the
# DataLoader shuffling and the initialisation of layers created later.
random.seed(seed)
torch.manual_seed(seed)

# Shuffle a copy rather than `ranking_data` itself, so that re-running this
# cell always reproduces the same split instead of reshuffling an already
# shuffled list.
shuffled_data = list(ranking_data)
random.shuffle(shuffled_data)

# 60% train / 20% validation / 20% test.
train_data_ind = int(0.6 * len(shuffled_data))
val_data_ind = int(0.8 * len(shuffled_data))

train_data = shuffled_data[:train_data_ind]
val_data = shuffled_data[train_data_ind:val_data_ind]
test_data = shuffled_data[val_data_ind:]

train_dataset = RankingDataset(train_data, tokenizer, resnet_model)
val_dataset = RankingDataset(val_data, tokenizer, resnet_model)
test_dataset = RankingDataset(test_data, tokenizer, resnet_model)

In [14]:
def collate_fn(batch):
    """Merge dataset items into one batch, padding the per-item image axis.

    Items may carry different numbers of images, so ``visual_embeds`` and
    ``visual_attention_mask`` are zero-padded up to the largest count in the
    batch. All other fields already share a shape and are simply stacked.

    Args:
        batch: List of dicts as produced by ``RankingDataset.__getitem__``.

    Returns:
        A dict with the same keys as the items, each holding the batched
        tensor. ``visual_attention_mask`` keeps the dtype of the per-item
        masks.
    """
    # Pad the visual embeddings to have the same sequence length
    image_features = pad_sequence([item['visual_embeds'] for item in batch], batch_first=True)

    masks = [item['visual_attention_mask'] for item in batch]
    max_seq_length = max(mask.shape[1] for mask in masks)

    # new_zeros() inherits dtype and device from the mask being padded. A bare
    # torch.zeros() is float32, which turned the integer masks into float ones
    # (and makes torch.cat fail outright on versions without dtype promotion).
    visual_attention_mask = torch.stack([
        torch.cat([mask, mask.new_zeros((mask.shape[0], max_seq_length - mask.shape[1]))], dim=1)
        for mask in masks
    ])

    # The text fields were padded to a fixed length by the tokenizer already.
    input_ids_1 = torch.stack([item['input_ids_1'] for item in batch])
    input_ids_2 = torch.stack([item['input_ids_2'] for item in batch])
    attention_mask_1 = torch.stack([item['attention_mask_1'] for item in batch])
    attention_mask_2 = torch.stack([item['attention_mask_2'] for item in batch])
    ranker_gap = torch.tensor([item['ranker_gap'] for item in batch])

    return {
        'input_ids_1': input_ids_1,
        'input_ids_2': input_ids_2,
        'attention_mask_1': attention_mask_1,
        'attention_mask_2': attention_mask_2,
        'visual_embeds': image_features,
        'visual_attention_mask': visual_attention_mask,
        'ranker_gap': ranker_gap
    }

In [20]:
# One loader per split, four items per batch; only the training loader
# reshuffles its data.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=8,
    collate_fn=collate_fn,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=8,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=8,
    collate_fn=collate_fn,
)

In [21]:
# Load your pre-trained VisualBert model
model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
# map_location='cpu' lets a checkpoint that was saved from GPU tensors load on
# a CPU-only runtime as well; load_state_dict copies the values into the
# model's own parameters either way.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/Colab Notebooks/project/epoch_8.pt', map_location='cpu')
)
model.eval()  # train_ranker() switches the model to train mode itself

# Modify model for ranking task
# Add a linear layer to model for regression task. It is attached AFTER the
# checkpoint is loaded because that state dict was loaded into the model
# before this layer existed.
model.regression_layer = nn.Linear(model.config.hidden_size, 1)

# Define optimizer and loss function for ranking. The optimizer is created
# after the new layer is attached so that model.parameters() includes it.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.MSELoss()  # Mean Squared Error Loss

# Training loop
def train_ranker(model, dataloader, optimizer, criterion):
    """Fine-tune ``model`` for one pass over ``dataloader`` on the ranking task.

    Each batch carries two candidate stories for the same images. Both are run
    through ``model``; ``model.regression_layer`` scores position 0 of each
    final hidden state, and the difference of the two scores is regressed onto
    ``batch['ranker_gap']`` with ``criterion``.

    Args:
        model: Module that exposes ``regression_layer`` and whose output has a
            ``hidden_states`` sequence when called with
            ``output_hidden_states=True``.
        dataloader: Iterable of batches shaped like the output of ``collate_fn``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(prediction, target)``.

    Returns:
        ``{'accuracy': ..., 'loss': ...}`` for the epoch (both ``nan`` when the
        dataloader yields nothing). A sample counts as correct when prediction
        and target are both positive or both negative. The same two numbers
        are printed.
    """
    model.train()

    # Follow the model's device so the loop also works after model.to('cuda').
    device = next(model.parameters()).device

    total_correct = 0
    total_sample = 0
    total_loss = 0.0

    for batch in tqdm(dataloader):
        input_ids_1 = batch['input_ids_1'].to(device)
        input_ids_2 = batch['input_ids_2'].to(device)
        attention_mask_1 = batch['attention_mask_1'].to(device)
        attention_mask_2 = batch['attention_mask_2'].to(device)
        visual_embeds = batch['visual_embeds'].to(device)
        visual_attention_mask = batch['visual_attention_mask'].to(device)
        ranking_gap = batch['ranker_gap'].float().to(device)

        # Forward pass for both story inputs (same images, different text)
        outputs_1 = model(
            input_ids=input_ids_1,
            visual_embeds=visual_embeds,
            attention_mask=attention_mask_1,
            visual_attention_mask=visual_attention_mask,
            output_hidden_states=True
        )
        outputs_2 = model(
            input_ids=input_ids_2,
            visual_embeds=visual_embeds,
            attention_mask=attention_mask_2,
            visual_attention_mask=visual_attention_mask,
            output_hidden_states=True
        )

        # Use the ranking layer to score each story; the predicted gap is the
        # difference of the two scores.
        ranking_prediction_1 = model.regression_layer(outputs_1.hidden_states[-1][:, 0])
        ranking_prediction_2 = model.regression_layer(outputs_2.hidden_states[-1][:, 0])
        ranking_prediction = (ranking_prediction_1 - ranking_prediction_2).squeeze(-1)

        loss = criterion(ranking_prediction, ranking_gap)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Sign agreement, vectorised. A zero prediction or a zero target never
        # counts as correct.
        same_sign = ((ranking_prediction > 0) & (ranking_gap > 0)) | \
                    ((ranking_prediction < 0) & (ranking_gap < 0))
        batch_size = ranking_prediction.shape[0]
        total_correct += int(same_sign.sum().item())
        total_sample += batch_size
        # Weight each batch by its size so the epoch figure is a per-sample
        # mean (assumes `criterion` averages over the batch, as nn.MSELoss
        # does by default) rather than whatever the last batch happened to be.
        total_loss += loss.item() * batch_size

    if total_sample == 0:
        # Empty dataloader: report nan instead of dividing by zero.
        accuracy = avg_loss = float('nan')
    else:
        accuracy = total_correct / total_sample
        avg_loss = total_loss / total_sample

    print(f"Train Accuracy: {accuracy}")
    print(f"Train Loss: {avg_loss}")
    return {'accuracy': accuracy, 'loss': avg_loss}

def evaluate_ranker(model, dataloader, criterion, split_name="Val"):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Both stories of every pair are run through ``model``;
    ``model.regression_layer`` scores position 0 of each final hidden state
    and the difference of the two scores is compared with
    ``batch['ranker_gap']``.

    Args:
        model: Module that exposes ``regression_layer`` and whose output has a
            ``hidden_states`` sequence when called with
            ``output_hidden_states=True``.
        dataloader: Iterable of batches shaped like the output of ``collate_fn``.
        criterion: Loss taking ``(prediction, target)``.
        split_name: Label used in the printed report; the default keeps the
            original "Val ..." wording.

    Returns:
        ``{'accuracy': ..., 'loss': ...}`` (both ``nan`` when the dataloader
        yields nothing). A sample counts as correct when prediction and target
        are both positive or both negative. The same two numbers are printed.
    """
    model.eval()

    # Follow the model's device so evaluation also works after model.to('cuda').
    device = next(model.parameters()).device

    total_correct = 0
    total_sample = 0
    total_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids_1 = batch['input_ids_1'].to(device)
            input_ids_2 = batch['input_ids_2'].to(device)
            attention_mask_1 = batch['attention_mask_1'].to(device)
            attention_mask_2 = batch['attention_mask_2'].to(device)
            visual_embeds = batch['visual_embeds'].to(device)
            visual_attention_mask = batch['visual_attention_mask'].to(device)
            ranking_gap = batch['ranker_gap'].float().to(device)

            # Forward pass for both story inputs (same images, different text)
            outputs1 = model(
                input_ids=input_ids_1,
                visual_embeds=visual_embeds,
                attention_mask=attention_mask_1,
                visual_attention_mask=visual_attention_mask,
                output_hidden_states=True
            )
            outputs2 = model(
                input_ids=input_ids_2,
                visual_embeds=visual_embeds,
                attention_mask=attention_mask_2,
                visual_attention_mask=visual_attention_mask,
                output_hidden_states=True
            )

            ranking_prediction_1 = model.regression_layer(outputs1.hidden_states[-1][:, 0])
            ranking_prediction_2 = model.regression_layer(outputs2.hidden_states[-1][:, 0])
            ranking_prediction = (ranking_prediction_1 - ranking_prediction_2).squeeze(-1)

            batch_size = ranking_prediction.shape[0]
            loss = criterion(ranking_prediction, ranking_gap)
            # `loss` is assumed to be the batch MEAN (nn.MSELoss default).
            # Weight it by the batch size so the final division by the sample
            # count gives a per-sample mean; summing raw batch means and
            # dividing by the sample count under-reported the loss by roughly
            # the batch size.
            total_loss += loss.item() * batch_size

            # Sign agreement, vectorised. A zero prediction or a zero target
            # never counts as correct.
            same_sign = ((ranking_prediction > 0) & (ranking_gap > 0)) | \
                        ((ranking_prediction < 0) & (ranking_gap < 0))
            total_correct += int(same_sign.sum().item())
            total_sample += batch_size

    if total_sample == 0:
        # Empty dataloader: report nan instead of dividing by zero.
        accuracy = avg_loss = float('nan')
    else:
        accuracy = total_correct / total_sample
        avg_loss = total_loss / total_sample

    print(f"{split_name} Accuracy: {accuracy}")
    print(f"{split_name} Loss: {avg_loss}")
    return {'accuracy': accuracy, 'loss': avg_loss}

In [None]:
# Number of full passes over the training split.
num_epochs = 8

# Directory (on the mounted Drive) that receives one checkpoint per epoch.
save_path = '/content/drive/MyDrive/Colab Notebooks/project'

for epoch in range(num_epochs):
    # One pass of training, then a validation pass; both print their metrics.
    train_ranker(model, train_dataloader, optimizer, criterion)
    evaluate_ranker(model, val_dataloader, criterion)
    print(f"Epoch {epoch} complete.")
    # Save the fine-tuned model. Every epoch writes its own file
    # (ranker_epoch_<epoch>.pt), so earlier checkpoints are not overwritten.
    torch.save(model.state_dict(), os.path.join(save_path, f'ranker_epoch_{epoch}.pt'))

In [None]:
# Rebuild the same architecture that was fine-tuned: the pretrained VisualBERT
# plus the extra regression head.
finetuned_model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-vqa-coco-pre')

# The head must exist BEFORE load_state_dict, because the ranker checkpoints
# were saved from a model that already had `regression_layer` attached.
finetuned_model.regression_layer = nn.Linear(finetuned_model.config.hidden_size, 1)

# map_location='cpu' lets a checkpoint that was saved from GPU tensors load on
# a CPU-only runtime as well.
finetuned_model.load_state_dict(
    torch.load('/content/drive/MyDrive/Colab Notebooks/project/ranker_epoch_6.pt', map_location='cpu')
)

finetuned_model.eval()

In [None]:
# Final evaluation on the held-out test split. Note that evaluate_ranker
# labels its printed metrics "Val ..." by default, even though these numbers
# come from the test data.
evaluate_ranker(finetuned_model, test_dataloader, criterion)