In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torchvision


from transformers import BertTokenizer, DistilBertModel, VisualBertModel

import pandas as pd
import numpy as np
import ast, os, cv2

from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

detector = torchvision.models.resnet50(pretrained=True)
detector = torch.nn.Sequential(*list(detector.children())[:-1])
detector.eval()

batch_size = 4
device = torch.device('cuda:1')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def get_features(image_list):
    image_list = torch.stack(image_list)
    vis_embeddings = detector(image_list)
    return vis_embeddings

def map_values(ratings, tags):
    for i, tag in enumerate(tags):
        if tag == 'concrete':
            ratings[i] = 0
        elif tag == 'middle':
            ratings[i] = 1
        else:
            ratings[i] = 2
    return ratings

class Multimodal_Dataset(Dataset):
    def __init__(self, words_file, image_file, tokenizer, regression=False):
        self.words_file = words_file
        self.images = image_file
        self.tokenizer = tokenizer

        self.data = pd.read_csv(words_file)
        self.words = self.data['word'].to_list()
        self.encodings = self.tokenizer([word for word in self.words], add_special_tokens=True, padding='max_length', max_length=12,return_tensors='pt')

        self.photos = self.data['photos'].apply(ast.literal_eval)
        self.labels = self.data['tag'].to_list()
        if regression == True:
            self.ratings = self.data['rating'].to_list()
        else:
            ratings = self.data['rating'].to_list()
            self.ratings = map_values(ratings, self.labels)

        transform_list = [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ]
        self.transform = transforms.Compose(transform_list)

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        images = self.photos[idx]
        imgs = []
        for image in images:
          img_path = os.path.join('images/', image)
          img = Image.open(img_path)
          img = img.convert('RGB')
          img = self.transform(img)
          
          imgs.append(img)

        #padding list of images
        while len(imgs) < 12:
            imgs.append(torch.zeros_like(imgs[0]))

        embeddings = get_features(imgs)

        item = {'word': self.words[idx], 'input_ids': self.encodings['input_ids'][idx], 'attn_mask': self.encodings['attention_mask'][idx], 'token_type_ids': self.encodings['token_type_ids'], 'visual_embeddings': embeddings, 'rating': self.ratings[idx],'label': self.labels[idx]}
        return item

In [7]:
data = Multimodal_Dataset('merged_data.csv', 'images', tokenizer=tokenizer)
data[5]['visual_embeddings'].size()

torch.Size([12, 2048, 1, 1])

In [8]:
len(data)

300

In [9]:
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)

**Textual BERT**

In [5]:
class TEXTUAL_BERT(nn.Module):
  def __init__(self, num_of_labels):
      super(TEXTUAL_BERT, self).__init__()
      self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
      self.classifier = nn.Linear(self.bert.config.hidden_size, num_of_labels)

  def forward(self, input_ids, attention_mask):
      outputs = self.bert(input_ids, attention_mask)
      predictions = self.classifier(outputs.last_hidden_state[:, 0, :])

      return predictions

In [6]:
loss_fn = nn.CrossEntropyLoss()

In [7]:
import tqdm

In [8]:
model = TEXTUAL_BERT(3).to(device)

model.eval()
total_loss = 0
predictions = []
gold_labels = []
misclassifications = []

with torch.no_grad():
    for batch in tqdm.tqdm(dataloader):

        #embed(); raise
        input_ids = batch['input_ids'].long().to(device)
        attn_masks = batch['attn_mask'].long().to(device)
        gold_label = batch['rating'].to(device)
        outputs = model(input_ids, attn_masks)
        
        gold_labels.extend(gold_label.cpu().numpy())

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

        _, predicted_labels = torch.max(outputs, dim=1)
        
        predictions.extend(predicted_labels.cpu().numpy())

        for i in range(len(predicted_labels)):
            if predicted_labels[i] != gold_label[i]:
                misclassification = f"{batch['word'][i]} predicted as {predicted_labels[i]} instead of {gold_label[i]}"
                misclassifications.append(misclassification)
    
average_loss = total_loss / len(dataloader)
print(f'Average Loss: {average_loss}')


100%|██████████| 75/75 [01:52<00:00,  1.50s/it]

Average Loss: 1.115250007311503





In [9]:
report = classification_report(gold_labels, predictions)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.34      0.38       100
           1       0.00      0.00      0.00       100
           2       0.36      0.80      0.50       100

    accuracy                           0.38       300
   macro avg       0.26      0.38      0.29       300
weighted avg       0.26      0.38      0.29       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
class Just_Words_Dataset(Dataset):
    def __init__(self, words_file, tokenizer, regression=False):
        self.words_file = words_file
        
        self.tokenizer = tokenizer

        self.data = pd.read_csv(words_file)
        self.words = self.data['word'].to_list()
        self.encodings = self.tokenizer([word for word in self.words], add_special_tokens=True, padding='longest', return_tensors='pt')
        
        self.labels = self.data['tag'].to_list()
        if regression == True:
            self.ratings = self.data['rating'].to_list()
        else:
            ratings = self.data['rating'].to_list()
            self.ratings = map_values(ratings, self.labels)

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):

        item = {'word': self.words[idx], 'input_ids': self.encodings['input_ids'][idx], 'attn_mask': self.encodings['attention_mask'][idx], 'token_type_ids': self.encodings['token_type_ids'], 'rating': self.ratings[idx],'label': self.labels[idx]}
        
        return item

In [9]:
def split_dataset(csv_file):
    
    file = pd.read_csv(csv_file)
    
    print("total words:",len(file))
    randomized_data = file.sample(frac=1, random_state=42)
    print("total randomized words:",len(randomized_data))

    train = int(len(randomized_data) * 0.8)

    train_data = randomized_data[:train]
    print("total training set:",len(train_data))

    test_data = randomized_data[train:]
    print("total testing set:",len(test_data))

    train_filepath = "train_dataset.csv"
    test_filepath = "test_dataset.csv"
    train_data.to_csv(train_filepath, index=False)
    test_data.to_csv(test_filepath, index=False)

In [10]:
split_dataset('merged_data.csv')

total words: 300
total randomized words: 300
total training set: 240
total testing set: 60


In [39]:
train_data = Multimodal_Dataset('train_dataset.csv', 'images', tokenizer=tokenizer)
test_data = Multimodal_Dataset('test_dataset.csv', 'images', tokenizer=tokenizer)

In [40]:
train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=8, shuffle=False)

In [41]:
for batch in train_dataloader:
    print(batch)
    break

{'word': ['simplification', 'spotter', 'peacetime', 'hill', 'decontamination', 'deflator', 'diatribe', 'backhandedness'], 'input_ids': tensor([[  101, 21934, 24759,  9031,   102,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  3962,  3334,   102,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  3521,  7292,   102,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  2940,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101, 21933, 12380, 22311,  3508,   102,     0,     0,     0,     0,
             0,     0],
        [  101, 13366, 20051,  2953,   102,     0,     0,     0,     0,     0,
             0,     0],
        [  101, 22939, 18886,  4783,   102,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  2067, 11774,  2098,  2791,   102,     0,     0,     0,     0,
             0,     0]]), 'attn_mask': tensor([[1, 1, 1, 1, 1, 0,

In [87]:
model = TEXTUAL_BERT(3).to(device)
epochs = 20
model.train()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(epochs):
    total_loss = 0
    for i, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].long().to(device)
        attn_masks = batch['attn_mask'].long().to(device)
        gold_label = batch['rating'].to(device)
    
        outputs = model(input_ids, attn_masks)

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

        print("epoch:",epoch, "loss:", total_loss/(i+1), end='\r')

        loss.backward()

        optimizer.step()

        optimizer.zero_grad()


epoch: 19 loss: 1.1007295115788778

In [88]:
total_loss = 0
predictions = []
gold_labels = []

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = torch.Tensor(batch['input_ids']).long().to(device)
        attn_masks = torch.Tensor(batch['attn_mask']).long().to(device)
        gold_label = batch['rating'].to(device)

        outputs = model(input_ids, attn_masks)
       
        gold_labels.extend(gold_label.cpu().numpy())

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

        _, predicted_labels = torch.max(outputs, dim=1)
        
        predictions.extend(predicted_labels.cpu().numpy())

average_loss = total_loss / len(test_dataloader)
print(f'Average Loss: {average_loss}')

Average Loss: 1.0994298897291486


In [89]:
report = classification_report(gold_labels, predictions)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       371
           1       0.00      0.00      0.00       414
           2       0.35      1.00      0.51       415

    accuracy                           0.35      1200
   macro avg       0.12      0.33      0.17      1200
weighted avg       0.12      0.35      0.18      1200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
regression_data = Multimodal_Dataset('merged_data.csv', 'images', tokenizer=tokenizer, regression=True)
regression_dataloader = DataLoader(regression_data, batch_size=batch_size, shuffle=True)

In [31]:
model = TEXTUAL_BERT(1).to(device)
total_loss = 0
predictions = []
gold_labels = []
loss_fn = nn.MSELoss()

model.eval()
with torch.no_grad():
    for batch in regression_dataloader:
        input_ids = torch.Tensor(batch['input_ids']).long().to(device)
        attn_masks = torch.Tensor(batch['attn_mask']).long().to(device)
        gold_label = batch['rating'].to(device)
    
        outputs = model(input_ids, attn_masks)
        
        gold_labels.extend(gold_label.cpu().numpy())

        loss = loss_fn(outputs, gold_label.unsqueeze(1))
        total_loss += loss.item()

        _, predicted_labels = torch.max(outputs, dim=1)
        #print(gold_label.size(), predicted_labels.size())
        predictions.extend(predicted_labels.cpu().numpy())

average_loss = total_loss / len(regression_dataloader)
print(f'Average Loss: {average_loss}')

Average Loss: 9.560116449147326


In [32]:
from scipy.stats import pearsonr

correlation_coefficient, p_value = pearsonr(gold_labels, predictions)
#The model sometimes predicts only one value and as a result the pearson correlation cannot be computed
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

Pearson Correlation Coefficient: nan
P-value: nan




In [33]:
mse = mean_squared_error(gold_labels, predictions, squared=False)
mae = mean_absolute_error(gold_labels, predictions)

print(f"MSE: {mse.item():.4f}")
print(f"MAE: {mae.item():.4f}")

MSE: 3.2834
MAE: 3.0588


**Visual BERT**

In [10]:
class VISUAL_BERT(nn.Module):
    def __init__(self):
        super(VISUAL_BERT, self).__init__()
        self.visual_bert = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.classifier = nn.Linear(self.visual_bert.config.hidden_size, 3)

    def forward(self, input_ids, attn_masks, token_type_ids, visual_embeddings):
        
        visual_token_type_ids = torch.ones(visual_embeddings.shape[:-1], dtype=torch.long).to(device)
        visual_attention_mask = torch.ones(visual_embeddings.shape[:-1], dtype=torch.float).to(device)
        
        outputs = self.visual_bert(input_ids=input_ids, attention_mask=attn_masks, token_type_ids=token_type_ids, visual_embeds=visual_embeddings, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        predictions = self.classifier(outputs.last_hidden_state[:, 0, :])

        return predictions

In [11]:
language_and_vision_model = VISUAL_BERT().to(device)

loss_fn = nn.CrossEntropyLoss()

language_and_vision_model.eval()
total_loss = 0
predictions_visual = []
gold_labels_visual = []

with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids'].long().to(device)
        
        attn_masks = batch['attn_mask'].long().to(device)
        token_type_ids = torch.mean(batch['token_type_ids'].float(), dim=1).to(device)
        token_type_ids = token_type_ids.long().to(device)
        
        visual_embeddings = batch['visual_embeddings'].to(device)
        visual_embeddings = visual_embeddings.squeeze(3)
        
        gold_label = batch['rating'].to(device)

        outputs = language_and_vision_model(input_ids, attn_masks, token_type_ids, visual_embeddings.squeeze(3))

        gold_labels_visual.extend(gold_label.cpu().numpy())
        
        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

        _, predicted_labels = torch.max(outputs, dim=1)
        
        predictions_visual.extend(predicted_labels.cpu().numpy())

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

average_loss = total_loss / len(dataloader)
print(f'Average Loss: {average_loss}')

Average Loss: 2.251596725781759


In [12]:
report = classification_report(gold_labels_visual, predictions_visual)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       100
           1       0.33      1.00      0.50       100
           2       0.00      0.00      0.00       100

    accuracy                           0.33       300
   macro avg       0.11      0.33      0.17       300
weighted avg       0.11      0.33      0.17       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
language_and_vision_model = VISUAL_BERT().to(device)
epochs = 2

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(language_and_vision_model.parameters(), lr=1e-4)

language_and_vision_model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].long().to(device)
        attn_masks = batch['attn_mask'].long().to(device)
        token_type_ids = torch.mean(batch['token_type_ids'].float(), dim=1).to(device)
        token_type_ids = token_type_ids.long().to(device)
        
        visual_embeddings = batch['visual_embeddings'].to(device)
        visual_embeddings = visual_embeddings.view(8, 12, 2048)
        
        gold_label = batch['rating'].to(device)
    
        outputs = language_and_vision_model(input_ids, attn_masks, token_type_ids, visual_embeddings)

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

        print("epoch:",epoch, "loss:", total_loss/(i+1), end='\r')

        loss.backward()

        optimizer.step()

        optimizer.zero_grad()

epoch: 1 loss: 32.379753947257996

In [51]:
language_and_vision_model.eval()
total_loss = 0
predictions_visual = []
gold_labels_visual = []

with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids'].long().to(device)
        
        attn_masks = batch['attn_mask'].long().to(device)
        token_type_ids = torch.mean(batch['token_type_ids'].float(), dim=1).to(device)
        token_type_ids = token_type_ids.long().to(device)
        
        visual_embeddings = batch['visual_embeddings'].to(device)
        visual_embeddings = visual_embeddings.view(4, 12, 2048)
        
        gold_label = batch['rating'].to(device)

        outputs = language_and_vision_model(input_ids, attn_masks, token_type_ids, visual_embeddings)

        gold_labels_visual.extend(gold_label.cpu().numpy())
        
        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

        _, predicted_labels = torch.max(outputs, dim=1)
        
        predictions_visual.extend(predicted_labels.cpu().numpy())

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

average_loss = total_loss / len(dataloader)
print(f'Average Loss: {average_loss}')

Average Loss: 1.885128639539083


In [52]:
report = classification_report(gold_labels_visual, predictions_visual)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.99      0.65       100
           1       0.73      0.08      0.14       100
           2       0.76      0.64      0.70       100

    accuracy                           0.57       300
   macro avg       0.66      0.57      0.50       300
weighted avg       0.66      0.57      0.50       300

