In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import torchvision.transforms as transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn

from transformers import BertTokenizer, DistilBertModel, VisualBertModel

import pandas as pd
import numpy as np
import ast, os

from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
batch_size = 4
device = torch.device('cuda:1')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def map_values(ratings, tags):
    for i, tag in enumerate(tags):
        if tag == 'concrete':
            ratings[i] = 0
        elif tag == 'middle':
            ratings[i] = 1
        else:
            ratings[i] = 2
    return ratings

class Multimodal_Dataset(Dataset):
    def __init__(self, words_file, image_file, tokenizer):
        self.words_file = words_file
        self.images = image_file
        self.tokenizer = tokenizer

        self.data = pd.read_csv(words_file)
        self.words = self.data['word'].to_list()
        self.encodings = self.tokenizer([word for word in self.words], add_special_tokens=True, padding='longest', return_tensors='pt')

        self.photos = self.data['photos'].apply(ast.literal_eval)
        self.labels = self.data['tag'].to_list()
        ratings = self.data['rating'].to_list()
        self.ratings = map_values(ratings, self.labels)

        transform_list = [
            transforms.Grayscale(1),
            transforms.Resize((32, 168)),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ]
        self.transform = transforms.Compose(transform_list)

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        images = self.photos[idx]
        imgs = []
        for image in images:
          img_path = os.path.join('images/', image)
          img = Image.open(img_path).convert('RGB')
          img = self.transform(img)
          imgs.append(img)

        # #padding list of images, without this the dataloader results in errors
        while len(imgs) < 12:
            imgs.append(torch.zeros_like(imgs[0]))

        #imgs = torch.stack(imgs)
        #print(f"Word: {self.words[idx]}, Number of images: {len(images)}")

        item = {'word': self.words[idx], 'input_ids': self.encodings['input_ids'][idx], 'attn_mask': self.encodings['attention_mask'][idx], 'token_type_ids': self.encodings['token_type_ids'],'imgs': imgs, 'rating': self.ratings[idx],'label': self.labels[idx]}
        return item

In [3]:
data = Multimodal_Dataset('merged_data.csv', 'images', tokenizer=tokenizer)
data[5]

{'word': 'rack',
 'input_ids': tensor([  101, 14513,   102,     0,     0,     0,     0]),
 'attn_mask': tensor([1, 1, 1, 0, 0, 0, 0]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'imgs': [tensor([[[-0.3647, -0.3412, -0.3333,  ..., -0.2078, -0.1843, -0.2000],
           [-0.3255, -0.3098, -0.3020,  ..., -0.1373, -0.1216, -0.1529],
           [-0.2941, -0.2784, -0.2549,  ..., -0.1294, -0.0510, -0.0431],
           ...,
           [-0.7412, -0.7333, -0.7098,  ..., -0.5059, -0.5059, -0.5373],
           [-0.7098, -0.7020, -0.6706,  ..., -0.5294, -0.4745, -0.4667],
           [-0.6235, -0.6549, -0.6549,  ..., -0.5373, -0.5137, -0.5843]]]),
  tensor([[[-0.8275, -0.8588, -0.9059,  ..., -0.4431, -0.3961, -0.4353],
           [-0.7569, -0.7961, -0.8353,  ..., -0.1451, -0.3020, -0.4196],
          

In [5]:
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)

In [6]:
for i, batch in enumerate(dataloader):
    print(batch)
    break

{'word': ['noncontributing', 'summertime', 'essayist', 'unusualness'], 'input_ids': tensor([[  101,  2512,  8663, 18886,  8569,  3436,   102],
        [  101,  2621,  7292,   102,     0,     0,     0],
        [  101,  9491,  2923,   102,     0,     0,     0],
        [  101,  5866,  2791,   102,     0,     0,     0]]), 'attn_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],

**Textual BERT**

In [7]:
class TEXTUAL_BERT(nn.Module):
  def __init__(self):
      super(TEXTUAL_BERT, self).__init__()
      self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
      self.classifier = nn.Linear(self.bert.config.hidden_size, 3) #the number of out_features was set to 1 for a regression task

  def forward(self, input_ids, attention_mask):
      outputs = self.bert(input_ids, attention_mask)
      predictions = self.classifier(outputs.last_hidden_state[:, 0, :])

      return predictions

In [8]:
loss_fn = nn.CrossEntropyLoss()

In [9]:
model = TEXTUAL_BERT()#.to(device)

model.eval()
total_loss = 0
predictions = []
gold_labels = []
correct = 0
samples = 0

with torch.no_grad():
    for batch in dataloader:
        input_ids = torch.Tensor(batch['input_ids']).long()#.to(device)
        attn_masks = torch.Tensor(batch['attn_mask']).long()#.to(device)
        gold_label = batch['rating']#.to(device)
    
        outputs = model(input_ids, attn_masks)

        gold_labels.extend(gold_label)

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

        _, predicted_labels = torch.max(outputs, dim=1)
        #print(gold_label.size(), predicted_labels.size())
        predictions.extend(predicted_labels)
        correct += (predicted_labels == gold_label).sum().item()
        samples += len(gold_label)

average_loss = total_loss / len(dataloader)
print(f'Average Loss: {average_loss}')


Average Loss: 1.1360401781400045


In [10]:
accuracy = correct / samples
accuracy

0.3333333333333333

In [11]:
report = classification_report(gold_labels, predictions)

print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       100
           1       0.33      1.00      0.50       100
           2       0.00      0.00      0.00       100

    accuracy                           0.33       300
   macro avg       0.11      0.33      0.17       300
weighted avg       0.11      0.33      0.17       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# mse = mean_squared_error(gold_labels, predictions, squared=False)
# mae = mean_absolute_error(gold_labels, predictions)

In [10]:
# print(f"MSE: {mse.item():.4f}")
# print(f"MAE: {mae.item():.4f}")

RMSE: 3.1823
MAE: 2.9540


**Visual BERT**

In [24]:
class VISUAL_BERT(nn.Module):
    def __init__(self):
        super(VISUAL_BERT, self).__init__()
        self.bert_model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.classifier = nn.Linear(self.bert_model.config.hidden_size, 1)

        self.object_detector = fasterrcnn_resnet50_fpn(pretrained=True)

    def forward(self, input_ids, attn_masks, token_type_ids, visual_embeddings):
    
        visual_token_type_ids = torch.ones(visual_embeddings.shape[:-1], dtype=torch.long).to(device)
        visual_attention_mask = torch.ones(visual_embeddings.shape[:-1], dtype=torch.float).to(device)

        outputs = self.bert_model(input_ids=input_ids, attention_mask=attn_masks, token_type_ids=token_type_ids, visual_embeds=visual_embeddings, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids)
        predictions = self.classifier(outputs.last_hidden_state[:, 0, :])

        return predictions

In [48]:
detector = fasterrcnn_resnet50_fpn(pretrained=True)

def get_features(image_batch):
    feature_extractor = nn.Sequential(*list(detector.backbone.children())[:-2])
    visual_embeddings = []
    for images in image_batch:
        image_embeddings = []
        for image in images:
            visual_embedding = feature_extractor(image)
            image_embeddings.append(visual_embedding)
        visual_embeddings.append(torch.stack(image_embeddings))
    return visual_embeddings



In [52]:
for batch in dataloader:
    images = batch['imgs']
    print(len(get_features(images)[0]))
    break

4


In [26]:
language_and_vision_model = VISUAL_BERT().to(device)

language_and_vision_model.eval()
total_loss = 0
predictions = []
gold_labels = []

with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        print(input_ids.size())
        attn_masks = batch['attn_mask'].to(device)
        token_type_ids = torch.mean(batch['token_type_ids'].float(), dim=1).to(device)
        token_type_ids = token_type_ids.long()
        print(token_type_ids.size())
        images = [torch.mean(img, 0).to(device) for img in batch['imgs']]
        #print(len(batch['imgs']))
        gold_label = batch['rating'].to(device)
        #print(gold_label.size())
        outputs = language_and_vision_model(input_ids, attn_masks, token_type_ids, images)

        predictions.extend(outputs.cpu().numpy())
        gold_labels.extend(gold_label.cpu().numpy())

        loss = loss_fn(outputs, gold_label)
        total_loss += loss.item()

average_loss = total_loss / len(dataloader)
print(f'Average Loss: {average_loss}')

torch.Size([4, 7])
torch.Size([4, 7])
visual embeddings 12
tensor([[[ 0.2529,  0.2353,  0.2510,  ...,  0.3314,  0.3510,  0.3471],
         [ 0.2784,  0.2431,  0.2078,  ...,  0.3176,  0.3216,  0.3216],
         [ 0.2471,  0.2373,  0.2373,  ...,  0.2941,  0.2941,  0.2902],
         ...,
         [-0.0059, -0.0294,  0.0216,  ..., -0.2922, -0.2451, -0.2020],
         [ 0.0078, -0.0608, -0.0549,  ..., -0.2588, -0.2216, -0.1882],
         [-0.0706, -0.1373, -0.1216,  ..., -0.1765, -0.1882, -0.2176]],

        [[ 0.0745, -0.0020, -0.0235,  ...,  0.5490,  0.5490,  0.5510],
         [ 0.0451,  0.0235,  0.0118,  ...,  0.5392,  0.5431,  0.5431],
         [ 0.0255,  0.0118,  0.0059,  ...,  0.5020,  0.5078,  0.5137],
         ...,
         [ 0.1451,  0.1510,  0.1510,  ...,  0.4373,  0.4412,  0.4255],
         [ 0.2000,  0.2431,  0.2647,  ...,  0.4118,  0.4275,  0.3824],
         [ 0.2176,  0.2431,  0.2725,  ...,  0.2490,  0.2235,  0.2333]],

        [[ 0.0118,  0.0098,  0.0098,  ...,  0.0333,  0.03

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 4 but got size 12 for tensor number 1 in the list.

In [None]:
rmse = mean_squared_error(gold_labels, predictions, squared=False)
mae = mean_absolute_error(gold_labels, predictions)

In [None]:
print(f"RMSE: {rmse.item():.4f}")
print(f"MAE: {mae.item():.4f}")