In this notebook, a **VQA** model is implemented using **PyTorch** library.

- Question features are extracted using
  - **Word2Vec or FastText Embeddings**
  - **LSTM layers**
- Image features are available in the dataset.
- The question and image features are fused with
  - **Cross attention** (with Multihead attention)
- The correct answer is predicted with a Dense layer.

**Best Validation Accuracy: 0.836**


# Imports

In [34]:
import gensim.downloader as api
import pandas as pd
import torch
import pickle
from torch import nn
import torchtext
import numpy as np
import json
# from google.colab import drive

# Loading data

## Connecting to drive

In [35]:
# drive.mount('/content/gdrive/', force_remount=True)
# base_path = '/content/gdrive/My Drive/iust/miniVQA/'
base_path = '/kaggle/input/minivqaiust/'
output_path = '/kaggle/working/'

## Setting up GPU

In [36]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
device

device(type='cuda')

## Reading data

### Answers

In [37]:
all_answers = [ 'surfboard', 'eating', 'cake', 'table', 'hat', 'giraffe', 'broccoli', 'woman', 'sunny', 'apple']

### Image features

In [38]:
with open(base_path + 'image_features.pickle', 'rb') as f:
    image_features = pickle.load(f)

### Questions

In [39]:
with open(base_path + 'image_question.json', 'r') as f:
  img_to_q_dict = json.load(f)
  questions = []
  for img_id, img_qs in img_to_q_dict.items():
    for img_q in img_qs:
      q_id, q_text = img_q
      questions.append({
        'q_id': q_id,
        'q_text': q_text,
        'img_id': img_id
      })

questions = sorted(questions, key= lambda q: q['q_id'])

### Subsets

In [40]:
train_csv = pd.read_csv(base_path + 'train.csv', index_col="question_id").sort_index()
train_csv.head()

train_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in train_csv.index.values]
train_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in train_csv.index.values]


train_q = train_csv["question_text"].values.tolist()
train_a = torch.from_numpy(train_csv["label"].values)


In [41]:
valid_csv = pd.read_csv(base_path + 'val.csv', index_col="question_id").sort_index()
valid_csv.head()

valid_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in valid_csv.index.values]
valid_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in valid_csv.index.values]


valid_q = valid_csv["question_text"].values.tolist()
valid_a = torch.from_numpy(valid_csv["label"].values)


In [42]:
test_csv = pd.read_csv(base_path + 'test.csv', index_col="question_id").sort_index()
test_csv.head()

test_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in test_csv.index.values]
test_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in test_csv.index.values]


test_q = test_csv["question_text"].values.tolist()

# Create word embeddings layer

## Download model

In [43]:
embedding_model_name = "word2vec-google-news-300"
# embedding_model_name = "fasttext-wiki-news-subwords-300"

In [46]:
embedding_model_name
embedding_model = api.load(embedding_model_name)

## Preprocess questions

In [48]:
max_length = 8

In [49]:
class WordEmbeddings(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, text):
        return self.embedding(text)

In [50]:
# Tokenize
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# Create embedding layer
embed_size = len(embedding_model.get_vector('hello'))
word_embeddings = WordEmbeddings(
    vocab_size = len(embedding_model.index_to_key) + 1,
    embed_dim = embed_size,
)
word_embeddings.embedding.weight.data[0] = torch.zeros(embed_size)
word_embeddings.embedding.weight.data[1:] = torch.from_numpy(embedding_model.vectors)

In [51]:
def encode(x):
    return [embedding_model.get_index(token, default=-1) + 1 for token in tokenizer(x)]

In [53]:
def padify(xs, l = max_length):
    encoded_x = [encode(x) for x in xs]
    return torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in encoded_x])

In [54]:
# Apply on train
train_q_embeddings = word_embeddings(
    padify(train_q)
)
print('Train q embeddings size:', train_q_embeddings.shape)

train_img = torch.Tensor([image_features[img_id] for img_id in train_csv["image_id"].values])
print('Train image features shape:', train_img.shape)

Train q embeddings size: torch.Size([780, 8, 300])
Train image features shape: torch.Size([780, 512])


In [55]:
# Apply on valid
valid_q_embeddings = word_embeddings(
    padify(valid_q)
)
print('Valid q embeddings size:', valid_q_embeddings.shape)

valid_img = torch.Tensor([image_features[img_id] for img_id in valid_csv["image_id"].values])
print('Valid image features shape:', valid_img.shape)

Valid q embeddings size: torch.Size([110, 8, 300])
Valid image features shape: torch.Size([110, 512])


In [56]:
# Apply on test
test_q_embeddings = word_embeddings(
    padify(test_q)
)
print('Test q embeddings size:', test_q_embeddings.shape)

test_img = torch.Tensor([image_features[img_id] for img_id in test_csv["image_id"].values])
print('Test image features shape:', test_img.shape)

Test q embeddings size: torch.Size([110, 8, 300])
Test image features shape: torch.Size([110, 512])


## Create dataset and dataloader

In [57]:
train_dataset = torch.utils.data.TensorDataset(train_q_embeddings, train_img, train_a)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

In [58]:
valid_dataset = torch.utils.data.TensorDataset(valid_q_embeddings, valid_img, valid_a)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=64, shuffle=True)

# Build model

In [59]:
# empty memory
del image_features
import gc
gc.collect

<function gc.collect(generation=2)>

In [60]:
class MiniVQAEncoder(nn.Module):
    def __init__(self, text_size, image_size, nhead=8, dropout=0.3, layer_norm_eps=1e-5):
        super(type(self), self).__init__()
        self.lstms = nn.LSTM(text_size, image_size, num_layers=2)

        self.cross_attn = nn.MultiheadAttention(
            embed_dim = image_size,
            num_heads = nhead,
            kdim = image_size,
            vdim = image_size,
            batch_first=True) # l:seq length n:batch size e:embed size

        self.norm1 = nn.LayerNorm(image_size, eps=layer_norm_eps)
        self.dropout1 = nn.Dropout(dropout)
#         self.linear1 = nn.Linear(image_size, 512)


    def forward(self, text, image):
        text_features = self.lstms(text)[0]
        src2 = self.cross_attn(text_features, image, image)[0]
        src = text_features + self.dropout1(src2)
        src = self.norm1(src)
#         src= self.linear1(src)
        return src

In [61]:
class MiniVQADecoder(nn.Module):
    def __init__(self, features_size, nhead=8):
        super(type(self), self).__init__()
        self.norm = nn.BatchNorm1d(features_size)
        self.linears = nn.Sequential(
            nn.Linear(features_size, 10),
            nn.Tanh()
        )

    def forward(self, features):
        output = torch.flatten(features, start_dim=1)
        output = self.norm(output)
        output = self.linears(output)
        return nn.functional.softmax(output, dim=1)

In [62]:
class MiniVQA(nn.Module):
    def __init__(self, text_size, image_size):
        super(type(self), self).__init__()
        self.encoder = MiniVQAEncoder(text_size=text_size, image_size=image_size)
        self.decoder = MiniVQADecoder(512)

    def forward(self, text, image):
        enc = self.encoder(text, image)
        dec = self.decoder(enc)
        return dec

In [69]:
miniVQA = MiniVQA(embed_size , 64)
miniVQA.to(device)

MiniVQA(
  (encoder): MiniVQAEncoder(
    (lstms): LSTM(300, 64, num_layers=2)
    (cross_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.3, inplace=False)
  )
  (decoder): MiniVQADecoder(
    (norm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (linears): Sequential(
      (0): Linear(in_features=512, out_features=10, bias=True)
      (1): Tanh()
    )
  )
)

# Train model

## Define constants

In [70]:
learning_rate = 1e-3
weight_decay = 1e-5
epochs = 30

## Define train loop

In [71]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(miniVQA.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [66]:
def pred_val(model, dataloader):
  size = len(dataloader.dataset)
  correct = 0
  avg_loss = 0

  for batch, (text, image, y) in enumerate(dataloader):
    image_reshaped = torch.reshape(image, (image.shape[0], 8, image.shape[1]//8))
    pred = model(text.to(device), image_reshaped.to(device))
    loss = loss_fn(pred, y.to(device))
    output = [torch.argmax(o).item() for o in pred]
    correct += (torch.FloatTensor(output) == y).float().sum()
    avg_loss += loss.item()
  acc = correct / size
  return avg_loss, correct, acc

In [67]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    correct = 0
    avg_loss = 0

    for batch, (text, image, y) in enumerate(dataloader):
        image_reshaped = torch.reshape(image, (image.shape[0], 8, image.shape[1]//8))
        pred = model(text.to(device), image_reshaped.to(device))
        loss = loss_fn(pred, y.to(device))
        # Backpropagation
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        output = [torch.argmax(o).item() for o in pred]
        correct += (torch.FloatTensor(output) == y).float().sum()
        avg_loss += loss.item()
    avg_loss /= (size // 64 + 1)
    acc = correct / size
    val_loss, val_correct, val_acc = pred_val(model, valid_dataloader)
    print(f"training/ loss: {avg_loss:>7f} | accuracy: {acc}")
    print(f"val/ loss: {val_loss:>7f} | accuracy: {val_acc}")

In [72]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, miniVQA, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
training/ loss: 2.305539 | accuracy: 0.0923076942563057
val/ loss: 4.598548 | accuracy: 0.11818181723356247
Epoch 2
-------------------------------
training/ loss: 2.298064 | accuracy: 0.12051282078027725
val/ loss: 4.609141 | accuracy: 0.1090909093618393
Epoch 3
-------------------------------
training/ loss: 2.294344 | accuracy: 0.12820513546466827
val/ loss: 4.575342 | accuracy: 0.1818181872367859
Epoch 4
-------------------------------
training/ loss: 2.282018 | accuracy: 0.18589743971824646
val/ loss: 4.535497 | accuracy: 0.27272728085517883
Epoch 5
-------------------------------
training/ loss: 2.251470 | accuracy: 0.3730769157409668
val/ loss: 4.473119 | accuracy: 0.41818180680274963
Epoch 6
-------------------------------
training/ loss: 2.212642 | accuracy: 0.5782051086425781
val/ loss: 4.386770 | accuracy: 0.6272727251052856
Epoch 7
-------------------------------
training/ loss: 2.183817 | accuracy: 0.6435897350311279
val/ loss: 4.323

# Predict

In [None]:
test_image_reshaped = torch.reshape(test_img, (test_img.shape[0], 8, test_img.shape[1]//8))
pred = miniVQA(test_q_embeddings.to(device), test_image_reshaped.to(device))
output = np.array([torch.argmax(o).item() for o in pred], dtype='int64')

df = pd.DataFrame({
    'question_id': sorted(test_csv.index.values),
    'label': output
})
print(df.head())
df.to_csv(base_path + '/minivqa-v3.1-submission.csv', index=False)

# Save model

In [None]:
torch.save(miniVQA.state_dict(), base_path + 'minivqa1_weights.pth')
