In this notebook, a **VQA** model is implemented using **PyTorch** library.

- Question features are extracted using
  - **Word2Vec or FastText Embeddings**
  - **LSTM layers**
- Image features are available in the dataset.
- The question and image features are fused with
  - **Concatenation**
- The correct answer is predicted with a Dense layer.

**Best Validation Accuracy: 0.763**

# Imports

In [None]:
import gensim.downloader as api
import pandas as pd
import torch
import pickle
from torch import nn
import torchtext
import os
import collections
import numpy as np
import json
from google.colab import drive

# Loading data

## Connecting to drive

In [None]:
drive.mount('/content/gdrive/', force_remount=True)
base_path = '/content/gdrive/My Drive/iust/miniVQA/'

Mounted at /content/gdrive/


## Reading data

### Answers

In [None]:
all_answers = [ 'surfboard', 'eating', 'cake', 'table', 'hat', 'giraffe', 'broccoli', 'woman', 'sunny', 'apple']

### Image features

In [None]:
with open(base_path + 'image_features.pickle', 'rb') as f:
    image_features = pickle.load(f)

### Questions

In [None]:
with open(base_path + 'image_question.json', 'r') as f:
  img_to_q_dict = json.load(f)
  questions = []
  for img_id, img_qs in img_to_q_dict.items():
    for img_q in img_qs:
      q_id, q_text = img_q
      questions.append({
        'q_id': q_id,
        'q_text': q_text,
        'img_id': img_id
      })

questions = sorted(questions, key= lambda q: q['q_id'])

### Subsets

In [None]:
train_csv = pd.read_csv(base_path + 'train.csv', index_col="question_id").sort_index()
train_csv.head()

train_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in train_csv.index.values]
train_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in train_csv.index.values]


train_q = train_csv["question_text"].values.tolist()
train_a = torch.from_numpy(train_csv["label"].values)


In [None]:
valid_csv = pd.read_csv(base_path + 'val.csv', index_col="question_id").sort_index()
valid_csv.head()

valid_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in valid_csv.index.values]
valid_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in valid_csv.index.values]


valid_q = valid_csv["question_text"].values.tolist()
valid_a = torch.from_numpy(valid_csv["label"].values)


In [None]:
test_csv = pd.read_csv(base_path + 'test.csv', index_col="question_id").sort_index()
test_csv.head()

test_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in test_csv.index.values]
test_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in test_csv.index.values]


test_q = test_csv["question_text"].values.tolist()

# Create word embeddings layer

## Download model

In [None]:
# embedding_model_name = "word2vec-google-news-300"
embedding_model_name = "fasttext-wiki-news-subwords-300"

In [None]:
embedding_model = api.load(embedding_model_name)



## Preprocess questions

In [None]:
class WordEmbeddings(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, text):
        return self.embedding(text)

In [None]:
# Tokenize
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# Create embedding layer
embed_size = len(embedding_model.get_vector('hello'))
word_embeddings = WordEmbeddings(
    vocab_size = len(embedding_model.index_to_key) + 1,
    embed_dim = embed_size,
)
word_embeddings.embedding.weight.data[0] = torch.zeros(embed_size)
word_embeddings.embedding.weight.data[1:] = torch.from_numpy(embedding_model.vectors)

In [None]:
def encode(x):
  return [embedding_model.get_index(token, default=-1) + 1 for token in tokenizer(x)]

In [None]:
def padify(xs, l = 15):
    encoded_x = [encode(x) for x in xs]
    return torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in encoded_x])

In [None]:
# Apply on train
train_q_embeddings = word_embeddings(
    padify(train_q)
)
print('Train q embeddings size:', train_q_embeddings.shape)

train_img = torch.Tensor([image_features[img_id] for img_id in train_csv["image_id"].values])
print('Train image features shape:', train_img.shape)

Train q embeddings size: torch.Size([780, 15, 300])
Train image features shape: torch.Size([780, 512])


  train_img = torch.Tensor([image_features[img_id] for img_id in train_csv["image_id"].values])


In [None]:
# Apply on valid
valid_q_embeddings = word_embeddings(
    padify(valid_q)
)
print('Valid q embeddings size:', valid_q_embeddings.shape)

valid_img = torch.Tensor([image_features[img_id] for img_id in valid_csv["image_id"].values])
print('Valid image features shape:', valid_img.shape)

Valid q embeddings size: torch.Size([110, 15, 300])
Valid image features shape: torch.Size([110, 512])


In [None]:
# Apply on test
test_q_embeddings = word_embeddings(
    padify(test_q)
)
print('Test q embeddings size:', test_q_embeddings.shape)

test_img = torch.Tensor([image_features[img_id] for img_id in test_csv["image_id"].values])
print('Test image features shape:', test_img.shape)

Test q embeddings size: torch.Size([110, 15, 300])
Test image features shape: torch.Size([110, 512])


## Create dataset and dataloader

In [None]:
train_dataset = torch.utils.data.TensorDataset(train_q_embeddings, train_img, train_a)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
valid_dataset = torch.utils.data.TensorDataset(valid_q_embeddings, valid_img, valid_a)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=64, shuffle=True)

# Build model

In [None]:
# empty memory
del embedding_model
del image_features
del questions
import gc
gc.collect

<function gc.collect(generation=2)>

In [None]:
class MiniVQA(nn.Module):
    def __init__(self, lstm_input_size, linear_features_size):
        super(type(self), self).__init__()
        self.lstms = nn.LSTM(lstm_input_size, linear_features_size, num_layers=1)

        self.linears = nn.Sequential(
            nn.Linear(linear_features_size * 15 + 512, 128),
            nn.Dropout(0.2),
            nn.BatchNorm1d(128),
            nn.Tanh(),
            nn.Linear(128, 10),
            nn.Tanh()
        )


    def forward(self, text, image):
        text_features = self.lstms(text)[0]
        text_features= torch.flatten(text_features, start_dim=1)
        features = torch.cat([text_features, image], dim=1)
        logits = self.linears(features)
        return nn.functional.softmax(logits, dim=1)


In [None]:
miniVQA = MiniVQA(embed_size , 512)
print(miniVQA)

MiniVQA(
  (lstms): LSTM(300, 512)
  (linears): Sequential(
    (0): Linear(in_features=8192, out_features=128, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Tanh()
    (4): Linear(in_features=128, out_features=10, bias=True)
    (5): Tanh()
  )
)


# Train model

## Define constants

In [None]:
learning_rate = 3e-4
weight_decay = 4e-5
epochs = 15

## Define train loop

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(miniVQA.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [32]:
def pred_val(model, dataloader):
  size = len(dataloader.dataset)
  correct = 0
  avg_loss = 0
  for batch, (text, image, y) in enumerate(dataloader):
    pred = model(text, image)
    loss = loss_fn(pred, y)
    output = [torch.argmax(o).item() for o in pred]
    correct += (torch.FloatTensor(output) == y).float().sum()
    avg_loss += loss.item()
  acc = correct / size
  return avg_loss, correct, acc

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    correct = 0
    avg_loss = 0
    for batch, (text, image, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(text, image)
        loss = loss_fn(pred, y)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        output = [torch.argmax(o).item() for o in pred]
        correct += (torch.FloatTensor(output) == y).float().sum()
        avg_loss += loss.item()

    avg_loss /= (size // 64 + 1)
    acc = correct / size
    val_loss, val_correct, val_acc = pred_val(miniVQA, valid_dataloader)
    print(f"training / loss: {avg_loss:>7f} | accuracy: {acc}")
    print(f"val / loss: {val_loss:>7f} | accuracy: {val_acc}")

In [33]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, miniVQA, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
training / loss: 2.286499 | accuracy: 0.19871795177459717
val / loss: 4.557486 | accuracy: 0.2545454502105713
Epoch 2
-------------------------------
training / loss: 2.268170 | accuracy: 0.31282052397727966
val / loss: 4.528966 | accuracy: 0.33636364340782166
Epoch 3
-------------------------------
training / loss: 2.251997 | accuracy: 0.4115384519100189
val / loss: 4.498886 | accuracy: 0.44545453786849976
Epoch 4
-------------------------------
training / loss: 2.238927 | accuracy: 0.44999998807907104
val / loss: 4.469762 | accuracy: 0.4727272689342499
Epoch 5
-------------------------------
training / loss: 2.221536 | accuracy: 0.550000011920929
val / loss: 4.442209 | accuracy: 0.4727272689342499
Epoch 6
-------------------------------
training / loss: 2.205818 | accuracy: 0.6128205060958862
val / loss: 4.416873 | accuracy: 0.6181818246841431
Epoch 7
-------------------------------
training / loss: 2.197567 | accuracy: 0.6769230961799622
val /

# Predict

In [34]:
pred = miniVQA(test_q_embeddings, test_img)
output = np.array([torch.argmax(o).item() for o in pred], dtype='int64')
df = pd.DataFrame({
    'question_id': sorted(test_csv.index.values),
    'label': output
})
print(df.head())
df.to_csv(base_path + '/minivqa1-submission.csv', index=False)

   question_id  label
0       144000      1
1       436017      1
2       706000      8
3      1497002      8
4      1518004      2


# Save model

In [35]:
torch.save(miniVQA.state_dict(), base_path + 'minivqa1_weights.pth')
