In this notebook, a **VQA** model is implemented using **PyTorch** library.

- Question features are extracted using
  - **DistilBert** (fine-tuned on the dataset)
- Image features are available in the dataset.
- The question and image features are fused with
  - **Concatenation**
- The correct answer is predicted with a Dense layer.

**Best Validation Accuracy: 0.927**

# Imports

In [1]:
!pip install transformers



In [2]:
import pandas as pd
import torch
import pickle
from torch import nn
import json
# from google.colab import drive
from transformers import DistilBertTokenizer, DistilBertModel



# Loading data

## Connecting to drive

In [3]:
# drive.mount('/content/gdrive/', force_remount=True)
# base_path = '/content/gdrive/My Drive/iust/miniVQA/'
# output_path = base_path
base_path = '/kaggle/input/minivqaiust/'
output_path = '/kaggle/working/'

## Setting up GPU

In [4]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
device

device(type='cuda')

## Reading data

### Answers

In [5]:
all_answers = [ 'surfboard', 'eating', 'cake', 'table', 'hat', 'giraffe', 'broccoli', 'woman', 'sunny', 'apple']

### Image features

In [13]:
with open(base_path + 'image_features.pickle', 'rb') as f:
    image_features = pickle.load(f)

### Questions

In [14]:
with open(base_path + 'image_question.json', 'r') as f:
  img_to_q_dict = json.load(f)
  questions = []
  for img_id, img_qs in img_to_q_dict.items():
    for img_q in img_qs:
      q_id, q_text = img_q
      questions.append({
        'q_id': q_id,
        'q_text': q_text,
        'img_id': img_id
      })

questions = sorted(questions, key= lambda q: q['q_id'])

### Subsets

In [15]:
train_csv = pd.read_csv(base_path + 'train.csv', index_col="question_id").sort_index()
train_csv.head()

train_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in train_csv.index.values]
train_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in train_csv.index.values]


train_q = train_csv["question_text"].values.tolist()
train_a = torch.from_numpy(train_csv["label"].values)


In [8]:
valid_csv = pd.read_csv(base_path + 'val.csv', index_col="question_id").sort_index()
valid_csv.head()

valid_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in valid_csv.index.values]
valid_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in valid_csv.index.values]


valid_q = valid_csv["question_text"].values.tolist()
valid_a = torch.from_numpy(valid_csv["label"].values)


In [9]:
test_csv = pd.read_csv(base_path + 'test.csv', index_col="question_id").sort_index()
test_csv.head()

test_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in test_csv.index.values]
test_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in test_csv.index.values]


test_q = test_csv["question_text"].values.tolist()

# Preprocess questions

## Preprocess questions


### Define consts

In [10]:
max_length = 17

In [11]:
# Tokenize
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [16]:
# Apply on train
train_q_input_ids = tokenizer(train_q, max_length=max_length,  truncation=True,padding='max_length', return_tensors="pt").input_ids
train_img = torch.Tensor([image_features[img_id] for img_id in train_csv["image_id"].values])
print('Train image features shape:', train_img.shape)

Train image features shape: torch.Size([780, 512])


  train_img = torch.Tensor([image_features[img_id] for img_id in train_csv["image_id"].values])


In [17]:
# Apply on valid
valid_q_input_ids = tokenizer(valid_q, max_length=max_length,  truncation=True,padding='max_length', return_tensors="pt").input_ids
valid_img = torch.Tensor([image_features[img_id] for img_id in valid_csv["image_id"].values])
print('Valid image features shape:', valid_img.shape)

Valid image features shape: torch.Size([110, 512])


In [19]:
# Apply on test
test_q_input_ids = tokenizer(test_q, max_length=max_length,  truncation=True,padding='max_length', return_tensors="pt").input_ids
test_img = torch.Tensor([image_features[img_id] for img_id in test_csv["image_id"].values])
print('Test image features shape:', test_img.shape)

Test image features shape: torch.Size([110, 512])


## Create dataset and dataloader

In [29]:
batch_size = 64

In [31]:
train_dataset = torch.utils.data.TensorDataset(train_q_input_ids, train_img, train_a)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

In [47]:
valid_dataset = torch.utils.data.TensorDataset(valid_q_input_ids, valid_img, valid_a)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

# Build model

In [25]:
# empty memory
del tokenizer
del image_features
del questions
import gc
gc.collect

<function gc.collect(generation=2)>

In [48]:
class MiniVQA(nn.Module):
    def __init__(self, text_features, image_features):
        super(type(self), self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.linears = nn.Sequential(
            nn.Linear(text_features + image_features, 1024),
            nn.BatchNorm1d(1024),
            nn.Tanh(),
            nn.Linear(1024, 256),
            nn.Tanh(),
            nn.Linear(256, 10),
            nn.Tanh()
        )


    def forward(self, text_input_ids, image):
        text = self.bert(input_ids=text_input_ids).last_hidden_state
        text = torch.flatten(text, start_dim=1)
        features = torch.cat([text, image], dim=1)
        logits = self.linears(features)
        return nn.functional.softmax(logits, dim=1)


In [55]:
miniVQA = MiniVQA(13056 , 512)
miniVQA.to(device)

MiniVQA(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

# Train model

## Define constants

In [35]:
learning_rate = 3e-4
epochs = 5

## Define train loop

In [56]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(miniVQA.parameters(), lr=learning_rate)

In [45]:
def pred_val(model, dataloader):
  size = len(dataloader.dataset)
  correct = 0
  avg_loss = 0
  for batch, (text, image, y) in enumerate(dataloader):
    pred = model(text.to(device), image.to(device))
    loss = loss_fn(pred, y.to(device))
    output = [torch.argmax(o).item() for o in pred]
    correct += (torch.FloatTensor(output) == y).float().sum()
    avg_loss += loss.item()
  acc = correct / size
  return avg_loss, correct, acc

In [53]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    correct = 0
    avg_loss = 0

    for batch, (text, image, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(text.to(device), image.to(device))
        loss = loss_fn(pred, y.to(device))
        # Backpropagation
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        output = [torch.argmax(o).item() for o in pred]
        correct += (torch.FloatTensor(output) == y).float().sum()
        avg_loss += loss.item()

    avg_loss /= (size // 64 + 1)
    acc = correct / size
    val_loss, val_correct, val_acc = pred_val(miniVQA, valid_dataloader)
    print(f"training/ loss: {avg_loss:>7f} | accuracy: {acc}")
    print(f"val/ loss: {val_loss:>7f} | accuracy: {val_acc}")


In [57]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, miniVQA, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
training/ loss: 2.161425 | accuracy: 0.6705127954483032
val/ loss: 4.171316 | accuracy: 0.8454545736312866
Epoch 2
-------------------------------
training/ loss: 2.060923 | accuracy: 0.8717948794364929
val/ loss: 4.084198 | accuracy: 0.8454545736312866
Epoch 3
-------------------------------
training/ loss: 2.018894 | accuracy: 0.9051281809806824
val/ loss: 4.039418 | accuracy: 0.8818181753158569
Epoch 4
-------------------------------
training/ loss: 2.002517 | accuracy: 0.9179487228393555
val/ loss: 4.033000 | accuracy: 0.8727272748947144
Epoch 5
-------------------------------
training/ loss: 1.994862 | accuracy: 0.942307710647583
val/ loss: 3.997197 | accuracy: 0.9272727370262146
Done!


# Predict

In [59]:
pred = miniVQA(test_q_input_ids.to(device), test_img.to(device))
output = [torch.argmax(o).item() for o in pred]

df = pd.DataFrame({
    'question_id': sorted(test_csv.index.values),
    'label': output
})
print(df.label.values)
df.to_csv(output_path + 'minivqa-v2.2-submission.csv', index=False)

[1 1 8 8 9 0 3 3 3 5 2 2 6 7 0 7 4 2 0 9 4 4 4 4 4 0 2 2 7 8 4 2 7 3 1 4 1
 8 1 1 9 3 9 2 5 7 3 6 3 8 0 6 5 5 5 6 8 7 9 0 9 5 9 8 0 7 7 0 6 1 4 9 3 4
 2 6 1 5 8 5 4 2 1 9 6 9 5 3 2 8 0 7 7 9 4 9 1 8 9 1 6 9 0 8 3 5 6 6 9 6]
