In this notebook, a **VQA** model is implemented using **PyTorch** library.

- Question features are extracted using
  - **DistilBert** (no fine-tuning)
- Image features are available in the dataset.
- The question and image features are fused with
  - **Concatenation**
- The correct answer is predicted with a Dense layer.

**Best Validation Accuracy: 0.809**

# Imports

In [1]:
!pip install transformers



In [3]:
import pandas as pd
import torch
import pickle
from torch import nn
import json
# from google.colab import drive
from transformers import DistilBertTokenizer, DistilBertModel



# Loading data

## Connecting to drive

In [33]:
# drive.mount('/content/gdrive/', force_remount=True)
# base_path = '/content/gdrive/My Drive/iust/miniVQA/'
# output_path = base_path
base_path = '/kaggle/input/minivqaiust/'
output_path = '/kaggle/working/'

## Setting up GPU

In [6]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
device

device(type='cuda')

## Reading data

### Answers

In [7]:
all_answers = [ 'surfboard', 'eating', 'cake', 'table', 'hat', 'giraffe', 'broccoli', 'woman', 'sunny', 'apple']

### Image features

In [11]:
with open(base_path + 'image_features.pickle', 'rb') as f:
    image_features = pickle.load(f)

### Questions

In [12]:
with open(base_path + 'image_question.json', 'r') as f:
  img_to_q_dict = json.load(f)
  questions = []
  for img_id, img_qs in img_to_q_dict.items():
    for img_q in img_qs:
      q_id, q_text = img_q
      questions.append({
        'q_id': q_id,
        'q_text': q_text,
        'img_id': img_id
      })

questions = sorted(questions, key= lambda q: q['q_id'])

### Subsets

In [13]:
train_csv = pd.read_csv(base_path + 'train.csv', index_col="question_id").sort_index()
train_csv.head()

train_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in train_csv.index.values]
train_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in train_csv.index.values]


train_q = train_csv["question_text"].values.tolist()
train_a = torch.from_numpy(train_csv["label"].values)


In [14]:
valid_csv = pd.read_csv(base_path + 'val.csv', index_col="question_id").sort_index()
valid_csv.head()

valid_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in valid_csv.index.values]
valid_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in valid_csv.index.values]


valid_q = valid_csv["question_text"].values.tolist()
valid_a = torch.from_numpy(valid_csv["label"].values)


In [15]:
test_csv = pd.read_csv(base_path + 'test.csv', index_col="question_id").sort_index()
test_csv.head()

test_csv["question_text"] = [q["q_text"] for q in questions if q['q_id'] in test_csv.index.values]
test_csv["image_id"] = [q["img_id"] for q in questions if q['q_id'] in test_csv.index.values]


test_q = test_csv["question_text"].values.tolist()

# Preprocess questions

## Preprocess questions


### Define consts

In [16]:
max_length = 17

In [17]:
# Tokenize
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Bert model
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [18]:
# Apply on train
train_q_tokens = tokenizer(train_q, max_length=max_length,  truncation=True,padding='max_length', return_tensors="pt")
train_q_features = model(**train_q_tokens)
train_q_features = torch.flatten(train_q_features.last_hidden_state, start_dim=1)
print('Train q features size:', train_q_features.shape)

train_img = torch.Tensor([image_features[img_id] for img_id in train_csv["image_id"].values])
print('Train image features shape:', train_img.shape)

Train q features size: torch.Size([780, 13056])
Train image features shape: torch.Size([780, 512])


  train_img = torch.Tensor([image_features[img_id] for img_id in train_csv["image_id"].values])


In [19]:
# Apply on valid
valid_q_tokens = tokenizer(valid_q, max_length=max_length,  truncation=True,padding='max_length', return_tensors="pt")
valid_q_features = model(**valid_q_tokens)
valid_q_features = torch.flatten(valid_q_features.last_hidden_state, start_dim=1)
print('Valid q features size:', valid_q_features.shape)

valid_img = torch.Tensor([image_features[img_id] for img_id in valid_csv["image_id"].values])
print('Valid image features shape:', valid_img.shape)

Valid q features size: torch.Size([110, 13056])
Valid image features shape: torch.Size([110, 512])


In [20]:
# Apply on test
test_q_tokens = tokenizer(test_q, max_length=max_length,  truncation=True,padding='max_length', return_tensors="pt")
test_q_features = model(**test_q_tokens)
test_q_features = torch.flatten(test_q_features.last_hidden_state, start_dim=1)
print('Test q features size:', test_q_features.shape)

test_img = torch.Tensor([image_features[img_id] for img_id in test_csv["image_id"].values])
print('Test image features shape:', test_img.shape)

Test q features size: torch.Size([110, 13056])
Test image features shape: torch.Size([110, 512])


## Create dataset and dataloader

In [21]:
batch_size = 64

In [22]:
train_dataset = torch.utils.data.TensorDataset(train_q_features, train_img, train_a)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

In [23]:
valid_dataset = torch.utils.data.TensorDataset(valid_q_features, valid_img, valid_a)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

# Build model

In [25]:
# empty memory
del model
del tokenizer
del image_features
del questions
import gc
gc.collect

<function gc.collect(generation=2)>

In [26]:
class MiniVQA(nn.Module):
    def __init__(self, text_features, image_features):
        super(type(self), self).__init__()
        self.linears = nn.Sequential(
            nn.Linear(text_features + image_features, 1024),
            nn.BatchNorm1d(1024),
            nn.Tanh(),
            nn.Linear(1024, 256),
            nn.Tanh(),
            nn.Linear(256, 10),
            nn.Tanh()
        )


    def forward(self, text, image):
        features = torch.cat([text, image], dim=1)
        logits = self.linears(features)
        return nn.functional.softmax(logits, dim=1)


In [27]:
miniVQA = MiniVQA(13056, 512)
miniVQA.to(device)

MiniVQA(
  (linears): Sequential(
    (0): Linear(in_features=13568, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Tanh()
    (3): Linear(in_features=1024, out_features=256, bias=True)
    (4): Tanh()
    (5): Linear(in_features=256, out_features=10, bias=True)
    (6): Tanh()
  )
)

# Train model

## Define constants

In [28]:
learning_rate = 0.0001
epochs = 5

## Define train loop

In [29]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(miniVQA.parameters(), lr=learning_rate)

In [31]:
def pred_val(model, dataloader):
  size = len(dataloader.dataset)
  correct = 0
  avg_loss = 0
  for batch, (text, image, y) in enumerate(dataloader):
    pred = model(text.to(device), image.to(device))
    loss = loss_fn(pred, y.to(device))
    output = [torch.argmax(o).item() for o in pred]
    correct += (torch.FloatTensor(output) == y).float().sum()
    avg_loss += loss.item()
  acc = correct / size
  return avg_loss, correct, acc

In [30]:
from tqdm import tqdm

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    correct = 0
    avg_loss = 0
    pbar = tqdm(total=len(dataloader))

    for batch, (text, image, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(text.to(device), image.to(device))
        loss = loss_fn(pred, y.to(device))
        # Backpropagation
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        output = [torch.argmax(o).item() for o in pred]
        correct += (torch.FloatTensor(output) == y).float().sum()
        avg_loss += loss.item()
        pbar.update(1)
    pbar.close()

    avg_loss /= (size // 64 + 1)
    acc = correct / size
    val_loss, val_correct, val_acc = pred_val(miniVQA, valid_dataloader)
    print(f"training/ loss: {avg_loss:>7f} | accuracy: {acc}")
    print(f"val/ loss: {val_loss:>7f} | accuracy: {val_acc}")


In [32]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, miniVQA, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------


100%|██████████| 13/13 [03:21<00:00, 15.46s/it]


training/ loss: 2.195259 | accuracy: 0.5923076868057251
val/ loss: 4.304997 | accuracy: 0.6909090876579285
Epoch 2
-------------------------------


100%|██████████| 13/13 [03:29<00:00, 16.09s/it]


training/ loss: 2.135771 | accuracy: 0.8371794819831848
val/ loss: 4.254342 | accuracy: 0.7636363506317139
Epoch 3
-------------------------------


100%|██████████| 13/13 [03:25<00:00, 15.83s/it]


training/ loss: 2.110651 | accuracy: 0.8538461327552795
val/ loss: 4.217666 | accuracy: 0.7727272510528564
Epoch 4
-------------------------------


100%|██████████| 13/13 [03:19<00:00, 15.36s/it]


training/ loss: 2.088931 | accuracy: 0.8692307472229004
val/ loss: 4.185241 | accuracy: 0.7909091114997864
Epoch 5
-------------------------------


100%|██████████| 13/13 [03:11<00:00, 14.76s/it]

training/ loss: 2.068467 | accuracy: 0.8884615302085876
val/ loss: 4.160851 | accuracy: 0.8090909123420715
Done!





# Predict

In [36]:
pred = miniVQA(test_q_features.to(device), test_img.to(device))
output = [torch.argmax(o).item() for o in pred]

df = pd.DataFrame({
    'question_id': sorted(test_csv.index.values),
    'label': output
})
print(df.label.values)
df.to_csv(output_path + 'minivqa2-v1-submission.csv', index=False)

[1 1 8 8 4 0 3 3 0 3 2 2 6 7 0 7 4 2 0 9 4 4 4 4 0 0 2 2 2 8 4 9 7 8 1 7 1
 8 9 1 9 3 9 2 5 7 3 6 3 8 0 6 5 7 5 6 8 7 9 5 9 5 7 8 0 7 7 0 6 1 4 5 7 7
 2 3 1 5 8 5 4 2 1 6 7 9 5 3 2 8 0 7 7 7 4 7 1 8 9 1 2 9 4 8 3 5 6 6 7 2]
