## Mount Drive (optional)

In [0]:
# Run this cell, then jump to "Importing libraries" to skip mounting Drive.
# While workdir stays None, the preprocessing cells below neither look for nor
# write their cached .pt files.
workdir = None

In [0]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import os
# Make the project folder on the mounted Drive the current working directory,
# so relative paths used later resolve inside it.
os.chdir("/content/gdrive/My Drive/CS_543_Final") # edit as needed

In [0]:
ls

In [0]:
from pathlib import Path
# Relative path to the current directory; the preprocessing cells below use it
# to locate and store their cached .pt feature files.
workdir = Path('.')

## Importing libraries:

In [0]:
!pip install transformers
!pip install easy-vqa

In [0]:
from easy_vqa import get_train_questions, get_test_questions, get_train_image_paths, get_test_image_paths
from PIL import Image
import numpy as np
from tqdm.notebook import tqdm

## Importing data from the Easy-VQA library:

In [0]:
# Getting questions, answers and image IDs for the train and test splits.
# Each call is unpacked into three sequences; presumably entry i of each
# sequence describes the same question (later cells index them together).
train_questions, train_answers, train_image_ids = get_train_questions()
test_questions, test_answers, test_image_ids = get_test_questions()

In [0]:
# Show the first training example: its question, answer and image ID.
example_fields = (
    ('Question:', train_questions[0]),
    ('Answer:', train_answers[0]),
    ('Image ID:', train_image_ids[0]),
)

print('Example:')
for label, value in example_fields:
    print(label, value)

In [0]:
train_image_paths = get_train_image_paths()
test_image_paths = get_test_image_paths()


def _open_images(image_ids, image_paths):
    """Open each referenced image once and return {image_id: PIL image}.

    `image_ids` holds one entry per question, so an id repeats whenever
    several questions refer to the same image. Skipping ids that are already
    loaded avoids re-opening the same file, while the dict keeps the order in
    which ids are first seen (the same key order the per-question loop built).
    """
    images = {}
    for idx in image_ids:
        if idx not in images:
            images[idx] = Image.open(image_paths[idx])
    return images


train_images = _open_images(train_image_ids, train_image_paths)
test_images = _open_images(test_image_ids, test_image_paths)

In [0]:
# Display the training image stored under image ID 0 as the cell output.
train_images[0]

In [0]:
# Display the test image stored under image ID 0 as the cell output.
test_images[0]

# Getting Image Features 
This section takes each image and extracts its feature vector.



In [0]:

import torch
import torchvision as tv
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch.utils.data as data
import os 
import csv
from time import perf_counter

# Plot one sample training image using explicit figure/axes objects.
img = train_images[0]
fig, ax = plt.subplots()
ax.imshow(img)
plt.show()


## Preprocessing images


In [0]:
# Plain tensor conversion, used only to gather the normalization statistics.
transform_to_tensor = tv.transforms.Compose([tv.transforms.ToTensor()])

# Convert every training image to a tensor.
train_tensors = []
for img in train_images.values():
    train_tensors.append(transform_to_tensor(img))

loader = data.DataLoader(train_tensors, batch_size=10, num_workers=0, shuffle=False)

# Accumulate per-image channel statistics; the final batch may be smaller, so
# the totals are divided by the dataset size rather than the batch count.
train_mean = 0.
train_std = 0.
for batch in loader:
    # Keep the first two dimensions and collapse everything after them.
    flat = batch.reshape(batch.size(0), batch.size(1), -1)
    train_mean = train_mean + flat.mean(dim=2).sum(dim=0)
    train_std = train_std + flat.std(dim=2).sum(dim=0)

num_train_images = len(loader.dataset)
train_mean = train_mean / num_train_images
train_std = train_std / num_train_images

print("train mean, train std: ",train_mean,train_std)

# Final preprocessing: tensor conversion followed by normalization with the
# training-set statistics (also applied to the test images).
# NOTE(review): torchvision's detection models document that they expect
# inputs in the 0-1 range and normalize internally — confirm that normalizing
# here as well is intended.
transform = tv.transforms.Compose([
    tv.transforms.ToTensor(),
    tv.transforms.Normalize(mean=train_mean, std=train_std),
])

train_dataset = [transform(image) for image in train_images.values()]
test_dataset = [transform(image) for image in test_images.values()]



In [0]:

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Pretrained Mask R-CNN with a ResNet-50 FPN backbone, moved to `device` and
# switched to inference mode.
rcnn_model = tv.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
rcnn_model = rcnn_model.to(device)
rcnn_model.eval()

# Top-level submodules a forward hook can be attached to.
# (print(rcnn_model) would show the full architecture instead.)
print(rcnn_model._modules.keys())

In [0]:

# options for layer are (['transform', 'backbone', 'rpn', 'roi_heads'])
layer = 'rpn'
my_module = rcnn_model._modules.get(layer)

# Buffer the hook writes into. The feature-extraction cells below rebind this
# global to a fresh tensor before every forward pass.
my_embedding = torch.zeros([1000,4])

def fun(m, i, o):
    """Forward hook: copy part of the module output into `my_embedding`.

    Takes the module (m), its inputs (i) and its outputs (o), and copies
    o[0][0] into the global `my_embedding` buffer. For the 'rpn' module this
    is presumably the set of box proposals for the first image — confirm
    against torchvision's RegionProposalNetwork. `copy_` requires o[0][0] to
    be broadcastable to the buffer shape [1000, 4].
    """
    my_embedding.copy_(o[0][0]) # declare what to hook here

# Testing the hook with one image. Taking an already-transformed tensor from
# train_dataset (instead of re-transforming `img` in place) keeps this cell
# safe to re-run.
img = train_dataset[-1].to(device)

h = my_module.register_forward_hook(fun)
try:
    with torch.no_grad():
        h_x = rcnn_model([img])
finally:
    # Always detach the hook so it cannot fire during later forward passes.
    h.remove()

print("\nimage embeddings size: ",my_embedding.size())

In [0]:
# Extract one [1000, 4] feature tensor per training image from the hooked
# module, or load the features from the cache file when it exists.
train_images_cache = None if workdir is None else workdir / 'preprocessed_images.pt'

if train_images_cache is not None and train_images_cache.exists():
    # map_location lets a cache written on a GPU machine load on a CPU-only one.
    preprocessed_images = torch.load(train_images_cache, map_location=device)
else:
    preprocessed_images = []
    print(len(train_dataset))
    start = perf_counter()

    # Register the hook once for the whole pass. Registering it inside the loop
    # without removing it stacks one more hook per image, so every forward
    # pass would run all previously registered hooks again.
    h = my_module.register_forward_hook(fun)
    try:
        with torch.no_grad():  # inference only, no autograd graph needed
            for i, image in enumerate(train_dataset):
                if i % 100 == 0:
                    print('batch', i)
                # fresh buffer per image; `fun` copies into the global name
                my_embedding = torch.zeros([1000,4])
                rcnn_model([image.to(device)])
                preprocessed_images.append(my_embedding)
    finally:
        h.remove()

    preprocessed_images = torch.stack(preprocessed_images)
    stop = perf_counter()
    print(f"images preprocess time elapsed: {stop - start:.2f}s")

    if train_images_cache is not None:
        with train_images_cache.open(mode='wb') as f:
            # Tensor.to()/cpu() return a new tensor, so save the CPU copy itself.
            torch.save(preprocessed_images.cpu(), f)

In [0]:
# Extract one [1000, 4] feature tensor per test image from the hooked module,
# or load the features from the cache file when it exists.
test_images_cache = None if workdir is None else workdir / 'test_preprocessed_images.pt'

if test_images_cache is not None and test_images_cache.exists():
    # map_location lets a cache written on a GPU machine load on a CPU-only one.
    test_preprocessed_images = torch.load(test_images_cache, map_location=device)
else:
    test_preprocessed_images = []
    print(len(test_dataset))
    start = perf_counter()

    # Register the hook once for the whole pass. Registering it inside the loop
    # without removing it stacks one more hook per image, so every forward
    # pass would run all previously registered hooks again.
    h = my_module.register_forward_hook(fun)
    try:
        with torch.no_grad():  # inference only, no autograd graph needed
            for i, image in enumerate(test_dataset):
                if i % 100 == 0:
                    print('batch', i)
                # fresh buffer per image; `fun` copies into the global name
                my_embedding = torch.zeros([1000,4])
                rcnn_model([image.to(device)])
                test_preprocessed_images.append(my_embedding)
    finally:
        h.remove()

    test_preprocessed_images = torch.stack(test_preprocessed_images)
    stop = perf_counter()
    print(f"images preprocess time elapsed: {stop - start:.2f}s")

    if test_images_cache is not None:
        with test_images_cache.open(mode='wb') as f:
            # Tensor.to()/cpu() return a new tensor, so save the CPU copy itself.
            torch.save(test_preprocessed_images.cpu(), f)

In [0]:
# Show the shape of the stacked test image features as the cell output.
test_preprocessed_images.shape

# Getting Text Features

In [0]:
import torch
from transformers import AutoModel, AutoTokenizer
from time import perf_counter

In [0]:
# Hugging Face checkpoint used to embed the questions.
MODEL_NAME = "distilbert-base-uncased"

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the encoder, then place the encoder on `device`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)
bert_model = bert_model.to(device)

In [0]:
# Display the first training question as the cell output.
train_questions[0]

## Preprocessing questions

In [0]:
# Encode every training question with the transformer model, or load the
# encodings from the cache file when it exists.
train_questions_cache = None if workdir is None else workdir / 'preprocessed_questions.pt'

if train_questions_cache is not None and train_questions_cache.exists():
    # map_location lets a cache written on a GPU machine load on a CPU-only one.
    preprocessed_questions = torch.load(train_questions_cache, map_location=device)
else:
    preprocessed_questions = []
    print(len(train_questions))
    BATCH_SIZE = 64
    start = perf_counter()
    # Step through the questions one batch at a time instead of iterating over
    # every index and breaking on the first empty slice.
    for batch_idx, offset in enumerate(range(0, len(train_questions), BATCH_SIZE)):
        if batch_idx % 100 == 0:
            print('batch', batch_idx)
        question_batch = train_questions[offset:offset + BATCH_SIZE]
        # Pad AND truncate to exactly 12 tokens: the VQA model flattens the
        # output to 12*768 features, so every sequence must have that length.
        # padding="max_length" replaces the deprecated pad_to_max_length=True.
        input_pt = tokenizer.batch_encode_plus(question_batch, return_tensors="pt", max_length=12,
                                               padding="max_length", truncation=True)
        input_pt = input_pt.to(device)
        with torch.no_grad():
            bert_output = bert_model(**input_pt)[0]
        preprocessed_questions.append(bert_output)
    print(len(preprocessed_questions))
    preprocessed_questions = torch.cat(preprocessed_questions)
    stop = perf_counter()
    print(f"questions preprocess time elapsed: {stop - start:.2f}s")

    if train_questions_cache is not None:
        with train_questions_cache.open(mode='wb') as f:
            # Tensor.to()/cpu() return a new tensor, so save the CPU copy itself.
            torch.save(preprocessed_questions.cpu(), f)

In [0]:
# Encode every test question with the transformer model, or load the
# encodings from the cache file when it exists.
test_questions_cache = None if workdir is None else workdir / 'test_preprocessed_questions.pt'

if test_questions_cache is not None and test_questions_cache.exists():
    # map_location lets a cache written on a GPU machine load on a CPU-only one.
    test_preprocessed_questions = torch.load(test_questions_cache, map_location=device)
else:
    test_preprocessed_questions = []
    print(len(test_questions))
    BATCH_SIZE = 64
    start = perf_counter()
    # Step through the questions one batch at a time instead of iterating over
    # every index and breaking on the first empty slice.
    for batch_idx, offset in enumerate(range(0, len(test_questions), BATCH_SIZE)):
        if batch_idx % 100 == 0:
            print('batch', batch_idx)
        question_batch = test_questions[offset:offset + BATCH_SIZE]
        # Pad AND truncate to exactly 12 tokens: the VQA model flattens the
        # output to 12*768 features, so every sequence must have that length.
        # padding="max_length" replaces the deprecated pad_to_max_length=True.
        input_pt = tokenizer.batch_encode_plus(question_batch, return_tensors="pt", max_length=12,
                                               padding="max_length", truncation=True)
        input_pt = input_pt.to(device)
        with torch.no_grad():
            bert_output = bert_model(**input_pt)[0]
        test_preprocessed_questions.append(bert_output)
    print(len(test_preprocessed_questions))
    test_preprocessed_questions = torch.cat(test_preprocessed_questions)
    stop = perf_counter()
    print(f"questions preprocess time elapsed: {stop - start:.2f}s")

    if test_questions_cache is not None:
        with test_questions_cache.open(mode='wb') as f:
            # Tensor.to()/cpu() return a new tensor, so save the CPU copy itself.
            torch.save(test_preprocessed_questions.cpu(), f)

In [0]:
# Show the shape of the concatenated test question encodings as the cell output.
test_preprocessed_questions.shape

# Creating Model

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from easy_vqa import get_answers

In [0]:
# Display the answer list from easy_vqa; a later cell maps each answer to an
# integer class id.
get_answers()

In [0]:
class VQA(nn.Module):
    """Fuses image and question features and classifies the answer.

    Both inputs are flattened and projected to `hidden_dim` by a linear layer
    with ReLU. The two projections are then fused by one of three methods and
    passed through two further linear stages to produce `num_answers` logits.

    Args:
        img_dim: flattened size of one image feature (default 1000*4).
        ques_dim: flattened size of one question feature (default 12*768).
        hidden_dim: width of the projected image/question vectors.
        mid_dim: width of the layer before the classifier.
        num_answers: number of output classes.

    The defaults reproduce the original hard-coded architecture.
    """

    FUSION_METHODS = ('concat', 'avg', 'hadamard')

    def __init__(self, img_dim=1000*4, ques_dim=12*768, hidden_dim=10000,
                 mid_dim=500, num_answers=13):
        super(VQA, self).__init__()

        # remembered so forward() can flatten its inputs to the right width
        self.img_dim = img_dim
        self.ques_dim = ques_dim

        self.lin_img = nn.Sequential(
            nn.Linear(img_dim, hidden_dim),
            nn.ReLU(inplace=True)
            )

        self.lin_ques = nn.Sequential(
            nn.Linear(ques_dim, hidden_dim),
            nn.ReLU(inplace=True)
            )

        # only used by the 'concat' fusion, which doubles the feature width
        self.lin1 = nn.Sequential(
            nn.Linear(2*hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(hidden_dim)
            )

        self.lin2 = nn.Sequential(
            nn.Linear(hidden_dim, mid_dim),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(mid_dim)
            )

        self.lin3 = nn.Linear(mid_dim, num_answers)

    def forward(self, img, ques, method='concat'):
        """Return answer logits of shape (batch, num_answers).

        Args:
            img: image features, reshaped to (-1, img_dim).
            ques: question features, reshaped to (-1, ques_dim).
            method: fusion method, one of 'concat' (concatenate, then lin1),
                'avg' (element-wise mean) or 'hadamard' (element-wise product).

        Raises:
            ValueError: if `method` is not a supported fusion method. Without
                this check an unknown method would fail later with an
                unrelated UnboundLocalError.
        """
        if method not in self.FUSION_METHODS:
            raise ValueError(
                f"unknown fusion method {method!r}; expected one of {self.FUSION_METHODS}"
            )

        img = self.lin_img(img.view(-1, self.img_dim))
        ques = self.lin_ques(ques.view(-1, self.ques_dim))

        if method == 'concat':
            x = self.lin1(torch.cat([img, ques], 1))
        elif method == 'avg':
            x = (img.add(ques))/2
        else:  # 'hadamard'
            x = img * ques

        x = self.lin2(x)
        x = self.lin3(x)

        return x

# Creating Dataloader

In [0]:
TRAIN_BS = 256
TEST_BS = 64

# Build the datasets from CPU tensors; batches are moved to the training
# device inside the training loop. Tensor.to() returns a new tensor instead of
# moving in place, so the result has to be assigned back.
cpu = torch.device("cpu")
preprocessed_images = preprocessed_images.to(cpu)
preprocessed_questions = preprocessed_questions.to(cpu)
test_preprocessed_images = test_preprocessed_images.to(cpu)
test_preprocessed_questions = test_preprocessed_questions.to(cpu)

answers = get_answers()
# convert ['circle', 'green', ...] to {'circle':0, 'green':1, ...}
answer2id = {answer: idx for idx, answer in enumerate(answers)}
id2answer = {v:k for (k,v) in answer2id.items()}

# Each question encoding needs a matching image id and answer.
assert preprocessed_questions.shape[0] == len(train_image_ids) == len(train_answers)
assert test_preprocessed_questions.shape[0] == len(test_image_ids) == len(test_answers)

# One (question features, image features, answer id) triple per question.
# NOTE(review): image features are looked up by image id, which assumes row k
# of the feature tensor belongs to image id k — confirm against the order in
# which the image dictionaries were filled.
train_loader = [
    (ques_feat, preprocessed_images[img_id], answer2id[answer_str])
    for ques_feat, img_id, answer_str
    in zip(preprocessed_questions, train_image_ids, train_answers)
]

train_dataloader = data.DataLoader(train_loader, batch_size=TRAIN_BS,
                                    shuffle=True, num_workers=0,
                                    drop_last=True)

test_loader = [
    (ques_feat, test_preprocessed_images[img_id], answer2id[answer_str])
    for ques_feat, img_id, answer_str
    in zip(test_preprocessed_questions, test_image_ids, test_answers)
]

test_dataloader = data.DataLoader(test_loader, batch_size=TEST_BS,
                                  shuffle=False, num_workers=0,
                                  drop_last=False)

# Training

In [0]:
#Setting up GPU:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

#Calling the model:
model = VQA()
model.to(device)

#Setting the optimizer and loss functions:
optimizer = optim.Adam(model.parameters(), lr = 1e-6)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
loss_fn = nn.CrossEntropyLoss()

epochs = 30
FUSION_METHOD = 'hadamard'  # other options are 'avg' and 'concat'
# one (train loss, train acc, validation loss, validation acc) row per epoch
log = []

for epoch in range(epochs):

    # Reset the counters every epoch so the reported loss and accuracy
    # describe this epoch only, not a running total over all epochs so far.
    total = 0
    correct = 0
    epoch_loss = 0.0

    #Training loop:
    model.train()
    for question_batch, image_batch, answer_batch in tqdm(train_dataloader):
        question_batch, image_batch, answer_batch = question_batch.to(device), image_batch.to(device), answer_batch.to(device)

        # Clear gradients left over from the previous step; without this
        # backward() keeps adding to them across batches.
        optimizer.zero_grad()

        pred = model(image_batch, question_batch, method=FUSION_METHOD)

        loss = loss_fn(pred, answer_batch)
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(pred.detach(), 1)
        total += answer_batch.size(0)
        correct += (predicted == answer_batch).sum().item()
        epoch_loss += loss.item()  # sum of batch losses within this epoch

    # drop_last=True can leave zero training batches on a very small dataset
    acc = 100*correct/total if total else 0.0

    #Evaluation loop:
    model.eval()
    predictions = []
    truths = []
    test_total = 0
    test_correct = 0
    epoch_test_loss = 0.0
    with torch.no_grad():
        for question_batch, image_batch, answer_batch in tqdm(test_dataloader):
            question_batch, image_batch, answer_batch = question_batch.to(device), image_batch.to(device), answer_batch.to(device)
            pred = model(image_batch, question_batch, method=FUSION_METHOD)
            loss = loss_fn(pred, answer_batch)
            _, predicted = torch.max(pred, 1)
            predictions.extend(list(predicted.cpu().numpy()))
            truths.extend(list(answer_batch.cpu().numpy()))
            test_total += answer_batch.size(0)
            test_correct += (predicted == answer_batch).sum().item()
            epoch_test_loss += loss.item()

    test_acc = 100*test_correct/test_total if test_total else 0.0

    # Advance the learning-rate schedule once per epoch; the scheduler was
    # created above but never stepped, so the decay never took effect.
    scheduler.step()

    print('Epoch', epoch+1)
    print(f'Training loss: {epoch_loss:.3f}')
    print(f'Training accuracy: {acc:.2f}%')
    print(f'Validation loss: {epoch_test_loss:.3f}')
    print(f'Validation accuracy: {test_acc:.2f}%')
    print('---------------------------------------------')
    log.append((epoch_loss, acc, epoch_test_loss, test_acc))

In [0]:
from sklearn.metrics import confusion_matrix

# Pass the full label set so the matrix always has one row/column per answer
# id, in id order. Without `labels`, classes that never occur in the last
# epoch's truths or predictions are dropped and row k no longer means id k.
cm = confusion_matrix(truths, predictions, labels=list(range(len(answers))))
# The entries are counts, so write them as integers.
np.savetxt("confusion.csv", cm, delimiter=",", fmt="%d")
print(cm)

In [0]:
# Convert the per-epoch tuples collected during training into a 2-D array
# (one row per epoch).
log = np.array(log)

In [0]:
# Write the training log to training_log.csv, one comma-separated row per epoch.
np.savetxt("training_log.csv", log, delimiter=",")