# Visual Question Answering

## Installing Libraries

In [None]:
# conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
%pip install ftfy regex tqdm --user
%pip install git+https://github.com/openai/CLIP.git --user
%pip install pandas --user
%pip install wordcloud --user

## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import clip
import json
from torch.utils.data import Dataset, DataLoader
from wordcloud import WordCloud

import modelArchitecture

## Global Variables

In [None]:
# Dataset layout: every input path is derived from INPUT_PATH.
INPUT_PATH = 'vizwiz'
ANNOTATIONS = f'{INPUT_PATH}/Annotations/Annotations'
TRAIN_PATH = f'{INPUT_PATH}/train/train'
VALIDATION_PATH = f'{INPUT_PATH}/val/val'
ANNOTATIONS_TRAIN_PATH = f'{ANNOTATIONS}/train.json'
ANNOTATIONS_VAL_PATH = f'{ANNOTATIONS}/val.json'
# NOTE(review): create_dataframe/load_dataframe append '/dataframes' to this
# value, so '' resolves to '/dataframes' at the filesystem root -- confirm
# that this is the intended location.
OUTPUT_PATH = ''
# Initial value only; reassigned from the training answers in the loading cell.
ANSWER_SPACE = 3129
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

## Processing Data

In [2]:
def create_dataframe(path, output_path, type = 'train'):
    """Expand an annotation JSON file to one row per answer and save it as CSV.

    Each input record carries a list of answers. The output repeats the
    record once per answer, adding an ``answer`` column (the answer text)
    and an ``answer_confidence`` column (1 for 'yes', 0.5 for 'maybe',
    0 for anything else). The original ``answers`` list is kept on every row.

    Args:
        path: Path of the annotation JSON file, read with ``pd.read_json``.
        output_path: Base directory; the CSV is written to
            ``output_path + '/dataframes'``, which is created if missing.
        type: ``'train'`` writes ``train.csv``; any other value writes
            ``val.csv``. (The name shadows the builtin but is kept so that
            keyword callers continue to work.)

    Returns:
        The expanded DataFrame that was written to disk.
    """
    output_path += '/dataframes'
    os.makedirs(output_path, exist_ok=True)

    df = pd.read_json(path)
    df = df[['image', 'question', 'answers', 'answer_type', 'answerable']]

    # Any confidence label other than these two scores 0.
    confidence_scores = {'yes': 1, 'maybe': 0.5}

    # Collect the rows in a plain list and build the frame once; calling
    # pd.concat per answer copies the whole frame each time (quadratic).
    rows = []
    for record in df.to_dict('records'):
        for answer in record['answers']:
            rows.append({
                **record,
                'answer': answer['answer'],
                'answer_confidence': confidence_scores.get(answer['answer_confidence'], 0),
            })

    # Object dtype keeps each cell exactly as produced above, so the CSV text
    # matches what the row-by-row construction wrote (e.g. "1", not "1.0").
    expanded_df = pd.DataFrame(
        rows,
        columns=[*df.columns, 'answer', 'answer_confidence'],
        dtype=object,
    )

    # Save the expanded DataFrame to a CSV file
    file_name = 'train.csv' if type == 'train' else 'val.csv'
    expanded_df.to_csv(output_path + '/' + file_name, index=False)

    return expanded_df
    

def load_dataframe(path, type = 'train'):
    """Read back a CSV stored under ``path + '/dataframes'``.

    ``type == 'train'`` selects ``train.csv``; every other value selects
    ``val.csv``. Returns the DataFrame produced by ``pd.read_csv``.
    """
    csv_name = 'train.csv' if type == 'train' else 'val.csv'
    csv_file = path + '/dataframes' + '/' + csv_name
    return pd.read_csv(csv_file)
    
def plot_histogram(dataframe, column):
    """Show a histogram of ``dataframe[column]``, titled with the column name."""
    values = dataframe[column]
    plt.hist(values)
    plt.title(column)
    plt.show()

def plot_pie(dataframe, column):
    """Show a pie chart of the value frequencies in ``dataframe[column]``.

    Each slice is labelled with the value it counts; the chart is titled
    with the column name.
    """
    counts = dataframe[column].value_counts()
    plt.pie(counts, labels=counts.index)
    plt.title(column)
    plt.show()

def plot_boxplot(dataframe, column):
    """Show a box plot of ``dataframe[column]``, titled with the column name."""
    values = dataframe[column]
    plt.boxplot(values)
    plt.title(column)
    plt.show()

def plot_wordcloud(dataframe, column):
    """Show a word cloud built from every value in ``dataframe[column]``.

    Args:
        dataframe: DataFrame holding the text column.
        column: Name of the column whose values are joined into one text.
    """
    # A column read back from CSV can contain NaN (empty or "n/a"-like cells)
    # or numeric cells; str.join only accepts strings, so drop the missing
    # values and cast the rest before joining.
    text = " ".join(dataframe[column].dropna().astype(str))
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    min_font_size = 10).generate(text)
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()

def explore_dataframe(dataframe):
    """Run the full set of exploratory plots on an expanded annotation frame.

    Draws, in order: pie charts for answer type, answerability and answer
    confidence; a histogram and a box plot of answer confidence; a histogram
    of answerability; word clouds for questions and answers. Finally prints
    the number of distinct answers.
    """
    # (plotting function, column) pairs, in the order they are displayed.
    plots = (
        (plot_pie, 'answer_type'),
        (plot_pie, 'answerable'),
        (plot_pie, 'answer_confidence'),
        (plot_histogram, 'answer_confidence'),
        (plot_boxplot, 'answer_confidence'),
        (plot_histogram, 'answerable'),
        (plot_wordcloud, 'question'),
        (plot_wordcloud, 'answer'),
    )
    for draw, column in plots:
        draw(dataframe, column)

    # Let's see how many distinct answers we have
    distinct_answers = get_number_of_distinct_answers(dataframe)
    print("Number of distinct answers: ", distinct_answers)

def get_number_of_distinct_answers(dataframe):
    """Return how many different values the ``answer`` column holds.

    A missing value counts as one distinct value.
    """
    answers = dataframe['answer']
    return answers.nunique(dropna=False)

In [3]:
# Build the per-answer CSVs (dataframes/train.csv and dataframes/val.csv under
# OUTPUT_PATH). train_df / val_df are reassigned from those CSVs in the
# loading cell that follows.
train_df = create_dataframe(ANNOTATIONS_TRAIN_PATH, OUTPUT_PATH, 'train')
val_df = create_dataframe(ANNOTATIONS_VAL_PATH, OUTPUT_PATH, 'val')

## Loading Dataframes

In [None]:
# Read the expanded (one row per answer) CSVs back from disk.
train_df = load_dataframe(OUTPUT_PATH, 'train')
val_df = load_dataframe(OUTPUT_PATH, 'val')
# Replace the initial ANSWER_SPACE with the number of distinct answers found
# in the training set.
ANSWER_SPACE = get_number_of_distinct_answers(train_df)

## Exploratory Data Analysis

- We first explore the training dataset on its own, then repeat the same analysis on the validation dataset.

### Training Dataframe

In [None]:
# Plots and distinct-answer count for the training split.
explore_dataframe(train_df)

### Validation Dataframe

In [None]:
# Same plots and distinct-answer count for the validation split.
explore_dataframe(val_df)

## Creating Model

In [18]:
# NOTE(review): num_classes is hard-coded to 10 while ANSWER_SPACE is computed
# from the training answers -- confirm whether 10 is a placeholder.
# NOTE(review): model_name "RN50x4" differs from the 'ViT-B/32' variant loaded
# in VizWizDataset -- confirm the two are meant to differ.
model = modelArchitecture.VQAModel(num_classes=10, device= DEVICE, hidden_size=512, model_name="RN50x4")

## Creating Dataset Class

In [None]:
class VizWizDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(image tensor, tokenized question)`` pairs.

    Args:
        data: Indexable collection of samples, each exposing ``'image'``
            (file name) and ``'question'`` entries. A pandas DataFrame is
            accessed row-wise by position; any other sequence is indexed
            directly.
        path: Directory containing the image files.
        device: Device the returned tensors are moved to. Defaults to
            ``'cuda'`` when available, otherwise ``'cpu'``.
    """

    def __init__(self, data, path, device=None):
        self.data = data
        self.path = path
        # Fall back to the CPU instead of failing on hosts without CUDA.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        # clip.load returns a (model, preprocess) pair; only the preprocessing
        # transform is kept here.
        _, self.transform = clip.load('ViT-B/32', device=self.device)
        self.tokenizer = clip.tokenize

    def __len__(self):
        return len(self.data)

    def _record(self, idx):
        """Return the sample at position ``idx``."""
        # On a DataFrame, data[idx] is a column lookup and fails for an
        # integer index; positional row access goes through .iloc.
        if hasattr(self.data, 'iloc'):
            return self.data.iloc[idx]
        return self.data[idx]

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(img, text)`` on ``self.device``."""
        record = self._record(idx)
        img_path = self.path + '/' + record['image']
        # NOTE(review): self.transform is not applied; the image is used at
        # whatever size it has on disk -- confirm this is intended before
        # batching or feeding it to a CLIP encoder.
        img = plt.imread(img_path)
        # Move the last axis first (assumes an H x W x C array -- TODO confirm
        # for grayscale inputs), then scale by 1/255.
        img = torch.tensor(img).permute(2, 0, 1)
        img = img / 255.0
        # Leading singleton dimension is kept so the returned shape matches
        # what existing callers receive.
        img = img.unsqueeze(0)
        img = img.to(self.device)
        text = self.tokenizer(record['question']).to(self.device)
        return img, text

## Training Model