## ANNDL_Homework3


### Introduction

After carefully analysing the code presented in class, we searched online for existing solutions and built on the code in this GitHub [repository](https://github.com/moduIo/Relation-Networks), an implementation of Relation Networks for Visual Question Answering on the CLEVR dataset; this repository is in turn inspired by the following [research](https://arxiv.org/pdf/1706.01427.pdf).

We split our dataset into training and validation sets (80-20 split) using the `train_test_split` function from the sklearn library. 

The fixed random seed present in the code makes our results reproducible.

Images are preprocessed: they are resized to 128x128 pixels.


### Image Processing

Since we cannot use Keras' default data augmentation function, we resize all the images manually by calling the *process_image* function. 
We tried to implement a simple form of data augmentation by rotating the tensors (the 128x128 preprocessed images), but we ran into a library problem (we were using a deprecated function).
We searched online for a solution to this problem but did not find one. 
We are fairly confident that simply adding this augmentation would have increased the accuracy. 


### Model fitting 

Since we are using a custom data generator, we have to pass it directly to the model.fit function. 
The data generator function (*load_data_generator*), instead of reading the entire dataset, reads one batch of *n* randomly sampled images at a time (with the corresponding question and answer). 
The accuracy was not that bad, but the model probably overfits quickly since the same images may be randomly sampled more than once.
Another problem, in our opinion, is that some answers appear only a few times in the training dataset, and some of them may end up (after the split) in the validation part. 
As a result, the model is never trained on some cases.

In [None]:
from __future__ import print_function
import json
import os.path
import random as ra
import tensorflow as tf
import numpy as np
import keras
from keras.optimizers import Adam
from keras import backend as K
from keras.layers import Input, Dense, Dropout, BatchNormalization, Reshape, Lambda, Embedding, LSTM, Conv2D, MaxPooling2D, TimeDistributed, RepeatVector, Concatenate
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, TensorBoard
from scipy import ndimage, misc
import imageio
from PIL import Image
import tqdm
import random
from sklearn.model_selection import train_test_split

In [None]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"]="-1" 
import tensorflow as tf
import numpy as np

# Seed every random number generator this notebook relies on so that runs are
# repeatable: Python's `random` (used by get_relation_vectors to pick object
# positions), NumPy, and TensorFlow.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Get current working directory
cwd = os.getcwd()

# Enable GPU memory growth:
# allocate only as much GPU memory as is actually needed instead of all of it.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

In [None]:
# --- Run configuration -------------------------------------------------------

# Upper bound on the number of test questions loaded for prediction.
samples = 7000

# Training schedule.
epochs = 100
batch_size = 64
learning_rate = 2.5e-4

# Text encoding: tokenizer vocabulary cap and padded question length.
vocab_size = 1024
sequence_length = 64

# Size of the raw images fed to the model (rows, columns, channels).
img_rows = 320
img_cols = 480
image_input_shape = (img_rows, img_cols, 3)

# Number of answer classes produced by the final softmax layer.
num_labels = 13

# Module-level cursor; not read by any code visible in this notebook.
current = 0

In [None]:
def process_image(x):
    """Resize the image tensor ``x`` to 128x128 using area interpolation."""
    output_size = (128, 128)  # (height, width)
    return tf.image.resize(x, output_size, method=tf.image.ResizeMethod.AREA)

In [None]:
def get_relation_vectors(x):
    """Build pairwise relation vectors from randomly chosen feature-map cells.

    Up to 25 distinct (row, column) positions of the feature map ``x`` are drawn
    with Python's ``random`` module; the feature vector at each position is
    treated as one "object". Every unordered pair of objects, including an
    object paired with itself, is concatenated into one relation vector.

    The positions are drawn when this function executes, so the same positions
    are used for every sample in the batch.

    Args:
        x: 4-D Keras tensor indexed as ``x[:, row, col, :]``; its row and column
            dimensions must be statically known.

    Returns:
        Keras tensor laid out as [batch, relation_ID, relation_vector].
    """
    shape = K.int_shape(x)
    rows, cols = shape[1], shape[2]

    # Hyperparameter which controls how many objects are considered. It is
    # capped at the number of available cells: asking for more unique positions
    # than exist would make the rejection loop below spin forever.
    k = min(25, rows * cols)

    # Get k unique random objects (rejection sampling on already-seen cells)
    seen = set()
    objects = []
    while len(objects) < k:
        i = ra.randint(0, rows - 1)
        j = ra.randint(0, cols - 1)

        if (i, j) not in seen:
            seen.add((i, j))
            objects.append(x[:, i, j, :])

    # Concatenate each pair of objects (first index <= second index) to form a
    # relation vector
    relations = [
        K.concatenate([first, second], axis=1)
        for idx, first in enumerate(objects)
        for second in objects[idx:]
    ]

    # Restack objects into Keras tensor [batch, relation_ID, relation_vectors]
    return K.permute_dimensions(K.stack(relations, axis=0), [1, 0, 2])

In [None]:
# Callbacks handed to model.fit; early stopping can be switched off here.
early_stop = True
callbacks = []

if early_stop:
    # Stop once val_loss has not improved for 5 epochs and roll back to the
    # best weights seen so far.
    es_callback = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )
    callbacks += [es_callback]

In [None]:
# --- Question encoder: padded token ids -> embedding -> single LSTM vector ---
text_inputs = Input(shape=(sequence_length,), name='text_input')
text_x = Embedding(vocab_size, 128)(text_inputs)
text_x = LSTM(128)(text_x)

# --- Image encoder: in-model resize, then four stride-2 conv + batch-norm blocks ---
image_inputs = Input(shape=image_input_shape, name='image_input')
image_x = Lambda(process_image)(image_inputs)
image_x = Conv2D(24, kernel_size=(3, 3), strides=2, activation='relu')(image_x)
image_x = BatchNormalization()(image_x)
image_x = Conv2D(24, kernel_size=(3, 3), strides=2, activation='relu')(image_x)
image_x = BatchNormalization()(image_x)
image_x = Conv2D(24, kernel_size=(3, 3), strides=2, activation='relu')(image_x)
image_x = BatchNormalization()(image_x)
image_x = Conv2D(24, kernel_size=(3, 3), strides=2, activation='relu')(image_x)
image_x = BatchNormalization()(image_x)
# Static shape of the final feature map; shape[3] is its channel count.
shape = K.int_shape(image_x)

# --- Relation sub-network (g): an MLP applied to a single relation vector ---
# Its input width is two object feature vectors (2 * channels) plus the question vector.
RN_inputs = Input(shape=(1, (2 * shape[3]) + K.int_shape(text_x)[1]))
RN_x = Dense(256, activation='relu')(RN_inputs)
RN_x = Dense(256, activation='relu')(RN_x)
RN_x = Dense(256, activation='relu')(RN_x)
RN_x = Dropout(.5)(RN_x)
RN_outputs = Dense(256, activation='relu')(RN_x)
RN = Model(inputs=RN_inputs, outputs=RN_outputs)

relations = Lambda(get_relation_vectors)(image_x)           # Get tensor [batch, relation_ID, relation_vectors]
question = RepeatVector(K.int_shape(relations)[1])(text_x)  # Repeat the question vector once per relation
relations = Concatenate(axis=2)([relations, question])      # Append the question to every relation: [batch, relation_ID, relation_vector + question_vector]
g = TimeDistributed(RN)(relations)                          # TimeDistributed applies RN to each relation vector.
g = Lambda(lambda x: K.sum(x, axis=1))(g)                   # Sum over relation_ID

# --- Classifier (f): MLP over the summed relation features, softmax over answers ---
f = Dense(256, activation='relu')(g)
f = Dropout(.5)(f)
f = Dense(256, activation='relu')(f)
f = Dropout(.5)(f)
outputs = Dense(num_labels, activation='softmax')(f)
model = Model(inputs=[text_inputs, image_inputs], outputs=outputs)
# Targets are one-hot vectors, hence categorical cross-entropy.
# NOTE(review): `lr` is the older spelling of `learning_rate` — confirm the
# installed Keras version still accepts it.
model.compile(optimizer=Adam(lr=learning_rate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
def load_data_generator(data, n=64, tokenizer=None):
    """Yield batches of (questions, images) and one-hot answers indefinitely.

    The records in ``data`` are consumed in order, ``n`` at a time. When fewer
    than ``n`` records remain the cursor wraps back to the start, so every
    batch has exactly ``n`` samples (unless ``data`` itself is shorter than
    ``n``).

    Args:
        data: List of question records. Each record is read through the keys
            ``'question'``, ``'answer'`` and ``'image_filename'``.
        n: Batch size.
        tokenizer: Optional, already fitted Keras ``Tokenizer``. When omitted,
            a tokenizer is fitted ONCE on all questions in ``data`` so that a
            word maps to the same id in every batch. Pass the same tokenizer
            to every generator (and to the test loader) to keep the word ids
            consistent across datasets.

    Yields:
        ``([x_text, x_image], y)`` where ``x_text`` holds the questions encoded
        and padded to ``sequence_length``, ``x_image`` is the array of images
        and ``y`` is the one-hot answer matrix with one column per label.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError('data must contain at least one question')

    path = '/kaggle/input/ann-and-dl-vqa/dataset_vqa/'
    images_path = path + '/train/'

    # Fixed answer -> class index mapping.
    labels = {'0': 0, '1': 1, '10': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7,
              '7': 8, '8': 9, '9': 10, 'no': 11, 'yes': 12}

    # Fit the vocabulary once, not per batch: refitting on each batch would
    # assign different ids to the same word from one batch to the next.
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts([q['question'] for q in data])

    current = 0
    while True:
        # Wrap around before slicing so that a batch is never cut short or
        # empty once the cursor approaches the end of the data.
        if current + n > len(data):
            current = 0
        batch = data[current:current + n]
        current += n

        x_text = []     # List of questions
        x_image = []    # List of images
        y = []          # List of answer class indices
        images = {}     # Per-batch image cache, to minimize number of imread ops

        for q in batch:
            filename = q['image_filename']
            if filename not in images:
                images[filename] = imageio.imread(images_path + filename, pilmode="RGB")

            x_text.append(q['question'])
            x_image.append(images[filename])
            y.append(labels[q['answer']])

        # Encode the questions as padded sequences of word ids
        sequences = tokenizer.texts_to_sequences(x_text)
        x_text = sequence.pad_sequences(sequences, maxlen=sequence_length)

        # Convert x_image to np array
        x_image = np.array(x_image)

        # One-hot encode with a fixed width. Letting to_categorical infer the
        # width from the batch would shrink it whenever the highest class is
        # absent from that batch.
        y = keras.utils.to_categorical(y, len(labels))
        yield ([x_text, x_image], y)

In [None]:
def load_data_test(n, vocab_size, sequence_length, tokenizer=None):
    """Load the first ``n`` test questions together with their images.

    Args:
        n: Maximum number of questions to load from the test JSON file.
        vocab_size: ``num_words`` used when a new tokenizer has to be created.
        sequence_length: Length every encoded question is padded/truncated to.
        tokenizer: Optional Keras ``Tokenizer``. A tokenizer that already has a
            vocabulary is used as-is, so its word ids stay aligned with
            whatever it was fitted on. If none is given (or the given one has
            no vocabulary yet) it is fitted on the test questions.

    Returns:
        ``([x_text, x_image], tokenizer)`` where ``x_text`` is the array of
        padded question encodings, ``x_image`` is the array of images and
        ``tokenizer`` is the tokenizer that was used.
    """
    path = '/kaggle/input/ann-and-dl-vqa/dataset_vqa/'
    questions_path = path + '/test_data.json'
    images_path = path + '/test/'

    x_text = []     # List of questions
    x_image = []    # List of images
    images = {}     # Dictionary of images, to minimize number of imread ops

    # Read the question file and keep the first n entries
    print('Loading data...')

    with open(questions_path) as f:
        data = json.load(f)['questions'][0:n]

    for q in data:
        # Read each image file only once, even if several questions share it
        filename = q['image_filename']
        if filename not in images:
            images[filename] = imageio.imread(images_path + filename, pilmode="RGB")

        x_text.append(q['question'])
        x_image.append(images[filename])

    # Convert question corpus into sequential encoding for LSTM
    print('Processing text data...')
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size)

    # Fit only a tokenizer without a vocabulary. Refitting a tokenizer supplied
    # by the caller would alter its word -> id mapping.
    if not getattr(tokenizer, 'word_index', None):
        tokenizer.fit_on_texts(x_text)

    sequences = tokenizer.texts_to_sequences(x_text)
    x_text = sequence.pad_sequences(sequences, maxlen=sequence_length)

    # Convert x_image to np array
    x_image = np.array(x_image)

    print('Text: ', x_text.shape)
    print('Image: ', x_image.shape)

    return [x_text, x_image], tokenizer

In [None]:
# Location of the training questions.
path = '/kaggle/input/ann-and-dl-vqa/dataset_vqa/'
questions_path = path + '/train_data.json'

# Keep only the list of question records from the JSON document.
with open(questions_path) as f:
    data = json.load(f)['questions']

# 80/20 train/validation split, made repeatable through the fixed seed.
data_train, data_valid = train_test_split(
    data,
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Train from the custom generators. The generators are given `batch_size`
# explicitly so that the number of samples per step always matches the
# steps_per_epoch / validation_steps computed from the same value.
model.fit(load_data_generator(data_train, n=batch_size),
          epochs=epochs,
          steps_per_epoch=len(data_train) // batch_size,
          validation_data=load_data_generator(data_valid, n=batch_size),
          validation_steps=len(data_valid) // batch_size,
          callbacks=callbacks)

In [None]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write predictions to a timestamped CSV file.

    The file is named ``results_<Mon><day>_<HH>-<MM>-<SS>.csv`` and contains an
    ``Id,Category`` header followed by one ``key,value`` line per entry of
    ``results``, in the mapping's iteration order.

    Args:
        results: Mapping from sample id to predicted category.
        results_dir: Directory the file is written to; it is created if it
            does not exist yet.

    Returns:
        Path of the CSV file that was written.
    """
    # Create the output directory on demand instead of failing on open().
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    csv_fname = 'results_' + datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'
    csv_path = os.path.join(results_dir, csv_fname)

    with open(csv_path, 'w') as f:
        f.write('Id,Category\n')
        f.writelines(
            str(key) + ',' + str(value) + '\n'
            for key, value in results.items()
        )

    return csv_path

In [None]:
# Load the test questions/images and keep the tokenizer used to encode them.
test, tok = load_data_test(
    n=samples,
    vocab_size=vocab_size,
    sequence_length=sequence_length,
)

In [None]:
# Class probabilities for every test sample, then the most likely class index.
out_softmax = model.predict(test)
predicted_class = out_softmax.argmax(axis=-1)

# Map each sample position to its predicted class. Iterating over the
# predictions themselves (instead of a hard-coded count) avoids an IndexError
# when there are fewer samples and dropped rows when there are more.
d = {i: int(label) for i, label in enumerate(predicted_class)}

create_csv(d)