# VGG - Glove 300d model
This model performs the image feature extraction with a CNN based on VGG, while the RNN consists of a single LSTM block. The extracted image features and the output of the LSTM are then concatenated and fed into a dense layer. During training we countered overfitting with:
* Dropout: 0.3 after the dense layer
* Weight decay: L2 norm with 1e-5
* Early stopping: with patience 5

Images are resized to 512x256 and we used a batch of size 64 while the learning rate was 1e-3.

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Make the notebook display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

## Model hyperparameters

In [None]:
# Reproducibility seed and input image size (width x height, in pixels)
SEED = 1234
img_w, img_h = 512, 256

# Optimisation settings
bs = 64            # batch size
lr = 1e-3          # learning rate
num_epochs = 100
patience = 5       # patience handed to the early-stopping callback

# Cross-validation settings: with K-fold enabled k is fixed to 5, otherwise
# the number of splits is derived from the validation fraction.
enable_kfold = True
val_split_perc = 0.2
k = 5 if enable_kfold else int(1 / val_split_perc)

model_name = '512x256_noWeight_gloveTrainable300d'

# Seed NumPy and TensorFlow with the same value
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Upper bounds on the number of questions kept and on the tokenizer vocabulary
MAX_NUM_SENTENCES = 100000
MAX_NUM_WORDS = 999888777666

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so that the dataset archive can be
# read from it and the experiment outputs written to it.
drive.mount('/content/drive/')

## Prepare Dataset
First we download the 300-dimensional GloVe embeddings, then we unzip the VQA dataset of the challenge.

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
zip_path = "/content/glove.6B.zip"
if not os.path.exists("/content/glove.6B.300d.txt"):
  !unzip  -q {zip_path}

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary. Each line holds
# a word followed by its space-separated coefficients.
embedding_dim = 300
embeddings_index = {}
# The encoding is stated explicitly so that parsing does not depend on the
# locale's default encoding (the vocabulary contains non-ASCII tokens).
with open("/content/glove.6B.300d.txt", encoding="utf-8") as f:
    for line in f:
        # Split only once: the first field is the word, the rest the vector.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Extract the challenge dataset archive stored on Google Drive, unless a
# VQA_Dataset folder is already present in the current working directory.
zip_path = "/content/drive/MyDrive/challenge3/dataset/anndl-2020-vqa.zip"
cwd = os.getcwd()
if not os.path.exists(os.path.join(cwd, "VQA_Dataset")):
  !unzip  -q {zip_path}

### Dataset Initialization

In [None]:
import pandas as pd 
import json
# Load the training questions/annotations JSON; the file is only needed while
# it is being parsed, so the frame is built after it has been closed.
file_path = "/content/VQA_Dataset/train_questions_annotations.json"
with open(file_path) as train_file:
    dict_dataset = json.load(train_file)

# One row per top-level JSON key; the key itself becomes the "index" column.
dataset_frame = pd.DataFrame.from_dict(dict_dataset, orient='index', dtype=("str", "str"))
dataset_frame = dataset_frame.reset_index(level=0)

# Shuffle the rows and renumber them from 0
shuffled_rows = np.random.permutation(dataset_frame.index)
dataset_frame = dataset_frame.iloc[shuffled_rows].reset_index(drop=True)

In [None]:
# Load the test questions JSON; the frame is built once the file is closed.
test_file_path = "/content/VQA_Dataset/test_questions.json"
with open(test_file_path) as test_file:
    test_dict_dataset = json.load(test_file)

# One row per top-level JSON key; the key itself becomes the "index" column.
test_dataset_frame = pd.DataFrame.from_dict(test_dict_dataset, orient='index', dtype=("str", "str"))
test_dataset_frame = test_dataset_frame.reset_index(level=0)

Map the answers to the classes.

In [None]:
# Answer -> class id mapping (copied from Kaggle). The ids are simply the
# positions of the answers in the list below, so the list order must not change.
labels_dict = {
    answer: class_id
    for class_id, answer in enumerate([
        '0', '1', '2', '3', '4', '5',
        'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket',
        'blue', 'bone', 'book', 'boy', 'brown', 'cat', 'chair', 'couch',
        'dog', 'floor', 'food', 'football', 'girl', 'grass', 'gray', 'green',
        'left', 'log', 'man', 'monkey bars', 'no', 'nothing', 'orange', 'pie',
        'plant', 'playing', 'red', 'right', 'rug', 'sandbox', 'sitting',
        'sleeping', 'soccer', 'squirrel', 'standing', 'stool', 'sunny',
        'table', 'tree', 'watermelon', 'white', 'wine', 'woman', 'yellow',
        'yes',
    ])
}
# Number of output classes of the classifier
num_classes = len(labels_dict)
# Replace every answer string with its integer class id; an answer missing
# from labels_dict raises KeyError.
dataset_frame["answer"] = [labels_dict[x] for x in dataset_frame["answer"]]

In [None]:
# Count how many training samples belong to each answer class.
# A single np.unique pass replaces the per-class boolean scan, whose
# `count[1]` lookup raised IndexError when every sample had the same class.
classes, counts = np.unique(dataset_frame["answer"], return_counts=True)
occurrences_dict = dict(zip(classes, counts))

# Sanity check: the per-class counts must add up to the dataset size shown
# below. The total has its own name so the built-in `sum` is not shadowed
# for the rest of the notebook.
total_occurrences = counts.sum()

total_occurrences
print("\n")
len(dataset_frame)


In [None]:
# Inverse-frequency class weights:
#   weight = n_samples / (n_classes * n_samples_of_the_class)
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: the former `for k in ...` loop overwrote the global `k` holding
# the number of cross-validation folds, which KFold(n_splits=k) reads later.
weight_dict = {
  class_id: (1/occurrences)*(len(dataset_frame))/float(num_classes)
  for class_id, occurrences in occurrences_dict.items()
}

## Word processing

In [None]:
def add_eos(string):
  """Return `string` followed by a space and the "<eos>" end-of-sentence token."""
  return " ".join((string, "<eos>"))

# Remove the question marks and append the end-of-sentence token
dataset_frame["question"] = dataset_frame["question"].map(
    lambda question: add_eos(question.replace("?", ""))
)

# MAX_LEN could be any number; here it is the length (in space-separated
# tokens) of the longest question available.
MAX_LEN = max(len(question.split(" ")) for question in dataset_frame["question"])
print("MAX_LEN={}".format(MAX_LEN))

# Limit the number of questions taken into account
dataset_frame = dataset_frame[:MAX_NUM_SENTENCES]
print(len(dataset_frame))

# Discard the questions that are too long
condition = [len(question.split(" ")) <= MAX_LEN for question in dataset_frame["question"]]
dataset_frame = dataset_frame[condition]


In [None]:
# Remove the question marks and append the end-of-sentence token
test_dataset_frame["question"] = test_dataset_frame["question"].map(
    lambda question: add_eos(question.replace("?", ""))
)

# TEST_MAX_LEN could be any number; here it is the length (in space-separated
# tokens) of the longest test question available.
TEST_MAX_LEN = max(len(question.split(" ")) for question in test_dataset_frame["question"])
print("TEST_MAX_LEN={}".format(TEST_MAX_LEN))

# Limit the number of questions taken into account
test_dataset_frame = test_dataset_frame[:MAX_NUM_SENTENCES]
print(len(test_dataset_frame))

# Discard the questions that are too long
condition = [len(question.split(" ")) <= TEST_MAX_LEN for question in test_dataset_frame["question"]]
test_dataset_frame = test_dataset_frame[condition]

## Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training questions only
question_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
question_tokenizer.fit_on_texts(dataset_frame["question"])
word_to_int = question_tokenizer.word_index

# Encode every training question as a list of word indices
dataset_frame["tokenQuestion"] = question_tokenizer.texts_to_sequences(dataset_frame["question"])
max_question_len = max(map(len, dataset_frame["tokenQuestion"]))

# The padding entry is counted in the number of words as well
num_words = 1 + len(word_to_int)
print("Max question len after preprocessing: {}\nNum of words: {}".format(max_question_len, num_words))

In [None]:
num_tokens = num_words
hits = 0
misses = 0

# Prepare the embedding matrix: row i receives the GloVe vector of the word
# whose tokenizer index is i. Rows that are never assigned (words missing from
# the embedding index, and any index not present in word_to_int) stay all-zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_to_int.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row is left at zero
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Encode the test questions with the tokenizer fitted on the training questions
test_dataset_frame["tokenQuestion"] = question_tokenizer.texts_to_sequences(test_dataset_frame["question"])
# Length of the longest encoded test question
test_max_question_len = max(map(len, test_dataset_frame["tokenQuestion"]))

## Generators


We build two generators, the first for the training and validation data which yields __[input_sequence, input_image], labels__, the second for the test yielding just __[input_sequence, input_image]__.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image

# Preprocessing function applied to every image array by extract_image
# (set it to None to feed the raw pixel values instead).
prepr_func = tf.keras.applications.vgg16.preprocess_input

# The generator never shuffles: rows are served in dataframe order, so any
# shuffling must be done on the dataframe itself beforehand.
def custom_generator(dataframe, batch_size, target_h, target_w, data_gen_args=None):
  """Endlessly yield `([input_sequences, input_images], labels)` batches.

  Rows are read in order and the position wraps around at the end of the
  dataframe, so every batch holds exactly `batch_size` samples.

  Args:
    dataframe: frame with the "index", "tokenQuestion", "image_id" and
      "answer" columns; rows are looked up by the labels 0..len-1.
    batch_size: number of samples per batch.
    target_h, target_w: size the images are resized to.
    data_gen_args: dictionary meant to hold data augmentation values;
      currently not used by the body.
  """
  padded_questions = pad_questions(dataframe)
  image_generator = ImageDataGenerator()

  batch_start = 0
  while True:
    # Row positions of the next batch_size samples, wrapping around at the end
    positions = [
        (batch_start + offset) % len(dataframe["index"])
        for offset in range(batch_size)
    ]
    batch_start += batch_size

    # Stacking the per-sample items adds the batch dimension
    input_sequences = np.array(
        [extract_sequence(padded_questions, position) for position in positions])
    input_images = np.array(
        [extract_image(dataframe, position, target_h, target_w, image_generator)
         for position in positions])
    labels = np.array([dataframe["answer"][position] for position in positions])

    yield [input_sequences, input_images], labels

def test_generator(dataframe, batch_size, target_h, target_w, data_gen_args=None):
  """Endlessly yield `[input_sequences, input_images]` batches (no labels).

  Rows are read in order and the position wraps around at the end of the
  dataframe, so every batch holds exactly `batch_size` samples.

  Args:
    dataframe: frame with the "index", "tokenQuestion" and "image_id"
      columns; rows are looked up by the labels 0..len-1.
    batch_size: number of samples per batch.
    target_h, target_w: size the images are resized to.
    data_gen_args: currently not used by the body.
  """
  padded_questions = pad_questions(dataframe)
  image_generator = ImageDataGenerator()

  batch_start = 0
  while True:
    # Row positions of the next batch_size samples, wrapping around at the end
    positions = [
        (batch_start + offset) % len(dataframe["index"])
        for offset in range(batch_size)
    ]
    batch_start += batch_size

    # Stacking the per-sample items adds the batch dimension
    input_sequences = np.array(
        [extract_sequence(padded_questions, position) for position in positions])
    input_images = np.array(
        [extract_image(dataframe, position, target_h, target_w, image_generator)
         for position in positions])

    yield [input_sequences, input_images]

def pad_questions(dataframe):
  """Return the "tokenQuestion" sequences of `dataframe` padded to equal length.

  The common length is that of the longest sequence in `dataframe`; the
  padding is appended after each sequence (padding="post").
  """
  token_lists = dataframe["tokenQuestion"]
  longest = max(map(len, token_lists))
  return pad_sequences(token_lists, maxlen=longest, padding="post")

def extract_sequence(questions, index):
  """Return the element of `questions` stored at position `index`."""
  return questions[index]

def extract_image(dataframe, index, target_h, target_w, image_generator):
  """Load, resize and preprocess the image referenced by row `index`.

  Args:
    dataframe: frame whose "image_id" column names the PNG file (without
      extension) stored in VQA_Dataset/Images.
    index: label of the row to read.
    target_h, target_w: output height and width in pixels.
    image_generator: accepted for interface compatibility; not used.

  Returns:
    The RGB image as a NumPy array, passed through `prepr_func` when that
    module-level function is not None.
  """
  image_id = dataframe["image_id"][index]
  image_path = os.path.join("VQA_Dataset", "Images", str(image_id)+".png")
  # The context manager releases the file handle as soon as the pixels have
  # been read; resize/convert return new images independent of the file.
  with Image.open(image_path) as source_image:
    image = source_image.resize((target_w, target_h)).convert('RGB')
  image_arr = np.array(image)
  # Identity comparison is the correct way to test against None
  if prepr_func is not None:
    image_arr = prepr_func(image_arr)
  return image_arr

## Prepare for Training
This notebook can perform K-fold cross-validation when the corresponding parameter is set to True at the beginning. The model uses VGG as the CNN that performs the feature extraction, while a single LSTM block implements the RNN. The image features and the output of the LSTM are finally concatenated and fed into a dense layer. The model is compiled with the __sparse_categorical_crossentropy__ loss and the __Adam__ optimizer before the fitting procedure starts.

In [None]:
from datetime import datetime

# Root folder for the experiment outputs on Google Drive. The Drive folder is
# spelled "MyDrive", the same spelling used for the dataset archive path in
# this notebook (it was "My Drive" here).
exps_dir = '/content/drive/MyDrive/challenge3/'
# exist_ok=True creates the folder only when it is missing, without a
# separate existence check.
os.makedirs(exps_dir, exist_ok=True)

# Timestamp that makes the folder of this run unique
now = datetime.now().strftime('%b%d_%H-%M-%S')

exp_dir = os.path.join(exps_dir, '_' + str(now))
os.makedirs(exp_dir, exist_ok=True)

In [None]:
import math
from keras.applications import VGG16
from sklearn.model_selection import KFold

# K-fold training: one model is built and fitted per fold. With
# enable_kfold == False only the first split is used.
kfold = KFold(n_splits=k, random_state=SEED, shuffle=True)

loop_iteration = 0
loss_arr = []  # best validation loss reached in each fold

for train_index, val_index in kfold.split(dataset_frame, dataset_frame["answer"]):
  # Release the Keras state of the model built for the previous fold, so that
  # memory does not grow with the number of folds.
  tf.keras.backend.clear_session()

  # The generators look rows up by the labels 0..n-1, hence the fresh index.
  # reset_index returns a new frame, which avoids modifying an iloc slice
  # in place.
  train_dataframe = dataset_frame.iloc[train_index].reset_index(drop=True)
  val_dataframe = dataset_frame.iloc[val_index].reset_index(drop=True)

  train_generator = custom_generator(train_dataframe, bs, img_h, img_w)
  # The validation generator must read the held-out split: it was built from
  # train_dataframe, so val_loss (and with it early stopping and the
  # checkpoint) was computed on training data.
  val_generator = custom_generator(val_dataframe, bs, img_h, img_w)
  # NOTE(review): custom_generator pads every split to its own longest
  # question, while encoder_input below is declared with max_question_len;
  # confirm that the two lengths always coincide.

  # CNN for images: frozen VGG16 convolutional base + trainable dense head.
  # tf.keras is used here like everywhere else in the model definition.
  image_input = tf.keras.Input(shape=(img_h, img_w, 3))

  vgg16 = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=(img_h, img_w, 3))
  vgg16.trainable = False
  vgg_out = vgg16(image_input)

  x = tf.keras.layers.Flatten()(vgg_out)
  x = tf.keras.layers.Dense(256, kernel_regularizer=tf.keras.regularizers.l2(1e-5), activation='relu')(x)
  x = tf.keras.layers.Dropout(0.3)(x)

  # LSTM for text: embedding initialised with the GloVe matrix
  encoder_input = tf.keras.Input(shape=[max_question_len])
  encoder_embedding_out = tf.keras.layers.Embedding(
      num_words,
      embedding_dim,
      embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
      input_length=max_question_len,
      mask_zero=True
  )(encoder_input)

  encoder = tf.keras.layers.LSTM(units=512)(encoder_embedding_out)

  # Finally concatenate the two branches and classify
  concatenated = tf.keras.layers.concatenate([x, encoder], axis=-1)
  output = tf.keras.layers.Dense(num_classes, activation='softmax')(concatenated)
  model = tf.keras.Model([encoder_input, image_input], output)

  # Loss
  loss = tf.keras.losses.SparseCategoricalCrossentropy()

  # Learning rate
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

  # Validation metrics
  metrics = ['accuracy']

  # Compile Model
  model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

  callbacks = []

  # Model checkpoint
  # ----------------
  ckpt_dir = os.path.join(exp_dir, 'ckpts')
  os.makedirs(ckpt_dir, exist_ok=True)

  ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp.ckpt'),
                                                      monitor='val_loss',
                                                      mode='min',
                                                      save_weights_only=True,  # False to save the model directly
                                                      save_best_only=True,
                                                      verbose=0)
  callbacks.append(ckpt_callback)

  # Early Stopping
  # --------------
  early_stop = True
  if early_stop:
      es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
      callbacks.append(es_callback)

  # The generators are endless, so the number of batches per epoch is given
  # explicitly (rounded up to cover every sample).
  history = model.fit(x = train_generator,
          epochs=num_epochs,
          steps_per_epoch=math.ceil(len(train_dataframe)/bs),
          validation_data = val_generator,
          validation_steps=math.ceil(len(val_dataframe)/bs),
          callbacks=callbacks)

  # Best validation loss of this fold
  minLoss = min(history.history['val_loss'])
  loss_arr.append(minLoss)

  # print metrics to file
  with open(os.path.join(exp_dir, 'historySplit' + str(loop_iteration) + '.txt'), 'w') as f:
    for key in history.history.keys():
      print(str(key), file=f)
      print(history.history[key], file=f)

  if not enable_kfold:
    break

  loop_iteration += 1

# Average of the best validation losses over the folds that were run
with open(os.path.join(exp_dir, 'cv_results' + '.txt'), 'w') as f2:
  print("avg loss = {}".format(np.mean(loss_arr)), file=f2)

## Prepare for testing
Once the generator is instantiated, we run the prediction and build the CSV file used for the submission.

In [None]:
# Predict the class of every test question with the last model trained.
test_gen = test_generator(test_dataset_frame, bs, img_h, img_w)
num_test_samples = len(test_dataset_frame)
prediction = model.predict(test_gen, steps=math.ceil(num_test_samples/bs))
predicted_class = np.argmax(prediction, axis=-1)
# The generator wraps around to fill the last batch, so the surplus
# predictions at the end are dropped. Slicing by the number of samples is also
# correct when there is no surplus, whereas the former `[:-surplus]` slice
# became `[:-0]` in that case and returned an empty array.
predicted_class = predicted_class[:num_test_samples]

In [None]:
# Map every test question id (the "index" column) to the class predicted for
# the row at the same position.
test_dict = {
  question_id: predicted_class[position]
  for position, question_id in enumerate(test_dataset_frame["index"])
}

### Build the csv file

In [None]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write `results` to a timestamped submission CSV inside `results_dir`.

    The file is named results_<timestamp>.csv and contains the header
    "Id,Category" followed by one "<key>,<value>" line per item of `results`.
    Keys must be strings; values are converted with str().
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    csv_path = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    with open(csv_path, 'w') as f:
        f.write('Id,Category\n')
        f.writelines(
            key + ',' + str(value) + '\n' for key, value in results.items()
        )

# Write the submission file next to the other outputs of this experiment.
create_csv(test_dict, exp_dir)