### Import Libraries

In [1]:
import os
import json
import numpy as np
from tqdm import tqdm
import re
import cv2 as cv
import pickle

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import keras
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import load_model, Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Embedding, \
    LSTM, Bidirectional, Lambda, concatenate, Add, Concatenate
from keras.layers.convolutional import Conv2D, MaxPooling2D, AveragePooling2D
from keras.layers.normalization import BatchNormalization, regularizers
from keras.optimizers import Adam, RMSprop

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Get Data Ready

In [30]:
# Unpickling JSON data

answers_pkl = open("answers.pkl","rb")
answers = pickle.load(answers_pkl)
answers_pkl.close()

questions_pkl = open("questions.pkl","rb")
questions = pickle.load(questions_pkl)
questions_pkl.close()

image_labels_pkl = open("image_labels.pkl","rb")
image_labels = pickle.load(image_labels_pkl)
image_labels_pkl.close()

groups_pkl = open("groups.pkl","rb")
groups = pickle.load(groups_pkl)
groups_pkl.close()


# Unpickling unique image labels and data

image_labels_unique_pkl = open("image_labels_unique.pkl","rb")
image_labels_unique = pickle.load(image_labels_unique_pkl)
image_labels_unique_pkl.close()

image_data_unique_pkl = open("image_data_unique.pkl","rb")
image_data_unique = pickle.load(image_data_unique_pkl)
image_data_unique_pkl.close()

In [31]:
train_images = []
train_questions = []
train_answers = []
train_groups = []

for idx, val in enumerate(tqdm(image_labels)):
    try:
        u = image_labels_unique.index(val + ".png")
        v = image_data_unique[u]
        train_images.append(v)
        train_questions.append(questions[idx])
        train_answers.append(answers[idx])
        train_groups.append(groups[idx])
    except:
        pass

100%|████████████████████████████████████████████████████████████████████████| 135020/135020 [00:21<00:00, 6411.51it/s]


In [32]:
train_images = np.array(train_images)
train_images.shape

(135020, 60, 80, 3)

In [33]:
train_questions = np.array(train_questions)
train_questions.shape

(135020, 40)

In [34]:
unique_answers=list({i for i in train_answers})

In [35]:
group = {'blue': 0,
 '2': 2,
 'cylinder': 0,
 '5': 5,
 'cyan': 1,
 'purple': 2,
 '0': 0,
 'False': 0,
 'large': 0,
 '7': 7,
 'True': 1,
 'small': 1,
 '3': 3,
 'cube': 1,
 '8': 8,
 'sphere': 2,
 'metal': 0,
 'brown': 3,
 '4': 4,
 'red': 4,
 '6': 6,
 'gray': 5,
 '1': 1,
 'green': 6,
 'rubber': 1,
 'yellow': 7}

In [36]:
train_answers = [group[i] for i in train_answers]

In [37]:
test_images = train_images[:10000]
test_questions = train_questions[:10000]
test_answers = train_answers[:10000]
test_groups = train_groups[:10000]

In [38]:
answers[1]

'False'

In [39]:
train_images = train_images[10000:]
train_questions = train_questions[10000:]
train_answers = train_answers[10000:]
train_groups = train_groups[10000:]

### Model to get group from question

In [63]:
X = questions
new_dict = {
    'number': 0,
    'material': 1,
    'color': 2,
    'shape': 3,
    'size': 4,
    'exist': 5
}
y = [new_dict[i] for i in groups]
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(108016, 40) (108016, 6)
(27004, 40) (27004, 6)


In [64]:
embed_dim = 100
lstm_out = 128
batch_size = 32

inputs = Input((40, ))
x = Embedding(100, embed_dim)(inputs)
x = LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)(x)
x = Dense(6,activation='sigmoid')(x)
model = Model(inputs, x)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 40)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 40, 100)           10000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_91 (Dense)             (None, 6)                 774       
Total params: 128,022
Trainable params: 128,022
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['categorical_accuracy'])

In [69]:
early_stops = EarlyStopping(patience=3, monitor='val_acc')
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)

In [70]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size = batch_size, epochs = 3, callbacks=[checkpointer], verbose=2)

Train on 108016 samples, validate on 27004 samples
Epoch 1/3
 - 240s - loss: 0.0815 - categorical_accuracy: 0.9678 - val_loss: 0.0011 - val_categorical_accuracy: 1.0000

Epoch 00001: val_loss improved from inf to 0.00108, saving model to weights.hdf5
Epoch 2/3


KeyboardInterrupt: 

### Get group from question

In [None]:
questions_1 = ['There is a metal thing that is in front of the gray thing right of the big blue shiny sphere; how many rubber cubes are in front of it?']

In [None]:
# tokenizer_1 = Tokenizer(num_words=100, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
#                 lower=True, split=' ')

# tokenizer_1.fit_on_texts(questions_1)

tokenizer_pkl = open("tokenizer.pkl","rb")
tokenizer_1 = pickle.load(tokenizer_pkl)
tokenizer_pkl.close()

questions_tokenized_1 = tokenizer_1.texts_to_sequences(questions_1)
questions_padded_1 = pad_sequences(questions_tokenized_1, maxlen=40)

In [None]:
model = load_model('weights.hdf5')
groups_1 = model.predict(questions_padded_1)

In [None]:
groups_1 = np.argmax(groups_1, axis=1)

In [None]:
groups_1

In [None]:
for i in groups_1:
    for key, value in new_dict.items():
        if value == i:
            print(key)

### Split into groups

In [40]:
answer_types = ['number','material','color','shape','size','exist']

In [41]:
train_questions_grouped = {}
train_answers_grouped = {}
train_images_grouped = {}

test_questions_grouped = {}
test_answers_grouped = {}
test_images_grouped = {}

for group in answer_types:
    train_questions_grouped[group] = []
    train_answers_grouped[group] = []
    train_images_grouped[group] = []
    
    test_questions_grouped[group] = []
    test_answers_grouped[group] = []
    test_images_grouped[group] = []

In [42]:
for idx, value in enumerate(tqdm(train_groups)):
    train_questions_grouped[value].append(train_questions[idx])
    train_answers_grouped[value].append(train_answers[idx])
    train_images_grouped[value].append(train_images[idx])
    
for idx, value in enumerate(tqdm(test_groups)):
    test_questions_grouped[value].append(test_questions[idx])
    test_answers_grouped[value].append(test_answers[idx])
    test_images_grouped[value].append(test_images[idx])

100%|██████████████████████████████████████████████████████████████████████| 125020/125020 [00:00<00:00, 631418.09it/s]
100%|████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 666683.20it/s]


In [43]:
for key, val in train_answers_grouped.items():
    train_answers_grouped[key] = to_categorical(val)

for key, val in test_answers_grouped.items():
    test_answers_grouped[key] = to_categorical(val)

In [44]:
for key, val in train_questions_grouped.items():
    print(key, len(val))
    
for key, val in test_questions_grouped.items():
    print(key, len(val))

number 29457
material 11248
color 11250
shape 11250
size 11247
exist 50568
number 2356
material 900
color 900
shape 900
size 900
exist 4044


### Baseline Model

In [45]:
import keras
from keras.layers.convolutional import Conv2D

def ConvolutionNetworks(kernel_size=3, stride_size=2):
    def conv(model):
        model = Conv2D(24, (5, 5), strides=(stride_size, stride_size),activation='relu',input_shape=(60, 80, 3), data_format='channels_last')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (5, 5), strides=(stride_size, stride_size),activation='relu')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (kernel_size, kernel_size), strides=(stride_size, stride_size),activation='relu')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (3, 3), strides=(1, 1),activation='relu')(model)
        model = BatchNormalization()(model)
        return model
    return conv

In [46]:
import numpy as np
import keras
from keras import backend as K
from keras.layers import Dense, Dropout, Activation, Flatten, Lambda, Concatenate, Add
from keras.layers.normalization import BatchNormalization

def slicer(x_loc, y_loc):
    def func(x):
        return x[:,x_loc,y_loc,:]
    return Lambda(func)

def object_tagging(o, i, d):
    coor = K.variable(value=[float(int(i/d))/d*2-1, float((i%d))/d*2-1])
    coor = K.expand_dims(coor, axis=0)
    batch_size = K.shape(o)[0]
    coor = K.tile(coor, [batch_size, 1])
    coor = Input(tensor=coor)
    o = Concatenate()([coor, o])
    return o
    
def compute_relations(objects, question):
    
    def get_top_dim_1(t):
        return t[:, 0, :, :]

    def get_all_but_top_dim_1(t):
        return t[:, 1:, :, :]

    def get_top_dim_2(t):
        return t[:, 0, :]

    def get_all_but_top_dim2(t):
        return t[:, 1:, :]
    
    slice_top_dim_1 = Lambda(get_top_dim_1)
    slice_all_but_top_dim_1 = Lambda(get_all_but_top_dim_1)
    slice_top_dim_2 = Lambda(get_top_dim_2)
    slice_all_but_top_dim2 = Lambda(get_all_but_top_dim2)
    
    d = K.int_shape(objects)[2]
    features = []
    for i in range(d):
        features1 = slice_top_dim_1(objects)
        objects = slice_all_but_top_dim_1(objects)
        for j in range(d):
            features2 = slice_top_dim_2(features1)
            features1 = slice_all_but_top_dim2(features1)
            features.append(features2)
    
    relations = []
    concat = Concatenate()
    for feature1 in features:
        for feature2 in features:
            relations.append(concat([feature1, feature2, question]))
    

    return relations

In [47]:
from keras.models import Model
from keras.layers import Input
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

def f_theta():
    def f(model):
        model = Dense(256)(model)
        model = Activation('relu')(model)
        model = Dense(256)(model)
        model = Activation('relu')(model)
        model = Dense(256)(model)
        model = Activation('relu')(model)
        model = Dense(256)(model)
        model = Activation('relu')(model)
        return model
    return f

baseline_scene = Input((60, 80, 3))
baseline_question = Input((40,))
baseline_conv = ConvolutionNetworks()(baseline_scene)
baseline_conv = Flatten()(baseline_conv)
baseline_conv = Concatenate()([baseline_conv, baseline_question])


In [48]:
answer_types

['number', 'material', 'color', 'shape', 'size', 'exist']

In [49]:
length_groups = {}
for key, val in train_answers_grouped.items():
    length_groups[key] = len(train_answers_grouped[key][0])
length_groups

{'number': 9, 'material': 2, 'color': 8, 'shape': 3, 'size': 2, 'exist': 2}

### Train Baseline Moodel

In [61]:
models_grouped = {}

for group in tqdm(answer_types):
    baseline_output = f_theta()(baseline_conv) 
    baseline_output = Dense(length_groups[group], activation='softmax')(baseline_output)
    baseline_model = Model(inputs=[baseline_scene, baseline_question], outputs=baseline_output)
    baseline_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    models_grouped[group] = baseline_model

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 10.43it/s]


In [62]:
trained_models_grouped = {}

for group in answer_types:
    print(group)
    early_stops = EarlyStopping(patience=2, monitor='val_acc')
    checkpointer = ModelCheckpoint(filepath=group+'.hdf5', verbose=1, save_best_only=True)
    trained_models_grouped[group] = models_grouped[group].fit([train_images_grouped[group], train_questions_grouped[group]], train_answers_grouped[group], validation_data=([test_images_grouped[group], test_questions_grouped[group]], test_answers_grouped[group]), epochs = 5, shuffle = True, batch_size = 50, callbacks=[checkpointer], verbose=2)

number
Train on 29457 samples, validate on 2356 samples
Epoch 1/5
 - 10s - loss: 1.4605 - acc: 0.3581 - val_loss: 1.4081 - val_acc: 0.3833

Epoch 00001: val_loss improved from inf to 1.40805, saving model to number.hdf5
Epoch 2/5
 - 8s - loss: 1.3737 - acc: 0.3804 - val_loss: 1.3810 - val_acc: 0.3820

Epoch 00002: val_loss improved from 1.40805 to 1.38099, saving model to number.hdf5
Epoch 3/5
 - 8s - loss: 1.3311 - acc: 0.3937 - val_loss: 1.3613 - val_acc: 0.3765

Epoch 00003: val_loss improved from 1.38099 to 1.36131, saving model to number.hdf5
Epoch 4/5
 - 8s - loss: 1.2902 - acc: 0.4076 - val_loss: 1.3730 - val_acc: 0.3782

Epoch 00004: val_loss did not improve from 1.36131
Epoch 5/5
 - 8s - loss: 1.2484 - acc: 0.4235 - val_loss: 1.3778 - val_acc: 0.3973

Epoch 00005: val_loss did not improve from 1.36131
material
Train on 11248 samples, validate on 900 samples
Epoch 1/5
 - 4s - loss: 0.7311 - acc: 0.4941 - val_loss: 0.6985 - val_acc: 0.4822

Epoch 00001: val_loss improved from in