### Import Libraries

In [2]:
import os
import json
import numpy as np
from tqdm import tqdm
import re
import cv2 as cv
import pickle

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import keras
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import load_model, Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Embedding, \
    LSTM, Bidirectional, Lambda, concatenate, Add, Concatenate
from keras.layers.convolutional import Conv2D, MaxPooling2D, AveragePooling2D
from keras.layers.normalization import BatchNormalization, regularizers
from keras.optimizers import Adam, RMSprop

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Get Data Ready

In [3]:
# Unpickling JSON data

answers_pkl = open("answers.pkl","rb")
answers = pickle.load(answers_pkl)
answers_pkl.close()

questions_pkl = open("questions.pkl","rb")
questions = pickle.load(questions_pkl)
questions_pkl.close()

image_labels_pkl = open("image_labels.pkl","rb")
image_labels = pickle.load(image_labels_pkl)
image_labels_pkl.close()

# Unpickling unique image labels and data

image_labels_unique_pkl = open("image_labels_unique.pkl","rb")
image_labels_unique = pickle.load(image_labels_unique_pkl)
image_labels_unique_pkl.close()

image_data_unique_pkl = open("image_data_unique.pkl","rb")
image_data_unique = pickle.load(image_data_unique_pkl)
image_data_unique_pkl.close()

In [4]:
train_images = []
train_questions = []
train_answers = []

for idx, val in enumerate(tqdm(image_labels)):
    try:
        u = image_labels_unique.index(val + ".png")
        v = image_data_unique[u]
        train_images.append(v)
        train_questions.append(questions[idx])
        train_answers.append(answers[idx])
    except:
        pass

100%|████████████████████████████████████████████████████████████████████████| 135020/135020 [00:18<00:00, 7328.09it/s]


In [5]:
train_images = np.array(train_images)
train_images.shape

(135020, 60, 80, 3)

In [6]:
train_questions = np.array(train_questions)
train_questions.shape

(135020, 40)

In [7]:
unique_answers=list({i for i in train_answers})

In [8]:
group = {}
for i in range(len(unique_answers)):
    group[unique_answers[i]] = i
group

{'1': 0,
 'large': 1,
 'gray': 2,
 '6': 3,
 'cylinder': 4,
 '5': 5,
 'rubber': 6,
 '4': 7,
 '3': 8,
 'False': 9,
 'brown': 10,
 '0': 11,
 'small': 12,
 'yellow': 13,
 'cyan': 14,
 'purple': 15,
 '7': 16,
 'sphere': 17,
 'red': 18,
 'cube': 19,
 'green': 20,
 '8': 21,
 'True': 22,
 'blue': 23,
 'metal': 24,
 '2': 25}

In [9]:
train_answers = [group[i] for i in train_answers]
#train_answers = to_categorical(train_answers)

In [10]:
test_images = train_images[:20000]
test_questions = train_questions[:20000]
test_answers = train_answers[:20000]

In [11]:
train_images = train_images[20000:]
train_questions = train_questions[20000:]
train_answers = train_answers[20000:]

### Model 2

In [11]:
def _cnn(kernel_size=3, stride_size=2):
    def conv(model):
        model = Conv2D(24, (5, 5), strides=(stride_size, stride_size),activation='relu',input_shape=(120, 160, 3), data_format='channels_last')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (5, 5), strides=(stride_size, stride_size),activation='relu')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (kernel_size, kernel_size), strides=(stride_size, stride_size),activation='relu')(model)
        model = BatchNormalization()(model)
        model = Conv2D(24, (3, 3), strides=(1, 1),activation='relu')(model)
        model = BatchNormalization()(model)
        return model
    return conv

In [12]:
def _lstm(kernel_size=3, stride_size=2):
    def f(question_input): 
        model = Embedding(100, 100)(question_input)
        model = LSTM(128,return_sequences = True)(model)
        model = Dropout(0.2)(model)
        model = LSTM(128, return_sequences = False)(model)
        model = Dropout(0.2)(model)
        return model
    return f

In [13]:
def _mlp():
    def f(model):
        model = Dense(256)(model)
        model = Activation('relu')(model)
        model = Dense(256)(model)
        model = Activation('relu')(model)
        model = Dense(256)(model)
        model = Activation('relu')(model)
        model = Dense(256)(model)
        model = Activation('relu')(model)
        return model
    return f

model2_scene = Input((60, 80, 3))
model2_question = Input((40,))
model2_lstm = _lstm()(model2_question)
model2_conv = _cnn()(model2_scene)
model2_conv = Flatten()(model2_conv)
model2_conv = Concatenate()([model2_conv, model2_lstm])

In [14]:
model2_output = _mlp()(model2_conv) 
model2_output = Dense(len(unique_answers), activation='softmax')(model2_output)
model2 = Model(inputs=[model2_scene, model2_question], outputs=model2_output)
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
early_stops = EarlyStopping(patience=2, monitor='val_acc')
checkpointer = ModelCheckpoint(filepath='model2.hdf5', verbose=1, save_best_only=True)

In [16]:
model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 60, 80, 3)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 28, 38, 24)   1824        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 28, 38, 24)   96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 12, 17, 24)   14424       batch_normalization_1[0][0]      
__________________________________________________________________________________________________
batch_norm

In [17]:
model2.fit([train_images, train_questions], to_categorical(train_answers),validation_data=([test_images, test_questions], to_categorical(test_answers)), epochs = 10, shuffle = True, batch_size = 100, callbacks=[checkpointer], verbose=2)

Train on 115020 samples, validate on 20000 samples
Epoch 1/10
 - 164s - loss: 1.1957 - acc: 0.4051 - val_loss: 0.9966 - val_acc: 0.4305

Epoch 00001: val_loss improved from inf to 0.99657, saving model to model2.hdf5
Epoch 2/10
 - 161s - loss: 0.9955 - acc: 0.4414 - val_loss: 0.9854 - val_acc: 0.4511

Epoch 00002: val_loss improved from 0.99657 to 0.98537, saving model to model2.hdf5
Epoch 3/10
 - 162s - loss: 0.9832 - acc: 0.4479 - val_loss: 0.9741 - val_acc: 0.4551

Epoch 00003: val_loss improved from 0.98537 to 0.97412, saving model to model2.hdf5
Epoch 4/10
 - 161s - loss: 0.9604 - acc: 0.4615 - val_loss: 0.9666 - val_acc: 0.4669

Epoch 00004: val_loss improved from 0.97412 to 0.96662, saving model to model2.hdf5
Epoch 5/10
 - 162s - loss: 0.9575 - acc: 0.4645 - val_loss: 0.9611 - val_acc: 0.4654

Epoch 00005: val_loss improved from 0.96662 to 0.96111, saving model to model2.hdf5
Epoch 6/10
 - 167s - loss: 0.9382 - acc: 0.4782 - val_loss: 0.9659 - val_acc: 0.4603

Epoch 00006: val_

<keras.callbacks.History at 0x153271ad630>

### Model 1

In [12]:
question_input = Input((40, ))

q = Embedding(100, 100)(question_input)
q = LSTM(128,return_sequences = True)(q)
q = Dropout(0.2)(q)
q = LSTM(128,return_sequences = True)(q)
q = Dropout(0.2)(q)
q = LSTM(128, return_sequences = False)(q)
q = Dropout(0.2)(q)

In [13]:
image_input = Input(shape=(60, 80, 3))

i = Sequential()
i.add(Conv2D(24, (5, 5)))
i.add(Activation('relu'))
i.add(MaxPooling2D(pool_size=(2, 2)))

i.add(Conv2D(24, (5, 5)))
i.add(Activation('relu'))
i.add(MaxPooling2D(pool_size=(2, 2)))

i.add(Conv2D(24, (5, 5)))
i.add(Activation('relu'))
i.add(MaxPooling2D(pool_size=(2, 2)))

i.add(Flatten())
i.add(Dense(64))
i.add(Activation('relu'))
i.add(Dropout(0.5))
i.add(Dense(1))
i.add(Activation('sigmoid'))

i = i(image_input)

In [14]:
merged = concatenate([i, q])

In [15]:
model1_output = Dense(len(unique_answers), activation='sigmoid')(merged)
model1 = Model(inputs=[image_input, question_input], outputs=model1_output)

In [16]:
model1.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics = ['accuracy'])

In [17]:
early_stops = EarlyStopping(patience=2, monitor='val_acc')
checkpointer = ModelCheckpoint(filepath='model11.hdf5', verbose=1, save_best_only=True)

In [18]:
model1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 40, 100)      10000       input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 40, 128)      117248      embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 40, 128)      0           lstm_1[0][0]                     
__________________________________________________________________________________________________
lstm_2 (LS

In [19]:
model1.fit(x = [train_images, train_questions],y = train_answers, validation_data=([test_images, test_questions], test_answers), epochs = 10, shuffle = True, batch_size = 100, callbacks=[checkpointer], verbose=2)

Train on 115020 samples, validate on 20000 samples
Epoch 1/10
 - 235s - loss: 1.5589 - acc: 0.3059 - val_loss: 1.1815 - val_acc: 0.4147

Epoch 00001: val_loss improved from inf to 1.18152, saving model to model11.hdf5
Epoch 2/10
 - 231s - loss: 1.0331 - acc: 0.4241 - val_loss: 0.9921 - val_acc: 0.4358

Epoch 00002: val_loss improved from 1.18152 to 0.99205, saving model to model11.hdf5
Epoch 3/10
 - 231s - loss: 0.9951 - acc: 0.4336 - val_loss: 0.9894 - val_acc: 0.4442

Epoch 00003: val_loss improved from 0.99205 to 0.98941, saving model to model11.hdf5
Epoch 4/10
 - 230s - loss: 0.9858 - acc: 0.4417 - val_loss: 1.0071 - val_acc: 0.4452

Epoch 00004: val_loss did not improve from 0.98941
Epoch 5/10
 - 230s - loss: 0.9833 - acc: 0.4474 - val_loss: 0.9738 - val_acc: 0.4530

Epoch 00005: val_loss improved from 0.98941 to 0.97380, saving model to model11.hdf5
Epoch 6/10
 - 232s - loss: 0.9756 - acc: 0.4538 - val_loss: 0.9832 - val_acc: 0.4591

Epoch 00006: val_loss did not improve from 0.9

<keras.callbacks.History at 0x1cd82707860>