In [1]:
import numpy as np
import re, random

# Read Dataset with Questions and their Labels
Here, the training and testing sets are read into memory and combined. This makes it possible to choose the training, validation and test sets manually by selecting subsets of the data (in our case 5% for validation and 10% for testing, but we will do this later). Also, to deal with overly long samples (outliers), we discard every question that is longer than 96 characters.

In [2]:
def read_data(paths=('cocoqa/train', 'cocoqa/test')):
    """Load questions and their integer type labels from dataset folders.

    Each folder in ``paths`` must contain ``questions.txt`` (one question per
    line) and ``types.txt`` (one integer label per line). The folders are read
    in the given order and their contents concatenated.

    Every question line is stripped of all characters other than ASCII
    letters, digits, ``! , . ? $ & '`` and whitespace, then trimmed of
    leading/trailing whitespace.

    Args:
        paths: iterable of folder paths to read.

    Returns:
        A ``(questions, labels)`` tuple: a list of cleaned question strings
        and a list of ints.

    Raises:
        OSError: if one of the expected files cannot be opened.
        ValueError: if a line of ``types.txt`` is not an integer.
    """
    # Raw string: matches exactly the same characters as the previous
    # pattern, but without invalid escape sequences in the Python literal.
    # Compiled once instead of being looked up for every line.
    disallowed = re.compile(r"[^A-Za-z0-9!,.?$&\s']")
    questions, labels = [], []
    for path in paths:
        with open(path+'/questions.txt', 'r') as f:
            for line in f:
                questions.append(disallowed.sub('', line).strip())
        with open(path+'/types.txt', 'r') as f:
            for line in f:
                labels.append(int(line.strip()))
    return questions, labels
            
# Load both default folders into one pool of questions (X) and labels (Y);
# the train/validation/test split is made later in the notebook.
X, Y = read_data()
print('In total', len(X), 'questions found')

In total 117684 questions found


In [3]:
# Shuffle the dataset. Questions and labels are zipped first so that every
# question keeps its own label while the order changes.
# A dedicated, seeded generator makes the resulting order reproducible across
# kernel restarts without touching the global `random` state. Later cells
# select and split samples by position, so they depend on this order.
SEED = 42
Z = list(zip(X, Y))
random.Random(SEED).shuffle(Z)
X, Y = zip(*Z)

In [4]:
# Drop every question whose cleaned text is longer than max_len characters,
# keeping each surviving question paired with its label.
max_len = 96
short_pairs = [pair for pair in zip(X, Y) if len(pair[0]) <= max_len]
X, Y = zip(*short_pairs)
print('Reduced to', len(X), 'questions\n')

Reduced to 116986 questions



In [5]:
# Balance the classes: keep only the first `count` questions of each label
# (in current order), where `count` is the size of the rarest class.
labels_present = set(Y)
count = min([Y.count(label) for label in labels_present])
label_counts = dict.fromkeys(labels_present, count)  # remaining quota per label
X2, Y2 = [], []
for x, y in zip(X, Y):
    if label_counts[y] <= 0:
        continue  # quota for this label is already used up
    label_counts[y] -= 1
    X2.append(x)
    Y2.append(y)
X, Y = X2, Y2

print('Reduced to', len(X), 'questions\n')

Reduced to 28988 questions



In [6]:
# Eyeball the first 20 samples, printed as "<label>   <question>".
for x, y in zip(X[:20], Y[:20]):
    print(y, ' ', x)

0   what hangs over the large couch covered with pillows
0   what is riding the skate board
3   where does the bullet train slow down
0   what are sitting on the grass
1   how many giraffes in confinement stand under a tree in their enclosure
0   what is shown with the small salad
1   how many giraffe standing in the grass near each other
2   what is the color of the donuts
0   what is snuggling near his mother
0   what located within the train station with people
0   what is wearing the pink outfit is laying on the bed next to a book
2   what is the color of the couch
2   what is the color of the top
0   what is sitting in a living room next to furniture
0   what is the man laughing as an elephant removes
0   what is standing in the snow
0   what are sitting on the ground
3   where is the woman drinking a beer
0   what are on the wall in a room
3   where are the man and woman preparing food


# Make Characters Categorical
We cannot feed a string directly into a neural network. Therefore, each question is converted to a one-hot matrix of shape (max_len, vocab_size); questions shorter than max_len are padded with all-zero rows. To this end, it is necessary to determine the character vocabulary and a mapping from character to integer index.

In [7]:
# Give every distinct character an integer id in order of first appearance,
# and maintain the reverse lookup alongside it.
char2int, int2char = {}, {}
for question in X:
    for ch in question:
        if ch in char2int:
            continue  # already has an id
        idx = len(char2int)
        char2int[ch], int2char[idx] = idx, ch
print(char2int, '\n')

# Human-readable names for the integer question-type labels.
int2labels = {0:'object', 1:'number', 2:'color', 3:'location'}
print(int2labels)

{'2': 33, '3': 36, 'j': 28, 'z': 24, 'm': 20, 'g': 6, '1': 35, ',': 27, '&': 37, '7': 42, '.': 25, 'c': 13, '9': 30, 'r': 11, "'": 26, 'u': 14, 'i': 16, '6': 38, 'h': 1, 'q': 29, 'x': 23, 'e': 10, 'd': 15, 'n': 5, '5': 31, 't': 3, '$': 40, '8': 39, '0': 34, 'p': 17, 'y': 21, 'f': 22, 'k': 18, 'w': 0, ' ': 4, '!': 41, 'o': 8, 'a': 2, 's': 7, 'l': 12, 'b': 19, '4': 32, 'v': 9} 

{0: 'object', 1: 'number', 2: 'color', 3: 'location'}


In [8]:
def to_categorical(questions, labels):
    """One-hot encode questions character by character, and labels by class.

    Uses the module-level ``max_len``, ``char2int`` and ``int2labels``.

    Args:
        questions: sequence of strings; every character must be a key of
            ``char2int`` and no string may be longer than ``max_len``.
        labels: sequence of integer class indices.

    Returns:
        ``(X, Y)`` as uint8 arrays. ``X`` has shape
        ``(len(questions), max_len, len(char2int))``; positions past the end
        of a question stay all-zero. ``Y`` has shape
        ``(len(questions), len(int2labels))``.
    """
    n_samples = len(questions)
    X = np.zeros((n_samples, max_len, len(char2int)), dtype=np.uint8)
    Y = np.zeros((n_samples, len(int2labels)), dtype=np.uint8)
    for row, question in enumerate(questions):
        for pos, ch in enumerate(question):
            X[row, pos, char2int[ch]] = 1
    for row, label in enumerate(labels):
        Y[row, label] = 1
    return X, Y

# Rebinds X and Y from question strings / integer labels to one-hot arrays.
# Not idempotent: re-running this line alone fails because X no longer holds
# strings; re-run the earlier cells first.
X, Y = to_categorical(X, Y)

In [9]:
# Hold out the tail of the data: the last 2*k samples for testing, the k
# samples before those for validation, and everything earlier for training,
# where k is 5% of the dataset (rounded down).
k = int(X.shape[0]*0.05)
valid_start, test_start = -3*k, -2*k
trainX, validX, testX = X[:valid_start], X[valid_start:test_start], X[test_start:]
trainY, validY, testY = Y[:valid_start], Y[valid_start:test_start], Y[test_start:]

print('Training shapes:', trainX.shape, trainY.shape) # 85%
print('Validation shapes:', validX.shape, validY.shape) # 5%
print('Testing shapes:', testX.shape, testY.shape) # 10%

Training shapes: (24641, 96, 43) (24641, 4)
Validation shapes: (1449, 96, 43) (1449, 4)
Testing shapes: (2898, 96, 43) (2898, 4)


# Train Model

In [10]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [11]:
# simple model without hidden layers!
model = Sequential()
model.add(Flatten(input_shape=trainX.shape[1:]))
model.add(Dense(4, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=1e-5), metrics=['accuracy'])
model.fit(trainX, trainY, validation_data=(validX, validY), epochs=8, batch_size=8)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4128)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 16516     
Total params: 16,516
Trainable params: 16,516
Non-trainable params: 0
_________________________________________________________________
Train on 24641 samples, validate on 1449 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7ff0d75f6358>

In [14]:
# Test accuracy: the fraction of test samples whose highest-scoring predicted
# class equals the class marked in the one-hot label. Computed with vectorized
# argmax over the class axis instead of a Python loop over single samples.
pred_classes = np.argmax(model.predict(testX), axis=1)
true_classes = np.argmax(testY, axis=1)
correct = np.sum(pred_classes == true_classes)
print('Accuracy = {}'.format(correct / testY.shape[0]))

Accuracy = 0.989648033126294


# Manual Testing

In [15]:
def predict_answer_type(query):
    """Print the predicted answer type for a single question string.

    The query is one-hot encoded into a ``(1, max_len, len(char2int))`` uint8
    array using the module-level ``char2int`` mapping, passed through the
    module-level ``model``, and the highest-scoring class is printed as
    ``<query> -> <label name>`` using ``int2labels``.

    Free-form input is tolerated: characters beyond ``max_len`` are ignored,
    and characters that are not in ``char2int`` leave their position all-zero
    instead of raising ``IndexError`` / ``KeyError``.

    Args:
        query: the question text to classify.

    Returns:
        None; the result is printed.
    """
    inputs = np.zeros((1, max_len, len(char2int)), dtype=np.uint8)
    # Truncate so no index can run past the fixed input length.
    for i, ch in enumerate(query[:max_len]):
        k = char2int.get(ch)
        if k is not None:  # unknown characters stay encoded as all zeros
            inputs[0, i, k] = 1

    pred = int(np.argmax(model.predict(inputs)))
    print(query, '->', int2labels[pred])
    
# Try the classifier on a few hand-written questions.
for sample_query in ('where is amsterdam',
                     'how many dogs are there',
                     'what is a dog',
                     'what is the color'):
    predict_answer_type(sample_query)

where is amsterdam -> location
how many dogs are there -> number
what is a dog -> object
what is the color -> color
