In [1]:
import numpy as np
import re, random

# Read Dataset with Questions and their Labels
Here, the training and testing sets are read into memory and combined. This way we can later choose the training, validation and test sets ourselves by slicing the combined data (in our case 5% for validation and 10% for testing, leaving 85% for training). Also, to deal with overly long samples (outliers), we discard every question longer than 96 characters.

In [2]:
def read_data(paths=('cocoqa/train', 'cocoqa/test')):
    """Read questions and their integer type labels from one or more directories.

    Each directory must contain ``questions.txt`` (one question per line) and
    ``types.txt`` (one integer label per line, aligned with the questions).

    Args:
        paths: iterable of directory paths; the data of all directories is
            concatenated in the given order.

    Returns:
        A ``(questions, labels)`` pair of equally long lists. Every question
        has all characters other than ASCII letters, digits, ``! , . ? $ &``
        and whitespace removed, and is stripped of surrounding whitespace.

    Raises:
        ValueError: if a directory holds a different number of questions and
            labels (the pairs would otherwise be silently misaligned), or if a
            label line is not an integer.
        OSError: if one of the files cannot be opened.
    """
    # Raw string: inside a character class the punctuation needs no escaping,
    # and '\!' etc. in a normal string literal are invalid escape sequences.
    disallowed = re.compile(r'[^A-Za-z0-9!,.?$&\s]')
    questions, labels = [], []
    for path in paths:
        with open(path+'/questions.txt', 'r') as f:
            path_questions = [disallowed.sub('', line).strip() for line in f]
        with open(path+'/types.txt', 'r') as f:
            path_labels = [int(line.strip()) for line in f]
        if len(path_questions) != len(path_labels):
            raise ValueError(
                '{}: found {} questions but {} labels'.format(
                    path, len(path_questions), len(path_labels)))
        questions.extend(path_questions)
        labels.extend(path_labels)
    return questions, labels
            
# Load both default splits as one combined list of questions and one of labels.
X, Y = read_data()
num_questions = len(X)
print('In total', num_questions, 'questions found')

In total 117684 questions found


In [3]:
# shuffle dataset
# Questions and labels are shuffled as pairs so they stay aligned. A dedicated,
# seeded generator keeps the order identical across kernel restarts, so every
# step that depends on this order is reproducible.
SHUFFLE_SEED = 42
Z = list(zip(X, Y))
random.Random(SHUFFLE_SEED).shuffle(Z)
X, Y = zip(*Z)

In [4]:
# Drop every question longer than max_len characters, together with its label.
max_len = 96
kept_pairs = [pair for pair in zip(X, Y) if len(pair[0]) <= max_len]
X, Y = zip(*kept_pairs)
print('Reduced to', len(X), 'questions\n')

Reduced to 116990 questions



In [5]:
# balance dataset
# Every label keeps only as many questions as the rarest label has; for each
# label the first `count` occurrences in the current order are retained.
present_labels = set(Y)
count = min(Y.count(label) for label in present_labels)
label_counts = dict.fromkeys(present_labels, count)  # remaining quota per label
X2, Y2 = [], []
for x, y in zip(X, Y):
    if label_counts[y] <= 0:
        continue  # quota for this label already used up
    X2.append(x)
    Y2.append(y)
    label_counts[y] -= 1
X, Y = X2, Y2

print('Reduced to', len(X), 'questions\n')

Reduced to 28988 questions



In [6]:
# Print the first 20 (label, question) pairs as a quick sanity check.
for idx in range(min(20, len(X), len(Y))):
    x, y = X[idx], Y[idx]
    print(y, ' ', x)

2   what is the color of the piece
0   what did the whole freshly make in a restaurant
2   what is the color of the birds
0   what and two bananas next to each other
0   what filled with meat , vegetables and broth
0   what is laying down in the field
0   what is sitting above a building
0   what is traveling down tracks surrounded by forest
2   what is the color of the dog
0   glazed what coming off a conveyor belt after being reviewed by workers
0   what is the black man taking
0   what is shown on the side walk
0   what looks shiny clean and nice
0   what is being flown in a public park
3   where is the worker checking her phone
2   what is the color of the jacket
2   what is the color of the appliances
0   what were there sitting in boxes on top of our stove
0   wooden what on bricked area near shrubs
0   what is in the sky as several are parked on a runway


# Make Characters Categorical
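We cannot feed a string directly into a neural network. Therefore, each question is converted to a one-hot matrix of shape (max_len, vocab_size): one row per character position (questions shorter than max_len are padded with all-zero rows) and one column per character in the vocabulary. To this end, it is necessary to determine the character vocabulary and a mapping from character to integer index.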

In [7]:
# Give every distinct character an integer index in order of first appearance,
# then derive the inverse (index -> character) mapping from it.
char2int = {}
for question in X:
    for ch in question:
        # len() is evaluated before a new key is inserted, so indices are 0, 1, 2, ...
        char2int.setdefault(ch, len(char2int))
int2char = {index: ch for ch, index in char2int.items()}
print(char2int, '\n')

# Names of the four label ids.
int2labels = {0:'object', 1:'number', 2:'color', 3:'location'}
print(int2labels)

{'j': 26, 'e': 7, 'c': 8, 'h': 1, 'k': 17, '9': 34, '!': 41, '4': 35, 'i': 5, '2': 32, 'r': 11, '7': 38, '$': 39, 'q': 27, 'x': 21, '8': 40, 'g': 24, ' ': 4, 'a': 2, '.': 29, 'u': 19, '1': 33, 's': 6, 'z': 25, 'b': 20, 'l': 10, '0': 31, 'm': 16, 'y': 15, 'o': 9, 'p': 13, 'v': 23, 'w': 0, 'd': 14, '6': 36, 't': 3, ',': 22, 'n': 18, '&': 37, '5': 30, 'f': 12, '3': 28} 

{0: 'object', 1: 'number', 2: 'color', 3: 'location'}


In [8]:
def to_categorical(questions, labels):
    """One-hot encode questions (per character) and labels (per class).

    Relies on the notebook globals ``max_len``, ``char2int`` and ``int2labels``.

    Args:
        questions: sequence of strings, each at most ``max_len`` characters
            long and made only of characters present in ``char2int``.
        labels: sequence of integer class ids that are keys of ``int2labels``.

    Returns:
        ``(X, Y)`` uint8 arrays. ``X`` has shape
        ``(len(questions), max_len, len(char2int))`` with a 1 at
        ``[sample, position, char_index]``; positions past the end of a
        question stay all-zero. ``Y`` has shape
        ``(len(questions), len(int2labels))`` with a 1 at ``[sample, label]``.

    Raises:
        KeyError: if a question contains a character missing from ``char2int``.
        IndexError: if a question is longer than ``max_len``.
    """
    num_samples = len(questions)
    X = np.zeros((num_samples, max_len, len(char2int)), dtype=np.uint8)
    Y = np.zeros((num_samples, len(int2labels)), dtype=np.uint8)
    for row, question in enumerate(questions):
        for pos, ch in enumerate(question):
            X[row, pos, char2int[ch]] = 1
    for row, label in enumerate(labels):
        Y[row, label] = 1
    return X, Y

# One-hot encode the whole dataset.
# NOTE: this rebinds X and Y from sequences of strings / ints to uint8 arrays,
# so the cell cannot be re-run on its own -- re-run from the data-loading cell.
X, Y = to_categorical(X, Y)

In [9]:
# Split the dataset into contiguous train / validation / test parts.
# Explicit boundaries are used instead of negative offsets because X[:-0] is
# empty and X[-0:] is the whole array, which would put everything in the test
# set whenever the dataset is so small that k == 0.
num_samples = X.shape[0]
k = int(num_samples*0.05)  # size of one 5% slice
train_end = num_samples - 3*k
valid_end = num_samples - 2*k
trainX, trainY = X[:train_end], Y[:train_end]
validX, validY = X[train_end:valid_end], Y[train_end:valid_end]
testX, testY = X[valid_end:], Y[valid_end:]

print('Training shapes:', trainX.shape, trainY.shape) # 85%
print('Validation shapes:', validX.shape, validY.shape) # 5%
print('Testing shapes:', testX.shape, testY.shape) # 10%

Training shapes: (24641, 96, 42) (24641, 4)
Validation shapes: (1449, 96, 42) (1449, 4)
Testing shapes: (2898, 96, 42) (2898, 4)


# Train Model

In [10]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [11]:
# simple model without hidden layers!
# The flattened one-hot question is fed straight into a single softmax layer.
# The output width is taken from the label matrix instead of being hard-coded,
# so it always matches the number of one-hot label columns.
num_classes = trainY.shape[1]
model = Sequential()
model.add(Flatten(input_shape=trainX.shape[1:]))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=1e-4), metrics=['accuracy'])
model.fit(trainX, trainY, validation_data=(validX, validY), epochs=5, batch_size=8)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4032)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 16132     
Total params: 16,132
Trainable params: 16,132
Non-trainable params: 0
_________________________________________________________________
Train on 24641 samples, validate on 1449 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0ced883390>

In [12]:
# Count the test samples whose most probable predicted class equals the class
# marked in the one-hot label, then report the fraction of correct ones.
test_predictions = model.predict(testX)
correct = sum(
    np.argmax(probabilities) == np.argmax(one_hot)
    for probabilities, one_hot in zip(test_predictions, testY)
)
print('Accuracy = {}'.format(correct / testY.shape[0]))

Accuracy = 0.9875776397515528


# Manual Testing

In [18]:
def predict_answer_type(query):
    """Print the answer type the trained model predicts for a free-text query.

    Relies on the notebook globals ``max_len``, ``char2int``, ``int2labels``
    and ``model``.

    The query is one-hot encoded into a ``(1, max_len, len(char2int))`` array.
    Unlike the training data, a hand-typed query is not guaranteed to fit that
    encoding, so:

    * only the first ``max_len`` characters are used (longer queries used to
      raise ``IndexError``);
    * characters that are not in ``char2int`` leave their position all-zero
      (they used to raise ``KeyError``).

    Args:
        query: the question text.

    Returns:
        None. Prints ``<query> -> <label name>``.
    """
    inputs = np.zeros((1, max_len, len(char2int)), dtype=np.uint8)
    for i, ch in enumerate(query[:max_len]):
        k = char2int.get(ch)
        if k is not None:
            inputs[0, i, k] = 1

    # argmax over the single row of class probabilities gives the label id
    pred = int(np.argmax(model.predict(inputs)))
    print(query, '->', int2labels[pred])
    
# Try a handful of hand-written questions.
example_queries = (
    'where is amsterdam',
    'how many dogs are there',
    'what is a dog',
    'what is the color of a dog',
)
for example_query in example_queries:
    predict_answer_type(example_query)

where is amsterdam -> location
how many dogs are there -> number
what is a dog -> object
what is the color of a dog -> color
