In [88]:
import argparse
import csv
import numpy as np
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import recurrent
from keras.models import Graph, Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import cPickle as pickle
import theano.tensor as T
from theano import function

In [89]:
def mean(x, axis=None, keepdims=False):
    """Return the Theano mean of ``x``, forwarding ``axis`` and ``keepdims`` as given."""
    averaged = T.mean(x, axis=axis, keepdims=keepdims)
    return averaged

def l2_normalize(x, axis):
    """Divide ``x`` by its L2 norm taken along ``axis`` (symbolic Theano version).

    The norm keeps the reduced axis (``keepdims=True``) so that it broadcasts
    against ``x``. No guard is applied for a zero norm.
    """
    sum_of_squares = T.sum(T.square(x), axis=axis, keepdims=True)
    return x / T.sqrt(sum_of_squares)

def cosine_similarity(y_true, y_pred):
    """Row-wise cosine similarity between two rank-2 Theano tensors.

    Both arguments are asserted to have ``ndim == 2``; each is L2-normalized
    along axis 1 and the element-wise product is summed over axis 1, giving
    one similarity value per row.
    """
    for tensor in (y_true, y_pred):
        assert tensor.ndim == 2
    unit_true = l2_normalize(y_true, axis=1)
    unit_pred = l2_normalize(y_pred, axis=1)
    return T.sum(unit_true * unit_pred, axis=1, keepdims=False)

def cosine_ranking_loss(y_true, y_pred):
    """Hinge-style ranking loss over row triples of ``y_pred``.

    Rows of ``y_pred`` are read with stride 3: rows 0, 3, 6, ... are the
    question vectors, rows 1, 4, 7, ... the correct-answer vectors and rows
    2, 5, 8, ... the incorrect-answer vectors. The per-triple value is
    ``max(0, margin - cos(q, correct) + cos(q, incorrect))`` where ``margin``
    is read from the module-level ``args.margin`` when the loss is built.
    """
    question, good_answer, bad_answer = y_pred[0::3], y_pred[1::3], y_pred[2::3]

    hinge = T.maximum(0., args.margin - cosine_similarity(question, good_answer) + cosine_similarity(question, bad_answer))
    # y_true is multiplied by zero, so it contributes nothing numerically;
    # presumably it is referenced only so that it stays part of the graph -- TODO confirm.
    return mean(hinge - y_true[0]*0, axis=-1)

In [90]:
def np_l2_normalize(x, axis):
    """Divide ``x`` by its L2 norm taken along ``axis`` (NumPy version).

    Args:
        x: NumPy array to normalize.
        axis: axis along which the norm is computed; it is kept in the norm
            (``keepdims=True``) so that the division broadcasts.

    Returns:
        Array of the same shape as ``x``. Slices whose norm is exactly zero
        are returned as zeros instead of NaN.
    """
    norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True))
    # An all-zero slice would give 0/0 = NaN, and a NaN similarity later wins
    # np.argmax. Dividing by 1 keeps such slices at zero and leaves every
    # non-zero slice unchanged.
    safe_norm = np.where(norm == 0, 1.0, norm)
    return x / safe_norm

def np_cosine_similarity(y_true, y_pred):
    """Row-wise cosine similarity between two 2-D NumPy arrays.

    Both inputs are asserted to be 2-D. Each is L2-normalized along axis 1
    and the element-wise product is summed over axis 1, so the result has one
    similarity value per row.
    """
    for matrix in (y_true, y_pred):
        assert matrix.ndim == 2
    unit_true = np_l2_normalize(y_true, axis=1)
    unit_pred = np_l2_normalize(y_pred, axis=1)
    row_products = unit_true * unit_pred
    return np.sum(row_products, axis=1, keepdims=False)

In [194]:
# Command-line style configuration for the notebook. The parser mirrors a
# script interface, but the argument list is hard-coded in the final
# parse_args() call below instead of being read from sys.argv.
parser = argparse.ArgumentParser()
# Positional arguments: weights file handed to model.load_weights() and the
# tab-separated question file read by the data-loading cell.
parser.add_argument("model_path")
parser.add_argument("csv_file")
# NOTE(review): --write_predictions is declared but not read by any cell visible here.
parser.add_argument("--write_predictions", default="predictions.csv")
# Pickled tokenizer used to convert texts to integer sequences.
parser.add_argument("--tokenizer", default="model/tokenizer.pkl")
# Network architecture options consumed by the model-construction cell.
# Presumably they must match the settings the weights were trained with -- TODO confirm.
parser.add_argument("--rnn", choices=["LSTM", "GRU"], default="GRU")
parser.add_argument("--embed_size", type=int, default=300)
parser.add_argument("--hidden_size", type=int, default=1024)
parser.add_argument("--layers", type=int, default=1)
parser.add_argument("--dropout", type=float, default=0)
parser.add_argument("--bidirectional", action='store_true', default=False)
parser.add_argument("--batch_size", type=int, default=300)
# Optional overrides: when omitted, maxlen is derived from the longest
# sequence and vocab_size from the tokenizer.
parser.add_argument("--maxlen", type=int)
parser.add_argument("--vocab_size", type=int)
parser.add_argument("--optimizer", choices=['adam', 'rmsprop'], default='adam')
parser.add_argument("--verbose", type=int, choices=[0, 1, 2], default=1)
# Margin read by cosine_ranking_loss via the global `args`.
parser.add_argument("--margin", type=float, default=0.01)
# Alternative configurations kept for reference; only the last, uncommented
# parse_args() call takes effect.
#args = parser.parse_args("model/simple_1400000.pkl data/training_set.tsv --vocab_size 107149".split())
#args = parser.parse_args("model/simple_reduced.hdf5 data/training_set.tsv --tokenizer model/tokenizer_reduced.pkl".split())
#args = parser.parse_args("model/simple_bidirectional.hdf5 data/training_set.tsv --vocab_size 107149 --bidirectional".split())
args = parser.parse_args("model/simple_margin_0.1.pkl data/training_set.tsv --vocab_size 107149".split())


In [195]:
print "Loading data..."
ids = []
questions = []
corrects = []
answersA = []
answersB = []
answersC = []
answersD = []
with open(args.csv_file) as f:
  reader = csv.reader(f, delimiter="\t", strict=True, quoting=csv.QUOTE_NONE)
  line = next(reader)  # ignore header
  is_train_set = (len(line) == 7)
  for line in reader:
    ids.append(line[0])
    questions.append(line[1])
    if is_train_set:
      corrects.append(line[2])
      answersA.append(line[3])
      answersB.append(line[4])
      answersC.append(line[5])
      answersD.append(line[6])
    else:
      answersA.append(line[2])
      answersB.append(line[3])
      answersC.append(line[4])
      answersD.append(line[5])
print "Questions: ", len(questions)
assert len(questions) == len(answersA) == len(answersB) == len(answersC) == len(answersD)
assert not is_train_set or len(corrects) == len(questions)

Loading data...
Questions:  2500


In [196]:
print "Sample question and answers:"
for i in xrange(3):
  print questions[i], "A:", answersA[i], "B:", answersB[i], "C:", answersC[i], "D:", answersD[i], "Correct: ", corrects[i] if is_train_set else '?'

Sample question and answers:
When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions? A: at the tissue level B: at the organ level C: at the system level D: at the cellular level Correct:  C
Which example describes a learned behavior in a dog? A: smelling the air for odors B: barking when disturbed C: sitting on command D: digging in soil Correct:  C
When two nuclei are combined into one nucleus, there is a slight change in mass and the release of a large amount of energy. What is this process called? A: conversion B: reaction C: fission D: fusion Correct:  D


In [197]:
texts = questions + answersA + answersB + answersC + answersD
print "Texts size:", len(texts)

Texts size: 12500


In [198]:
# Load the fitted tokenizer. The file is opened in a `with` block so the
# handle is closed deterministically instead of being leaked.
# SECURITY: unpickling executes arbitrary code from the file; only load
# tokenizer pickles from a trusted source.
with open(args.tokenizer, "rb") as tokenizer_file:
  tokenizer = pickle.load(tokenizer_file)
# Convert every text into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(texts)

In [199]:
if args.maxlen:
  maxlen = args.maxlen
else:
  maxlen = max([len(s) for s in sequences])
print "Sequences maxlen:", maxlen

Sequences maxlen: 179


In [200]:
# Turn the variable-length index lists into a single array limited to
# `maxlen` columns. Note that `texts` is rebound here from the list of raw
# strings to that array (a later cell prints its shape as (num_texts, maxlen)).
texts = pad_sequences(sequences, maxlen=maxlen) 

In [201]:
vocab_size = tokenizer.nb_words if tokenizer.nb_words else len(tokenizer.word_index)+1
if args.vocab_size:
  print "Overriding original vocabulary size", vocab_size
  vocab_size = args.vocab_size
print "Vocabulary size:", vocab_size, "Texts: ", texts.shape

Overriding original vocabulary size 107149
Vocabulary size: 107149 Texts:  (12500, 179)


In [202]:
# Map the --rnn choice to the corresponding Keras recurrent layer class.
rnn_classes = {'GRU': recurrent.GRU, 'LSTM': recurrent.LSTM}
RNN = rnn_classes.get(args.rnn)
assert RNN is not None, "Invalid RNN"

In [203]:
# Build the text-encoder network. Two variants are supported:
#   * --bidirectional: a Keras Graph with a forward and a backward RNN per layer;
#   * default: a plain Sequential stack of Embedding -> RNN layer(s).
# In both variants only the last RNN layer returns a single vector
# (return_sequences=False); earlier layers return full sequences.
print "Creating model..."

if args.bidirectional:
  model = Graph()
  # The input shape is fixed to (batch_size, sequence length of `texts`).
  # NOTE(review): dtype="uint" is passed through as-is -- confirm it is a dtype
  # name this Keras version accepts.
  model.add_input(name="input", batch_input_shape=(args.batch_size,)+texts.shape[1:], dtype="uint")
  model.add_node(Embedding(vocab_size, args.embed_size, mask_zero=True), name="embed", input='embed' if False else 'input')
  for i in xrange(args.layers):
    # Wiring of layer i+1 (names are 1-based):
    #   * first layer: single input 'embed';
    #   * later layers with dropout: single input 'dropout<i>' of the previous layer;
    #   * later layers without dropout: both 'forward<i>' and 'backward<i>' via `inputs`.
    model.add_node(RNN(args.hidden_size, return_sequences=False if i + 1 == args.layers else True), 
        name='forward'+str(i+1), 
        input='embed' if i == 0 else 'dropout'+str(i) if args.dropout > 0 else None, 
        inputs=['forward'+str(i), 'backward'+str(i)] if i > 0 and args.dropout == 0 else [])
    # Same wiring as the forward node, but the sequence is processed in reverse.
    model.add_node(RNN(args.hidden_size, return_sequences=False if i + 1 == args.layers else True, go_backwards=True), 
        name='backward'+str(i+1), 
        input='embed' if i == 0 else 'dropout'+str(i) if args.dropout > 0 else None, 
        inputs=['forward'+str(i), 'backward'+str(i)] if i > 0 and args.dropout == 0 else [])
    if args.dropout > 0:
      # The dropout node takes both directions of this layer as its inputs.
      model.add_node(Dropout(args.dropout), name='dropout'+str(i+1), inputs=['forward'+str(i+1), 'backward'+str(i+1)])
  # The output is the last dropout node when dropout is enabled, otherwise
  # both directions of the last RNN layer.
  model.add_output(name='output',
      input='dropout'+str(args.layers) if args.dropout > 0 else None,
      inputs=['forward'+str(args.layers), 'backward'+str(args.layers)] if args.dropout == 0 else [])
else:
  model = Sequential()
  # mask_zero=True is set on the embedding; presumably index 0 is the padding value -- TODO confirm.
  model.add(Embedding(vocab_size, args.embed_size, mask_zero=True))
  for i in xrange(args.layers):
    model.add(RNN(args.hidden_size, return_sequences=False if i + 1 == args.layers else True))
    if args.dropout > 0:
      model.add(Dropout(args.dropout))

model.summary()

Creating model...
--------------------------------------------------------------------------------
Initial input shape: (None, 107149)
--------------------------------------------------------------------------------
Layer (name)                  Output Shape                  Param #             
--------------------------------------------------------------------------------
Embedding (embedding)         (None, None, 300)             32144700            
GRU (gru)                     (None, 1024)                  4070400             
--------------------------------------------------------------------------------
Total params: 36215100
--------------------------------------------------------------------------------


In [204]:
print "Loading weights from %s" % args.model_path
model.load_weights(args.model_path)

Loading weights from model/simple_margin_0.1.pkl


In [205]:
print "Compiling model..."
if args.bidirectional:
  model.compile(optimizer=args.optimizer, loss={'output': cosine_ranking_loss})
else:
  model.compile(optimizer=args.optimizer, loss=cosine_ranking_loss)

Compiling model...


In [187]:
if args.bidirectional:
  pred = model.predict({'input': texts}, batch_size=args.batch_size, verbose=args.verbose)
  pred = pred['output']
else:
  pred = model.predict(texts, batch_size=args.batch_size, verbose=args.verbose)

print "Predictions: ", pred.shape

Predictions:  (12500, 1024)


In [188]:
print pred[0,:10]
print pred[1,:10]
print pred[2,:10]
print pred[3,:10]
print pred[4,:10]

[  1.25983004e-02   9.99998748e-01   7.18231720e-04   9.99998808e-01
   9.99973297e-01   9.54918284e-03   6.92471396e-03   5.97136654e-02
   9.53765333e-01   9.71550703e-01]
[  1.03059423e-03   9.99998748e-01   1.34690187e-03   7.03501690e-04
   9.98515308e-01   3.50946118e-03   2.06653844e-03   4.07464649e-05
   1.53134271e-04   1.77163437e-01]
[  7.73918640e-04   9.99998748e-01   2.68034782e-04   1.00000000e+00
   9.99997973e-01   3.82178766e-03   7.41665135e-04   9.82717514e-01
   4.83897060e-01   4.94618714e-02]
[  7.77362438e-04   9.99998748e-01   2.30792630e-03   2.35250286e-06
   9.99507427e-01   3.41042131e-01   7.54320179e-04   5.92470288e-01
   1.68929546e-04   9.72303152e-01]
[ 0.01265316  0.99999875  0.0024676   1.          0.99997342  0.00508639
  0.01209639  0.65023321  0.95768094  0.85449421]


In [189]:
questions = pred[0::5]
answersA = pred[1::5]
answersB = pred[2::5]
answersC = pred[3::5]
answersD = pred[4::5]
print questions.shape, answersA.shape, answersB.shape, answersC.shape, answersD.shape

(2500, 1024) (2500, 1024) (2500, 1024) (2500, 1024) (2500, 1024)


In [190]:
sims = np.array([
  np_cosine_similarity(questions, answersA),
  np_cosine_similarity(questions, answersB),
  np_cosine_similarity(questions, answersC),
  np_cosine_similarity(questions, answersD)
])
print sims.shape
print sims[:,0]
print sims[:,1]
print sims[:,2]

(4, 2500)
[ 0.86860622  0.91262216  0.85996873  0.93499566]
[ 0.91710509  0.90115706  0.90401681  0.89186482]
[ 0.91669979  0.89054291  0.93537557  0.92411741]


In [191]:
preds = np.argmax(sims, axis=0)
print preds.shape
print preds[:3]

(2500,)
[3 0 2]


In [192]:
# Translate option indices 0..3 into the answer letters 'A'..'D'.
option_letters = "ABCD"
preds = [option_letters[index] for index in preds]
preds[:3]

['D', 'A', 'C']

In [193]:
if is_train_set:
  correct = sum([corrects[i] == p for i,p in enumerate(preds)])
  print "Correct: %d Total: %d Accuracy: %f" % (correct, len(preds), float(correct) / len(preds))

Correct: 631 Total: 2500 Accuracy: 0.252400
