In [17]:
import argparse
import cPickle as pickle
import numpy as np
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import recurrent
from keras.models import Graph, Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import theano.tensor as T
from theano import function
from itertools import islice
from keras.preprocessing.sequence import pad_sequences

In [2]:
def mean(x, axis=None, keepdims=False):
    """Average the symbolic tensor ``x`` via ``theano.tensor.mean``.

    ``axis`` and ``keepdims`` are forwarded unchanged to the Theano call.
    """
    averaged = T.mean(x, axis=axis, keepdims=keepdims)
    return averaged

def l2_normalize(x, axis, epsilon=1e-12):
    """Scale ``x`` to unit L2 norm along ``axis``.

    Args:
        x: symbolic Theano tensor.
        axis: axis along which the norm is computed (kept for broadcasting).
        epsilon: lower bound applied to the squared norm before the square
            root, so an all-zero slice yields zeros instead of NaN/inf.

    Returns:
        A symbolic tensor with the same shape as ``x``.
    """
    squared_norm = T.sum(T.square(x), axis=axis, keepdims=True)
    # Clamp *before* sqrt: both the division and the sqrt gradient blow up
    # at exactly zero, which would poison the loss with NaN.
    norm = T.sqrt(T.maximum(squared_norm, epsilon))
    return x / norm

def cosine_similarity(y_true, y_pred):
    """Row-wise cosine similarity between two 2-D symbolic tensors.

    Both operands are L2-normalised along axis 1 and their element-wise
    product is summed along that axis, giving one value per row.
    """
    for operand in (y_true, y_pred):
        assert operand.ndim == 2
    unit_true = l2_normalize(y_true, axis=1)
    unit_pred = l2_normalize(y_pred, axis=1)
    elementwise = unit_true * unit_pred
    return T.sum(elementwise, axis=1, keepdims=False)

def cosine_ranking_loss(y_true, y_pred):
    """Margin ranking loss over rows of ``y_pred`` interleaved in threes.

    Rows 0, 3, 6, ... are treated as the query, rows 1, 4, 7, ... as the
    correct answer and rows 2, 5, 8, ... as the incorrect answer. The loss is
    the mean of ``max(0, margin - cos(q, correct) + cos(q, incorrect))`` with
    the margin read from the module-level ``args.margin``.
    """
    queries, correct, incorrect = (y_pred[offset::3] for offset in range(3))

    sim_correct = cosine_similarity(queries, correct)
    sim_incorrect = cosine_similarity(queries, incorrect)
    hinge = T.maximum(0., args.margin - sim_correct + sim_incorrect)

    # Zero-valued term that still references y_true; presumably present only
    # so y_true stays part of the compiled graph -- TODO confirm.
    y_true_zero = y_true[0]*0
    return mean(hinge - y_true_zero, axis=-1)

In [3]:
def generate_sequences(data_path, tokenizer, bidirectional):
  while 1:
    with open(data_path) as f:
      for lines in islice(f, args.batch_size / 3):
        print "Lines:", len(lines)
        sequences = tokenizer.texts_to_sequences(lines)
        print "Sequences:", len(sequences)
        x = pad_sequences(sequences, maxlen=args.maxlen)
        y = np.empty((x.shape[0], args.hidden_size))
        print "X,y:", x.shape, y.shape
        if bidirectional:
          yield {'input': x, 'output': y}
        else:
          yield x, y

In [19]:
# Command-line definition for the training run. Inside the notebook the
# argument vector is hard-coded at the bottom instead of coming from sys.argv.
parser = argparse.ArgumentParser()

# Positional arguments, registered in the order they appear on the command line.
for _positional in ("model_path", "weights_path", "history_path"):
  parser.add_argument(_positional)

# Optional flags as (flag, add_argument keyword arguments), in registration order.
_OPTION_SPECS = [
  ("--data_path", dict(default="/storage/hpc_tanel/allenAI/X_studystack_qa_cleaner_ranking_shuffled.txt")),
  ("--tokenizer_path", dict(default="model/tokenizer_studystack_full.pkl")),
  ("--maxlen", dict(type=int)),
  ("--rnn", dict(choices=["LSTM", "GRU"], default="GRU")),
  ("--embed_size", dict(type=int, default=300)),
  ("--hidden_size", dict(type=int, default=1024)),
  ("--layers", dict(type=int, default=1)),
  ("--dropout", dict(type=float, default=0)),
  ("--bidirectional", dict(action='store_true', default=False)),
  ("--batch_size", dict(type=int, default=300)),
  ("--samples_per_epoch", dict(type=int, default=1000000)),
  ("--epochs", dict(type=int, default=100)),
  ("--validation_split", dict(type=float, default=0)),
  ("--optimizer", dict(choices=['adam', 'rmsprop'], default='adam')),
  # "--patience" (type=int, default=10) is currently disabled.
  ("--verbose", dict(type=int, choices=[0, 1, 2], default=1)),
  ("--margin", dict(type=float, default=0.1)),
  ("--dense_layers", dict(type=int, default=0)),
  ("--dense_activation", dict(choices=['relu','sigmoid','tanh'], default='relu')),
]
for _flag, _options in _OPTION_SPECS:
  parser.add_argument(_flag, **_options)

# Fixed argument vector: model architecture, weights and history paths.
args = parser.parse_args([
  "model/reference.json",
  "model/studystack_reference.hdf5",
  "model/studystack_reference.pkl",
])

# The ranking loss slices each batch into three interleaved groups.
assert args.batch_size % 3 == 0, "Batch size must be multiple of 3"

In [6]:
print "Loading tokenizer..."
tokenizer = pickle.load(open(args.tokenizer_path, "rb"))
vocab_size = tokenizer.nb_words+1 if tokenizer.nb_words else len(tokenizer.word_index)+1

Loading tokenizer...


In [7]:
# Map the --rnn choice onto the matching Keras recurrent layer class.
_RNN_CLASSES = {
  'GRU': recurrent.GRU,
  'LSTM': recurrent.LSTM,
}
assert args.rnn in _RNN_CLASSES, "Invalid RNN"
RNN = _RNN_CLASSES[args.rnn]

In [8]:
print "Creating model..."

if args.bidirectional:
  model = Graph()
  model.add_input(name="input", batch_input_shape=(args.batch_size,)+texts.shape[1:], dtype="uint")
  model.add_node(Embedding(vocab_size, args.embed_size, mask_zero=True), name="embed", input='input')
  for i in xrange(args.layers):
    model.add_node(RNN(args.hidden_size, return_sequences=False if i + 1 == args.layers else True), 
        name='forward'+str(i+1), 
        input='embed' if i == 0 else 'dropout'+str(i) if args.dropout > 0 else None, 
        inputs=['forward'+str(i), 'backward'+str(i)] if i > 0 and args.dropout == 0 else [])
    model.add_node(RNN(args.hidden_size, return_sequences=False if i + 1 == args.layers else True, go_backwards=True), 
        name='backward'+str(i+1), 
        input='embed' if i == 0 else 'dropout'+str(i) if args.dropout > 0 else None, 
        inputs=['forward'+str(i), 'backward'+str(i)] if i > 0 and args.dropout == 0 else [])
    if args.dropout > 0:
      model.add_node(Dropout(args.dropout), name='dropout'+str(i+1), inputs=['forward'+str(i+1), 'backward'+str(i+1)])
  model.add_output(name='output',
      input='dropout'+str(args.layers) if args.dropout > 0 else None,
      inputs=['forward'+str(args.layers), 'backward'+str(args.layers)] if args.dropout == 0 else [])
  assert args.dense_layers == 0, "Bidirectional model doesn't support dense layers yet"
else:
  model = Sequential()
  model.add(Embedding(vocab_size, args.embed_size, mask_zero=True))
  for i in xrange(args.layers):
    model.add(RNN(args.hidden_size, return_sequences=False if i + 1 == args.layers else True))
    if args.dropout > 0:
      model.add(Dropout(args.dropout))
  for i in xrange(args.dense_layers):
    if i + 1 == args.dense_layers:
      model.add(Dense(args.hidden_size, activation='linear'))
    else:
      model.add(Dense(args.hidden_size, activation=args.dense_activation))

model.summary()
print "Saving model architecture to", args.model_path
open(args.model_path, 'w').write(model.to_json())

Creating model...
--------------------------------------------------------------------------------
Initial input shape: (None, 107149)
--------------------------------------------------------------------------------
Layer (name)                  Output Shape                  Param #             
--------------------------------------------------------------------------------
Embedding (embedding)         (None, None, 300)             32144700            
GRU (gru)                     (None, 1024)                  4070400             
--------------------------------------------------------------------------------
Total params: 36215100
--------------------------------------------------------------------------------
Saving model architecture to model/reference.json


In [9]:
print "Compiling model..."
if args.bidirectional:
  model.compile(optimizer=args.optimizer, loss={'output': cosine_ranking_loss})
else:
  model.compile(optimizer=args.optimizer, loss=cosine_ranking_loss)

Compiling model...


In [10]:
# Write the weights to args.weights_path via a checkpoint callback
# (save_best_only is off).
checkpoint = ModelCheckpoint(
  filepath=args.weights_path,
  verbose=1,
  save_best_only=False,
)
callbacks = [checkpoint]

# Endless batch source handed to fit_generator.
generator = generate_sequences(
  args.data_path,
  tokenizer,
  args.bidirectional,
)

In [49]:
# Debugging peek: pulls (and therefore consumes) one batch from the generator.
# The tuple unpacking matches the non-bidirectional (x, y) form; in
# bidirectional mode the generator yields a dict instead.
x,y = next(generator)

Lines: 300
Sequences: 300
X,y: (300, 39) (300, 1024)


In [40]:
def generate_sequences(data_path, tokenizer, bidirectional):
  while 1:
    with open(data_path, "r") as f:
      while True:
        lines = list(islice(f, args.batch_size))
        if not lines:
            print "End of file"
            break
        print "Lines:", len(lines)
        sequences = tokenizer.texts_to_sequences(lines)
        print "Sequences:", len(sequences)
        x = pad_sequences(sequences, maxlen=args.maxlen)
        y = np.empty((x.shape[0], args.hidden_size))
        print "X,y:", x.shape, y.shape
        if bidirectional:
          yield {'input': x, 'output': y}
        else:
          yield x, y

generator = generate_sequences(args.data_path, tokenizer, args.bidirectional)
x,y = next(generator)
print x.shape, y.shape
x,y = next(generator)
print x.shape, y.shape

Lines: 300
Sequences: 300
X,y: (300, 48) (300, 1024)
(300, 48) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 35) (300, 1024)
(300, 35) (300, 1024)


In [37]:
f = open(args.data_path, "r")
next_n_lines = list(islice(f, 10))
print next_n_lines
next_n_lines = list(islice(f, 10))
print next_n_lines

['between the visceral and parietal pericardium\n', 'pericardial caivty\n', 'patellar\n', 'Consistently repeating a measurement.\n', 'Precision\n', 'Inference\n', 'smallest units of matter\n', 'atoms\n', 'kneecap\n', 'Flourens and Broca conducted research that demonstrated a connection between\n']
['mind and the brain\n', 'stimulus and response\n', 'The seven bones in the ankle\n', 'Tarsals\n', 'very localized responses in one organ (relaxed situations)\n', 'a medium sized star; the center of our solar system\n', 'Sun\n', 'moon\n', 'the passing of materials and energy from one organism to an other\n', 'food chain\n']


In [34]:
# NOTE(review): `it` is not defined in any visible cell; this only runs if the
# name is still alive in the kernel from earlier work -- confirm or remove.
list(it)

['patellar\n',
 'Consistently repeating a measurement.\n',
 'Precision\n',
 'Inference\n',
 'smallest units of matter\n',
 'atoms\n',
 'kneecap\n',
 'Flourens and Broca conducted research that demonstrated a connection between\n']

In [50]:
print "Fitting model..."
if args.bidirectional:
  history = model.fit_generator(generator, samples_per_epoch=args.samples_per_epoch,
      nb_epoch=args.epochs, verbose=args.verbose, callbacks=callbacks)
else:
  history = model.fit_generator(generator, samples_per_epoch=args.samples_per_epoch,
      nb_epoch=args.epochs, verbose=args.verbose, callbacks=callbacks)

Fitting model...
Lines: 300
Epoch 1/100
Sequences: 300
X,y: (300, 44) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 41) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 37) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 46) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 43) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 37) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 39) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 47) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 37) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 46) (300, 1024)
Lines: 300
Sequences: 300
X,y: (300, 37) (300, 1024)
     44/1000000 [..............................] - ETA: 24462s - loss: 0.1000Lines: 300
Sequences: 300
X,y: (300, 45) (300, 1024)
     85/1000000 [..............................] - ETA: 22317s - loss: 0.1000Lines: 300
Sequences: 300
X,y: (300, 43) (300, 1024)
    122/1000000 [..............................] - ETA: 23614s - loss: 0.1000Lines: 300
Sequences: 300
X,y: (300, 38) (300, 1024

KeyboardInterrupt: 