In [1]:
import argparse
import csv
from random import shuffle
from model import *
from preprocess import *

Using Theano backend.


Using gpu device 0: Tesla K20m (CNMeM is disabled)


In [2]:
  # Build the argument parser inside the notebook and parse an explicit
  # argument list (not sys.argv), so that `args` carries the attributes the
  # later cells read.
  parser = argparse.ArgumentParser()
  # Positional arguments: the weights file to load and the file that receives
  # the sampled triples.
  parser.add_argument("model_path")
  parser.add_argument("output_path")
  # Optional arguments with their defaults.
  parser.add_argument("--data_path", default="/storage/hpc_tanel/allenAI/studystack_qa_cleaner_no_qm.txt")
  parser.add_argument("--load_tokenizer", default="model/tokenizer_studystack_full.pkl")
  parser.add_argument("--macrobatch_size", type=int, default=1000)
  # Margin bounds default to None, which the sampling loop treats as "no bound".
  parser.add_argument("--min_margin", type=float)
  parser.add_argument("--max_margin", type=float)
  parser.add_argument("--load_arch")
  parser.add_argument("--save_arch")
  # Project helpers; presumably they register further options on the parser
  # (e.g. `maxlen`, which is read later but not declared here) -- confirm.
  add_model_params(parser)
  add_training_params(parser)
  add_data_params(parser)
  # Same six tokens the original obtained by splitting one long string.
  args = parser.parse_args([
    "--data_path", "/storage/hpc_tanel/allenAI/studystack_qa_cleaner_no_qm.txt",
    "--load_tokenizer", "../model/tokenizer_studystack_full.pkl",
    "../model/studystack_margin0.2_30_acc_0.4132.hdf5",
    "test.txt",
  ])


In [3]:
  print "Loading tokenizer..."
  tokenizer = load_tokenizer(args.load_tokenizer)
  vocab_size = vocabulary_size(tokenizer)
  print "Vocabulary size:", vocab_size

Loading tokenizer...
Vocabulary size: 108891


In [4]:
  if args.load_arch:
    print "Loading model architecture from", args.load_arch
    model = model_from_json(args.load_arch)
  else:
    print "Creating model..."
    model = create_model(vocab_size, args)

Creating model...


In [5]:
  # Print the layer-by-layer summary (output shapes and parameter counts) of the model built above.
  model.summary()

--------------------------------------------------------------------------------
Initial input shape: (None, 108891)
--------------------------------------------------------------------------------
Layer (name)                  Output Shape                  Param #             
--------------------------------------------------------------------------------
Embedding (embedding)         (None, None, 300)             32667300            
GRU (gru)                     (None, 1024)                  4070400             
--------------------------------------------------------------------------------
Total params: 36737700
--------------------------------------------------------------------------------


In [6]:
  print "Loading weights from %s" % args.model_path
  model.load_weights(args.model_path)

  print "Compiling model..."
  compile_model(model, args)

Loading weights from ../model/studystack_margin0.2_30_acc_0.4132.hdf5
Compiling model...


In [7]:
  # Open the file that receives the sampled (question, answer, wrong answer) lines.
  print "Sampling data to", args.output_path
  # Append mode ("a"): re-running adds to an existing file instead of replacing it.
  # NOTE(review): none of the cells shown here closes this handle.
  output = open(args.output_path, "a")

Sampling data to test.txt


In [8]:
# Open the data file named by --data_path; it is consumed through the csv reader created in the next cell.
# NOTE(review): the handle is not closed in the cells shown.
f = open(args.data_path)

In [9]:
# Tab-separated input. QUOTE_NONE turns off quote processing, so quote characters are kept as ordinary text;
# strict=True makes the reader raise csv.Error on malformed input instead of guessing.
reader = csv.reader(f, delimiter="\t", strict=True, quoting=csv.QUOTE_NONE)

In [10]:
        # read macrobatch_size lines from reader
        lines = list(islice(reader, args.macrobatch_size))
        print "Lines:", len(lines)

Lines: 1000


In [11]:
        shuffle(lines)
        ids, questions, answers = zip(*lines)
        print "ids:", len(ids), "questions:", len(questions), "answers:", len(answers)

ids: 1000 questions: 1000 answers: 1000


In [12]:
        texts = questions + answers
        print "texts:", len(texts)
        data = text_to_data(texts, tokenizer, args.maxlen)
        print "data:", data.shape

texts: 2000
data: (2000, 49)


In [13]:
        pred = predict_data(model, data, args)
        print "pred:", pred.shape
        half = int(pred.shape[0] / 2)
        question_vectors = pred[0:half]
        answer_vectors = pred[half:]
        print "question_vectors:", question_vectors.shape, "answer_vectors.shape", answer_vectors.shape

pred: (2000, 1024)
question_vectors: (1000, 1024) answer_vectors.shape (1000, 1024)


In [14]:
 %%timeit
    for i, q in enumerate(question_vectors):
          q = q[np.newaxis, ...]
          sims = np_cosine_similarity(q, answer_vectors)
          sorted = reversed(np.argsort(sims))
          #print ""
          #print "question %d:" % i, questions[i]
          for j in sorted:
            #print "answer %d:" % j, answers[j], "(correct answer: %s)" % answers[i]
            #print "similarity:", sims[j], "(margin %f)" % (sims[i] - sims[j])
            if j != i and answers[j].strip().lower() != answers[i].strip().lower() \
                and (args.min_margin is None or sims[i] - sims[j] > args.min_margin):
              if (args.max_margin is None or sims[i] - sims[j] < args.max_margin):
                output.write(questions[i]+"\n")
                output.write(answers[i]+"\n")
                output.write(answers[j]+"\n")
              else:
                print "discarded"
              break

1 loops, best of 3: 10.9 s per loop


In [20]:
%%timeit
# Time taking one question vector and adding a leading axis, which turns the 1-D vector into a one-row 2-D array.
q = question_vectors[0]
q = q[np.newaxis, ...]

The slowest run took 32.09 times longer than the fastest. This could mean that an intermediate result is being cached 
1000000 loops, best of 3: 565 ns per loop


In [21]:
# Same two steps outside %%timeit, so that `q` stays defined for the following cells.
q = question_vectors[0]
q = q[np.newaxis, ...]

In [22]:
%%timeit
# Time one similarity computation: the single query row against all answer vectors.
sims = np_cosine_similarity(q, answer_vectors)

100 loops, best of 3: 10.7 ms per loop


In [23]:
%%timeit
# Baseline: time only the loop over the question vectors and the reshape, without any similarity computation.
for i, q in enumerate(question_vectors):
      q = q[np.newaxis, ...]

1000 loops, best of 3: 539 µs per loop


In [24]:
%%timeit
# Same loop with the similarity call added; comparing with the reshape-only loop isolates the cost of the call.
for i, q in enumerate(question_vectors):
      q = q[np.newaxis, ...]
      sims = np_cosine_similarity(q, answer_vectors)

1 loops, best of 3: 10.9 s per loop


In [25]:
# Number of question vectors (iterations of the loops timed above) and their dimensionality.
question_vectors.shape

(1000, 1024)

In [26]:
%%timeit

def np_l2_normalize(x, axis):
    """Scale `x` to unit L2 norm along `axis`.

    Args:
        x: array to normalise.
        axis: axis along which the norm is computed; it is kept as a
            length-1 axis so the division broadcasts over `x`.

    Returns:
        Array shaped like `x` whose slices along `axis` have unit length.
        All-zero slices are returned as zeros rather than being divided by
        zero and turned into NaN.
    """
    norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True))
    # A zero norm only occurs for an all-zero slice; dividing it by 1 leaves
    # the zeros in place. In-place assignment keeps the dtype of `norm`.
    norm[norm == 0] = 1.0
    return x / norm

def np_cosine_similarity(y_true, y_pred):
    """Row-wise cosine similarity between two 2-D arrays.

    Both inputs are L2-normalised along axis 1, multiplied elementwise and
    summed over axis 1. The product broadcasts, so a single (1, d) row can be
    compared against an (n, d) matrix, giving n similarities.

    Raises:
        AssertionError: if either argument is not 2-dimensional.
    """
    for operand in (y_true, y_pred):
        assert operand.ndim == 2
    unit_true = np_l2_normalize(y_true, axis=1)
    unit_pred = np_l2_normalize(y_pred, axis=1)
    return (unit_true * unit_pred).sum(axis=1)

# Timed call using the two functions defined just above in this cell: one query row against all answer vectors.
sims = np_cosine_similarity(q, answer_vectors)

100 loops, best of 3: 10.9 ms per loop


In [27]:
%%timeit
# Time normalising the single query row.
np_l2_normalize(q, axis=1)

The slowest run took 8.46 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 26.1 µs per loop


In [28]:
%%timeit
# Time normalising the whole answer matrix; np_cosine_similarity repeats this work on every call.
np_l2_normalize(answer_vectors, axis=1)

100 loops, best of 3: 7.21 ms per loop


In [29]:
%%timeit
# Time only the elementwise product and the row sum, without any normalisation.
np.sum(q * answer_vectors, axis=1, keepdims=False)

100 loops, best of 3: 2.57 ms per loop


In [30]:
# Bind the argument names used inside np_l2_normalize so that its body can be timed one statement at a time below.
x = answer_vectors
axis = 1

In [31]:
%%timeit
# Time the complete normalisation written inline: norm computation plus division.
norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True))
y = x / norm

100 loops, best of 3: 7.44 ms per loop


In [32]:
%%timeit
# Step 1 of the normalisation: time the elementwise square.
x1 = np.square(x)

1000 loops, best of 3: 1.58 ms per loop


In [33]:
# Same computation outside %%timeit, so that `x1` is available to the following cells.
x1 = np.square(x)

In [34]:
%%timeit
# Step 2 of the normalisation: time the sum of squares along `axis`.
np.sum(x1, axis=axis, keepdims=True)

1000 loops, best of 3: 655 µs per loop


In [35]:
# Keep the sums of squares for the next step; keepdims=True retains the summed axis with length 1.
x2 = np.sum(x1, axis=axis, keepdims=True)
print x2.shape

(1000, 1)


In [36]:
%%timeit
# Step 3 of the normalisation: time the square root of the per-row sums.
norm = np.sqrt(x2)

The slowest run took 17.92 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 7.2 µs per loop


In [37]:
# Keep the norms and show the two shapes that the division in the next cell broadcasts over.
norm = np.sqrt(x2)
print x.shape, norm.shape

(1000, 1024) (1000, 1)


In [38]:
%%timeit
# Step 4 of the normalisation: time the broadcast division of every row by its norm.
x / norm

100 loops, best of 3: 5.02 ms per loop


In [40]:
from sklearn.metrics.pairwise import pairwise_distances

In [41]:
# Compute all question-versus-answer cosine distances in one call, as an alternative to the per-question loop.
dists = pairwise_distances(question_vectors, answer_vectors, metric="cosine", n_jobs=1)

In [42]:
# One row per question vector and one column per answer vector.
dists.shape

(1000, 1000)

In [44]:
# Recompute the similarities of question 0 with np_cosine_similarity, for comparison with row 0 of `dists`.
q = question_vectors[0]
q = q[np.newaxis, ...]
sims = np_cosine_similarity(q, answer_vectors)
sims.shape

(1000,)

In [45]:
# First ten similarities of question 0 as computed by np_cosine_similarity.
sims[:10]

array([ 0.46994926,  0.10139717,  0.12391221,  0.35504785,  0.17364977,
        0.17945529,  0.09990746,  0.09201521,  0.07195911,  0.04709271])

In [48]:
# Cosine distance is 1 - cosine similarity, so this converts row 0 back to similarities for comparison with sims[:10].
1-dists[0,:10]

array([ 0.46994926,  0.10139717,  0.12391221,  0.35504785,  0.17364977,
        0.17945529,  0.09990746,  0.09201521,  0.07195911,  0.04709271])

In [50]:
# Display the characters that the standard library's `string` module lists as punctuation.
import string
print string.punctuation

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
