# Example notebook showcasing how we interact with JTReaders

In [1]:
# First change dir to JTR parent.
# A bare `os.chdir('..')` climbs one more level every time this cell is re-run,
# which silently breaks every relative `data/...` path used further down.
# Resolve the parent directory once per kernel session and always chdir to that
# absolute path, so re-running the cell is a no-op.
import os

if 'JTR_ROOT' not in globals():
    # First execution in this kernel: remember the absolute path of the parent directory
    JTR_ROOT = os.path.abspath('..')
os.chdir(JTR_ROOT)

### Bookkeeping of all existing readers: `readers.py`

In [2]:
import jack.readers as readers

In [3]:
# `readers.readers` maps reader names to the callables that build them; print the available names
print("Existing models:\n%s" % ", ".join(readers.readers.keys()))

Existing models:
complex_reader, distmult_reader, esim_snli_reader, bidaf_reader, cbilstm_snli_reader, modelf_reader, dam_snli_reader, transe_reader, fastqa_reader


### Create a reader

In [4]:
%%script bash
# Run the GloVe download script; per its output it does nothing when glove.6B.50d.txt already exists
bash data/GloVe/download_small.sh

glove.6B.50d.txt already exists! Doing nothing!


In [5]:
from jack.io.embeddings.embeddings import load_embeddings
from jack.util.vocab import Vocab

# The fastqa_reader needs a vocabulary that comes with embeddings (not every
# reader does), so load the GloVe vectors and initialise the vocab from them.
embeddings = load_embeddings('data/GloVe/glove.6B.50d.txt', 'glove')
vocab = Vocab(emb=embeddings, init_from_embeddings=True)

# Reader configuration. `repr_dim_input` is read off the second axis of
# `embeddings.lookup` instead of being hard-coded.
config = {
    "repr_dim": 10,
    "repr_dim_input": embeddings.lookup.shape[1],
    "model": "fastqa_reader",
    "max_span_size": 10,
}

In [6]:
# create example reader
from jack.core import SharedResources

# Bundle the vocab and config built in the previous cell into one SharedResources object
svac = SharedResources(vocab, config)
# Look the reader up by name in the registry and call the registered entry with the shared resources
fastqa_reader = readers.readers["fastqa_reader"](svac)

### Setting up a reader from training data 

In [7]:
from jack.io.load import load_jack
# Load the SQuAD snippet (jtr JSON format) that serves as training data in this notebook
train_data = load_jack('data/SQuAD/snippet.jtr.json')
# all parameters are initialized after this call
fastqa_reader.setup_from_data(train_data)

### Saving the reader

In [8]:
# Write the set-up reader to a directory; the next cell lists the files this produces
fastqa_reader.store("/tmp/fastqa_reader")

In [9]:
%%sh
# Show what the store call above wrote to disk
ls /tmp/fastqa_reader/

checkpoint
model_module.data-00000-of-00001
model_module.index
model_module.meta
shared_resources
shared_resources_vocab


### Loading the reader

In [10]:
# A reader that has already been set up can simply load the stored state back in
# (the log output shows the parameters being restored from the given directory)
fastqa_reader.load("/tmp/fastqa_reader")

INFO:tensorflow:Restoring parameters from /tmp/fastqa_reader/model_module


In [11]:
import tensorflow as tf
# reset graph -> computation graph is gone
# NOTE(review): presumably the reader created earlier is unusable after this - confirm
tf.reset_default_graph()

# or set up a new reader from file: it is constructed from an empty SharedResources
# and `load_and_setup` is pointed at the stored directory
# (presumably vocab and config are restored from there - confirm against jack.core)
svac = SharedResources()
fastqa_reader = readers.readers["fastqa_reader"](svac)
fastqa_reader.load_and_setup("/tmp/fastqa_reader")

INFO:tensorflow:Restoring parameters from /tmp/fastqa_reader/model_module


In [12]:
# reset graph -> computation graph is gone
tf.reset_default_graph()

# or even shorter, use the utility function for creating and loading a reader from file
# (a single call in place of the construct-then-`load_and_setup` steps of the previous cell)
fastqa_reader = readers.reader_from_file("/tmp/fastqa_reader")

INFO:tensorflow:Restoring parameters from /tmp/fastqa_reader/model_module


### Applying the reader

In [13]:
from jack.io.load import load_jack
# Load the SQuAD snippet again; each item unpacks into a (question, answer) pair
data = load_jack('data/SQuAD/snippet.jtr.json')

# The reader is called with the question part only, so drop the second element of each pair
questions = [question for question, _answer in data]

In [14]:
# Of course the output is not correct because the model was not trained at all
predictions = fastqa_reader(questions)
for question, answers in zip(questions, predictions):
    first_answer = answers[0]  # only the first answer returned for each question is shown
    print("Question: " + question.question)
    print("Answer:   %s \t %.3f" % (first_answer.text, first_answer.score))
    print()

Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer:   s gold dome is a golden statue of the 	 0.242

Question: What is in front of the Notre Dame Main Building?
Answer:   s gold dome is a golden statue of the 	 0.110

Question: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Answer:   s gold dome is a golden statue of the 	 0.225

Question: What is the Grotto at Notre Dame?
Answer:   s gold dome is a golden statue of the 	 0.280

Question: What sits on top of the Main Building at Notre Dame?
Answer:   s gold dome is a golden statue of the 	 0.187

Question: When did the Scholastic Magazine of Notre dame begin publishing?
Answer:   journal in September 1876, the 	 0.080

Question: How often is Notre Dame's the Juggler published?
Answer:   journal in September 1876, the 	 -0.032

Question: What is the daily student paper at Notre Dame called?
Answer:   journal in September 1876, the 	 -0.022

Question: How many student ne

# Training

In [16]:
# For training we normally use the bin/jack-train.py script; programmatically, however, we could quickly train a model like this
from jack.util.hooks import LossHook, ExamplesPerSecHook
import tensorflow as tf

# set up the reader in training mode, starting from a fresh graph
tf.reset_default_graph()
fastqa_reader = readers.readers["fastqa_reader"](svac)
fastqa_reader.setup_from_data(train_data, is_training=True)

# A single batch spans the whole training set, so each epoch is exactly one iteration
batch_size = len(train_data)
# Both hooks fire on every iteration (iter_interval=1)
hooks = [
    LossHook(fastqa_reader, iter_interval=1),
    ExamplesPerSecHook(fastqa_reader, batch_size, iter_interval=1),
]
optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
fastqa_reader.train(
    optimizer,
    training_set=train_data,
    batch_size=batch_size,
    max_epochs=20,
    hooks=hooks,
)

INFO:jack.core.reader:Preparing training data...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:jack.core.reader:Number of parameters: 6341
INFO:jack.core.reader:Start training...
INFO:jack.util.hooks:Epoch 1	Iter 1	train loss 9.99771499633789
INFO:jack.util.hooks:Epoch 2	Iter 2	train loss 9.868426322937012
INFO:jack.util.hooks:Epoch 3	Iter 3	train loss 9.303072929382324
INFO:jack.util.hooks:Epoch 4	Iter 4	train loss 7.600021839141846
INFO:jack.util.hooks:Epoch 5	Iter 5	train loss 6.575923919677734
INFO:jack.util.hooks:Epoch 6	Iter 6	train loss 5.771175384521484
INFO:jack.util.hooks:Epoch 7	Iter 7	train loss 5.4432830810546875
INFO:jack.util.hooks:Epoch 8	Iter 8	train loss 4.33134126663208
INFO:jack.util.hooks:Epoch 9	Iter 9	train loss 4.106665134429932
INFO:jack.util.hooks:Epoch 10	Iter 10	train loss 3.571859121322632
INFO:jack.util.hooks:Epoch 11	Iter 11	train loss 3.0804221630096436
INFO:jack.util.hooks:Epoch 12	Iter 12	train loss 2.6599819660186768
INFO:jack.util.hooks:Epoch 13	Iter 13	train loss 2.335930109024048
INFO:jack.util.hooks:Epoch 14	Iter 14	train loss 2.321081

In [38]:
# After training the output should look better
for question, answers in zip(questions, fastqa_reader(questions)):
    top_answer = answers[0]  # first answer in the list returned for this question
    print("Question: " + question.question)
    print("Answer:   %s \t %.3f" % (top_answer.text, top_answer.score))
    print()

Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer:   Saint Bernadette Soubirous 	 15.086

Question: What is in front of the Notre Dame Main Building?
Answer:   a copper statue of Christ 	 5.544

Question: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Answer:   the Main Building 	 2.587

Question: What is the Grotto at Notre Dame?
Answer:   a Marian place of prayer and reflection 	 7.989

Question: What sits on top of the Main Building at Notre Dame?
Answer:   a golden statue of the Virgin Mary 	 3.314

Question: When did the Scholastic Magazine of Notre dame begin publishing?
Answer:   September 1876 	 -8.032

Question: How often is Notre Dame's the Juggler published?
Answer:   twice 	 3.903

Question: What is the daily student paper at Notre Dame called?
Answer:   The Observer 	 -6.612

Question: How many student news papers are found at Notre Dame?
Answer:   three 	 -2.750

Question: In what year did the student pa

In [31]:

from collections import defaultdict
from typing import NamedTuple

import progressbar

from jack.core import *
from jack.readers.extractive_qa.shared import XQAPorts
from jack.tfutil import sequence_encoder
from jack.tfutil.xqa import xqa_crossentropy_loss
from jack.util import preprocessing
from jack.util.map import numpify

def get_answer_and_span(question, doc_idx, start, end, token_offsets, selected_support):
    """Convert a predicted token span into the answer text and its character span.

    Args:
        question: object whose ``support`` attribute is indexable and yields
            the text of a support document.
        doc_idx: position within ``selected_support`` of the predicted document.
        start: index of the first token of the span.
        end: index of the last token of the span (inclusive).
        token_offsets: character offset at which each token of the document starts.
        selected_support: maps ``doc_idx`` to an index into ``question.support``.

    Returns:
        A tuple ``(answer, doc_idx, (char_start, char_end))`` in which
        ``doc_idx`` is the resolved index into ``question.support`` and
        ``char_end`` is exclusive, measured after trailing whitespace has been
        stripped from the answer.
    """
    support_idx = selected_support[doc_idx]
    char_start = token_offsets[start]
    support_text = question.support[support_idx]

    # The span runs up to the start of the following token, or to the end of
    # the document when `end` is the last token.
    ends_on_last_token = end >= len(token_offsets) - 1
    raw_end = len(support_text) if ends_on_last_token else token_offsets[end + 1]

    # Cutting at the next token's start can pull in the separating whitespace; drop it.
    answer = support_text[char_start:raw_end].rstrip()
    return answer, support_idx, (char_start, char_start + len(answer))

class MyXQAOutputModule(OutputModule):
    """Output module that converts predicted spans into ``Answer`` objects.

    For every question it produces ``beam_size`` answers, each built from one
    entry of ``span_prediction`` via ``get_answer_and_span``.
    """

    def __init__(self, shared_resources):
        """Read the beam size from ``shared_resources.config`` (1 if not configured)."""
        self.beam_size = shared_resources.config.get("beam_size", 1)

    def __call__(self, questions, span_prediction,
                 token_offsets, selected_support, support2question,
                 start_scores, end_scores):
        """Build the answers for a batch of questions.

        Args:
            questions: the questions of the batch; each must offer what
                ``get_answer_and_span`` reads from it.
            span_prediction: flat sequence holding ``beam_size`` consecutive
                entries per question, each unpacking into ``(doc_idx, start, end)``.
            token_offsets: one entry per support, aligned with ``support2question``.
            selected_support: one entry per support, aligned with ``support2question``.
            support2question: for each support, the index of the question it belongs to.
            start_scores: indexed as ``start_scores[doc_idx, start]``.
            end_scores: accepted to match ``input_ports`` but not read here.

        Returns:
            A list with one inner list of ``beam_size`` ``Answer`` objects per question.
        """
        all_answers = []
        for k, q in enumerate(questions):
            # Restrict the per-support inputs to the supports of question k.
            # Both lists depend only on k, so they are built once per question
            # rather than once per beam entry.
            q_token_offsets = [to for q_id, to in zip(support2question, token_offsets) if q_id == k]
            q_selected_support = [s for q_id, s in zip(support2question, selected_support) if q_id == k]

            answers = []
            for j in range(self.beam_size):
                # Beam entries of question k occupy a contiguous slice of span_prediction
                doc_idx, start, end = span_prediction[k * self.beam_size + j]
                # NOTE(review): the score uses the start score only; end_scores is unused.
                # Confirm that this is intended for this custom module.
                score = start_scores[doc_idx, start]
                answer, doc_idx, span = get_answer_and_span(
                    q, doc_idx, start, end, q_token_offsets[doc_idx], q_selected_support)
                answers.append(Answer(answer, span=span, doc_idx=doc_idx, score=score))
            all_answers.append(answers)

        return all_answers

    @property
    def input_ports(self) -> List[TensorPort]:
        """Ports this module consumes, listed in the same order as the arguments
        of ``__call__`` that follow ``questions``."""
        return [Ports.Prediction.answer_span, XQAPorts.token_offsets,
                XQAPorts.selected_support, XQAPorts.support2question,
                Ports.Prediction.start_scores, Ports.Prediction.end_scores]

In [32]:
# Swap the reader's output module for the custom one defined in the previous cell
# (this assigns to the private `_output_module` attribute directly)
fastqa_reader._output_module = MyXQAOutputModule(fastqa_reader.shared_resources)