In [13]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import util
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations

from args import get_test_args
from collections import OrderedDict
import json
from json import dumps
from models import BiDAF, QANet, UnifiedQANet
from os.path import join
from tensorboardX import SummaryWriter
from tqdm import tqdm
from ujson import load as json_load
from util import collate_fn, SQuAD, metric_max_over_ground_truths, compute_em, compute_f1, compute_avna

In [2]:
train_dataset = SQuAD('./data/train.npz', True)
dev_dataset = SQuAD('./data/dev.npz', True)

In [3]:
word_vectors = util.torch_from_json('./data/word_emb.json')
char_vectors = util.torch_from_json('./data/char_emb.json')

In [4]:
print(word_vectors.shape)
print(char_vectors.shape)

torch.Size([88714, 300])
torch.Size([1376, 64])


In [583]:
with open('./data/word2idx.json', "r") as fh:
    word2idx = json.load(fh)
idx2word = {v: k for k, v in word2idx.items()}

In [584]:
context_idx = dev_dataset.context_idxs[0]

def idx_to_context(context_idx):
    tmp = [idx2word[idx.item()] for idx in context_idx]
    return tmp

In [585]:
dev_dataset.y1s

tensor([35, 22, 56,  ...,  0,  0,  0])

In [586]:
question_head = []
context_list = []
question_list = []
ground_truth_answer = []
for x, y, y1, y2 in zip(dev_dataset.question_idxs, dev_dataset.context_idxs, dev_dataset.y1s, dev_dataset.y2s):
    for i in range(1, len(x)):
        if idx2word[x[i].item()] != '--OOV--' and idx2word[x[i].item()] != '--NULL--':
            question_head.append(idx2word[x[i].item()].lower())
            break
    context = idx_to_context(y)
    context_list.append(' '.join(context))
    question_list.append(' '.join(idx_to_context(x)))
    if y1 != 0 and y2 != 0:
        ground_truth_answer.append(' '.join(context[y1:y2+1]))
    else:
        ground_truth_answer.append(' ')
question_head

['in',
 'when',
 'from',
 'who',
 'what',
 'who',
 'what',
 'who',
 'when',
 'who',
 'who',
 'what',
 'what',
 'who',
 'who',
 'who',
 'what',
 'what',
 'when',
 'what',
 'when',
 'when',
 'who',
 'what',
 'when',
 'what',
 'who',
 'what',
 'who',
 'when',
 'what',
 'what',
 'what',
 'what',
 'what',
 'what',
 'what',
 'who',
 'what',
 'who',
 'who',
 'who',
 'who',
 'who',
 'during',
 'when',
 'when',
 'who',
 'who',
 'when',
 'who',
 'when',
 'what',
 'who',
 'who',
 'who',
 'who',
 'what',
 'where',
 'what',
 'where',
 'who',
 'what',
 'what',
 'when',
 'how',
 'who',
 'what',
 'what',
 'how',
 'where',
 'who',
 'what',
 'who',
 'who',
 'who',
 'when',
 'what',
 'where',
 'who',
 'who',
 'what',
 'who',
 'who',
 'to',
 'who',
 'who',
 'when',
 'who',
 'who',
 'when',
 'who',
 'when',
 'what',
 'who',
 'where',
 'who',
 'when',
 'who',
 'when',
 'what',
 'who',
 'who',
 'what',
 'who',
 'what',
 'who',
 'in',
 'what',
 'what',
 'where',
 'who',
 'what',
 'who',
 'who',
 'when',
 'who

In [587]:
ys_train = list(zip(train_dataset.y1s.numpy(), train_dataset.y2s.numpy()))
ys_dev = list(zip(dev_dataset.y1s.numpy(), dev_dataset.y2s.numpy())) 
df_train = pd.DataFrame(ys_train, columns=['y1', 'y2'])
df_dev = pd.DataFrame(ys_dev, columns=['y1', 'y2'])

In [588]:
device, gpu_ids = util.get_available_devices()

In [39]:
model = UnifiedQANet(word_vectors=word_vectors,
              char_vectors=char_vectors,
              hidden_size=128,
              num_head=4)

In [40]:
def load_model(model, checkpoint_path, gpu_ids, return_step=True):
    device = 'cpu'
    ckpt_dict = torch.load(checkpoint_path, map_location=device)

    # Build model, load parameters
    model.load_state_dict(ckpt_dict['model_state'], strict=False)

    if return_step:
        step = ckpt_dict['step']
        return model, step

    return model

In [41]:
model = nn.DataParallel(model, gpu_ids)
model = load_model(model, 'save/train/uqanet-02/best.pth.tar', None, return_step=False)
model = model.to(device)
model.eval()

DataParallel(
  (module): UnifiedQANet(
    (emb): Embedding(
      (word_emb): Embedding(88714, 300)
      (char_emb): Embedding(1376, 64)
      (seg_emb): Embedding(2, 128)
      (conv2d): Conv2d(64, 128, kernel_size=(1, 5), stride=(1, 1))
      (conv1d_word): FeedForward(
        (out): Linear(in_features=300, out_features=128, bias=False)
      )
      (conv1d): FeedForward(
        (out): Linear(in_features=256, out_features=128, bias=False)
      )
      (hwy): HighwayEncoder(
        (transforms): ModuleList(
          (0): Linear(in_features=128, out_features=128, bias=True)
          (1): Linear(in_features=128, out_features=128, bias=True)
        )
        (gates): ModuleList(
          (0): Linear(in_features=128, out_features=128, bias=True)
          (1): Linear(in_features=128, out_features=128, bias=True)
        )
      )
    )
    (emb_enc_blks): ModuleList(
      (0): EncoderBlock(
        (convs): ModuleList(
          (0): DepthwiseSeparableConv(
            (depth

In [42]:
dataset = dev_dataset
data_loader = data.DataLoader(dataset,
                              batch_size=64,
                              shuffle=False,
                              num_workers=4,
                              collate_fn=collate_fn)

In [43]:
nll_meter = util.AverageMeter()
pred_dict = {}  # Predictions for TensorBoard
eval_file = './data/dev_eval.json'
with open(eval_file, 'r') as fh:
    gold_dict = json_load(fh)
with torch.no_grad(), \
        tqdm(total=len(dataset)) as progress_bar:
    for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
        # Setup for forward
        cw_idxs = cw_idxs.to(device)
        qw_idxs = qw_idxs.to(device)
        cc_idxs = cc_idxs.to(device)
        qc_idxs = qc_idxs.to(device)
        batch_size = cw_idxs.size(0)

        # Forward
        log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
        y1, y2 = y1.to(device), y2.to(device)
        loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
        nll_meter.update(loss.item(), batch_size)

        # Get F1 and EM scores
        p1, p2 = log_p1.exp(), log_p2.exp()
        starts, ends = util.discretize(p1, p2, 15, True)

        # Log info
        progress_bar.update(batch_size)

        idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                  ids.tolist(),
                                                  starts.tolist(),
                                                  ends.tolist(),
                                                  True)
        pred_dict.update(idx2pred)

100%|█████████████████████████████████████████████████████████████████████████████| 5951/5951 [00:13<00:00, 428.90it/s]


In [589]:
df = pd.DataFrame([[len(x.split()) for x in pred_dict.values()], question_head, context_list]).T
df.columns = ['predicted_len', 'question_head', 'context']

In [590]:
em = []
f1 = []
avna = []
for key, value in pred_dict.items():
    ground_truths = gold_dict[key]['answers']
    prediction = value

    em.append(metric_max_over_ground_truths(compute_em, prediction, ground_truths))
    f1.append(metric_max_over_ground_truths(compute_f1, prediction, ground_truths))
    avna.append(compute_avna(prediction, ground_truths))

In [591]:
df = pd.DataFrame([em, f1, avna, question_head]).T
df.columns = ['EM', 'F1', 'AvNA', 'question_head']

In [592]:
df.groupby('question_head').mean()

Unnamed: 0_level_0,EM,F1,AvNA
question_head,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"""",1.000000,1.000000,1.00000
-,0.000000,0.000000,0.00000
1,0.000000,0.000000,0.00000
2,0.000000,0.000000,0.00000
51.6,1.000000,1.000000,1.00000
...,...,...,...
why,0.595238,0.676061,0.77381
william,0.000000,0.000000,0.00000
with,0.000000,0.800000,1.00000
within,1.000000,1.000000,1.00000


In [599]:
tmp = pd.value_counts(df.question_head).sort_values(ascending=False)

In [604]:
key_types = ['what', 'who', 'how', 'when', 'where', 'which', 'why']
key_types

['what', 'who', 'how', 'when', 'where', 'which', 'why']

In [605]:
for i, q in enumerate(question_head):
    if q not in key_types or q=='in' or q=='the':
        question_head[i] = 'other'

In [606]:
df = pd.DataFrame([em, f1, avna, question_head, context_list, question_list, pred_dict.values(), ground_truth_answer]).T
df.columns = ['EM', 'F1', 'AvNA', 'question_head', 'context', 'question', 'pred_answer', 'true_answer']

In [607]:
df

Unnamed: 0,EM,F1,AvNA,question_head,context,question,pred_answer,true_answer
0,1,1.0,1.0,other,--OOV-- The Normans ( Norman : --OOV-- ; Frenc...,--OOV-- In what country is Normandy located ? ...,France,France
1,1,1.0,1.0,when,--OOV-- The Normans ( Norman : --OOV-- ; Frenc...,--OOV-- When were the Normans in Normandy ? --...,10th and 11th centuries,10th and 11th centuries
2,1,1.0,1.0,other,--OOV-- The Normans ( Norman : --OOV-- ; Frenc...,--OOV-- From which countries did the Norse ori...,"Denmark, Iceland and Norway","Denmark , Iceland and Norway"
3,1,1.0,1.0,who,--OOV-- The Normans ( Norman : --OOV-- ; Frenc...,--OOV-- Who was the Norse leader ? --NULL-- --...,Rollo,Rollo
4,1,1.0,1.0,what,--OOV-- The Normans ( Norman : --OOV-- ; Frenc...,--OOV-- What century did the Normans first gai...,10th,10th
...,...,...,...,...,...,...,...,...
5946,0,0,0.0,what,--OOV-- The pound - force has a metric counter...,--OOV-- What is the seldom used force unit equ...,,--OOV--
5947,1,1,1.0,what,--OOV-- The pound - force has a metric counter...,--OOV-- What does not have a metric counterpar...,,
5948,1,1,1.0,what,--OOV-- The pound - force has a metric counter...,--OOV-- What is the force exerted by standard ...,,
5949,1,1,1.0,what,--OOV-- The pound - force has a metric counter...,--OOV-- What force leads to a commonly used un...,,


In [608]:
tmp = (df.groupby("question_head").mean()*100)
tmp

  tmp = (df.groupby("question_head").mean()*100)


Unnamed: 0_level_0,EM,F1,AvNA
question_head,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
how,64.952381,69.322802,74.857143
other,63.920208,67.098779,74.674761
what,66.317311,69.862453,75.966751
when,75.0,75.851732,81.363636
where,62.068966,66.595574,75.0
which,68.493151,74.414052,82.876712
who,67.880795,70.323837,74.503311
why,59.52381,67.606063,77.380952


In [611]:
df.groupby("question_head").count()['EM'].sort_values()

question_head
why        84
which     146
where     232
when      440
how       525
who       604
other    1153
what     2767
Name: EM, dtype: int64

In [609]:
pd.DataFrame([tmp.loc[h] for h in key_types]).T

Unnamed: 0,what,who,how,when,where,which,why
EM,66.317311,67.880795,64.952381,75.0,62.068966,68.493151,59.52381
F1,69.862453,70.323837,69.322802,75.851732,66.595574,74.414052,67.606063
AvNA,75.966751,74.503311,74.857143,81.363636,75.0,82.876712,77.380952


In [603]:
bad_example = df[(df.question_head == 'the') & (df.EM==0)]#.iloc[2]
bad_example

Unnamed: 0,EM,F1,AvNA,question_head,context,question,pred_answer,true_answer


In [561]:
bad_example = df[(df.question_head == 'what') & (df.EM==0)].iloc[2]
bad_example

EM                                                               0
F1                                                               0
AvNA                                                           0.0
question_head                                                 what
context          --OOV-- The Norman dynasty had a major politic...
question         --OOV-- What type of major impact did the Norm...
pred_answer                political, cultural and military impact
true_answer                                                       
Name: 12, dtype: object

In [562]:
bad_example.context

'--OOV-- The Norman dynasty had a major political , cultural and military impact on medieval Europe and even the Near East . The Normans were famed for their martial spirit and eventually for their Christian piety , becoming exponents of the Catholic orthodoxy into which they assimilated . They adopted the Gallo - Romance language of the Frankish land they settled , their dialect becoming known as Norman , --OOV-- or Norman French , an important literary language . The Duchy of Normandy , which they formed by treaty with the French crown , was a great fief of medieval France , and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure . The Normans are noted both for their culture , such as their unique Romanesque architecture and musical traditions , and for their significant military accomplishments and innovations . Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantin

In [563]:
bad_example.question

'--OOV-- What type of major impact did the Norman dynasty have on modern Europe ? --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL--'

In [564]:
bad_example.pred_answer

'political, cultural and military impact'

In [565]:
bad_example.true_answer

' '

In [635]:
bad_example = df.iloc[3098]
bad_example

EM                                                               0
F1                                                               0
AvNA                                                           0.0
question_head                                                  why
context          --OOV-- The reason for the order of the classe...
question         --OOV-- Why were Northern Chinese ranked highe...
pred_answer                                                       
true_answer                                       they surrendered
Name: 3098, dtype: object

In [636]:
bad_example.context

'--OOV-- The reason for the order of the classes and the reason why people were placed in a certain class was the date they surrendered to the Mongols , and had nothing to do with their ethnicity . The earlier they surrendered to the Mongols , the higher they were placed , the more the held out , the lower they were ranked . The Northern Chinese were ranked higher and Southern Chinese were ranked lower because southern China withstood and fought to the last before caving in . Major commerce during this era gave rise to favorable conditions for private southern Chinese manufacturers and merchants . --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL--

In [637]:
bad_example.question

'--OOV-- Why were Northern Chinese ranked higher ? --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL--'

In [638]:
bad_example.pred_answer

''

In [639]:
bad_example.true_answer

'they surrendered'

In [531]:
bad_example = df[(df.question_head == 'when')&(df.EM==0)&(df.F1==0)&(df.pred_answer!= ' ')&(df.true_answer!= ' ')].loc[3741]
bad_example

EM                                                               0
F1                                                               0
AvNA                                                           1.0
question_head                                                 when
context          --OOV-- Immediately after Decision Time a " Me...
question         --OOV-- When is the Members Debate held ? --NU...
pred_answer                                             45 minutes
true_answer                        Immediately after Decision Time
Name: 3741, dtype: object

In [532]:
bad_example.context

'--OOV-- Immediately after Decision Time a " Members Debate " is held , which lasts for 45 minutes . Members Business is a debate on a motion proposed by an --OOV-- who is not a Scottish minister . Such motions are on issues which may be of interest to a particular area such as a member \'s own constituency , an upcoming or past event or any other item which would otherwise not be accorded official parliamentary time . As well as the --OOV-- , other members normally contribute to the debate . The relevant minister , whose department the debate and motion relate to " winds up " the debate by speaking after all other participants . --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --

In [533]:
bad_example.question

'--OOV-- When is the Members Debate held ? --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL-- --NULL--'

In [534]:
bad_example.pred_answer

'45 minutes'

In [535]:
bad_example.true_answer

'Immediately after Decision Time'