In [1]:
import numpy as np
import json
import time
import pickle
import sys
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import unicodedata
import json
from pprint import pprint
import random

In [2]:
# Fixed seed so that random.shuffle in create_dataset yields a reproducible article order.
SEED=43
# random.seed must be *called*; assigning to it (random.seed=SEED) would overwrite the
# function with an int and leave the generator unseeded.
random.seed(SEED)

In [None]:
"""Training Dataset"""
# Parse the training JSON into `data`; create_dataset later iterates over data['data'].
# NOTE(review): the file name suggests SQuAD 2.0 -- confirm the expected schema/version.
with open('train-v2.0.json') as f:
    data = json.load(f)

In [3]:
# Punkt tokenizer models, needed by nltk.word_tokenize (used in preprocess/create_dataset).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# WordNet corpus, needed by WordNetLemmatizer (used in preprocess).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Perceptron POS-tagger model, needed by nltk.pos_tag (used in preprocess/create_dataset).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
"""Hyperparameters"""
# Dimensionality of the word vectors (the GloVe file loaded below is the 300d release).
word_embedding_dim=300
# Presumably the dimensionality of the character vectors; no visible cell reads it -- TODO confirm.
char_embedding_dim=300
# Maximum number of tokens in a question; create_dataset skips longer questions.
q_words=50
# Maximum number of tokens in a context; create_dataset skips longer contexts.
c_words=399
# Placeholder character vocabulary; a later cell rebinds it to the character-embedding keys.
vocab=[0]
# Number of character ids stored per word; preprocess pads each word up to this length.
max_word_len = 40

In [17]:
"""Convert GloVe file to a dictionary"""
def loadGloveModel(gloveFile):
    """Parse a GloVe-format text file into a ``{token: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the token and
    the remaining fields are parsed as floats into a 1-D ``numpy`` array.

    Args:
        gloveFile: path of the text file to read.

    Returns:
        dict mapping each token to its ``numpy`` vector. When a token occurs on
        several lines, the last occurrence wins.

    Raises:
        ValueError: if a field after the token cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = dict()
    # The context manager closes the handle even if a malformed line raises.
    with open(gloveFile,'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to index.
                continue
            model[splitLine[0]] = np.array([float(val) for val in splitLine[1:]])
    # Single-argument print() behaves the same on Python 2 and 3; the double space
    # reproduces the output of the original comma-separated print statement.
    print("Done. %d  words loaded!" % len(model))
    return model

In [18]:
%%time
"""Load word as well as character embedding"""
# Word-level vectors; preprocess looks tokens up in `model`.
# (The run logged below reports 2196016 entries and ~5 minutes of wall time.)
model = loadGloveModel("GloVe/glove.840B.300d.txt")
# Character-level vectors (94 entries in the logged run); their keys become `vocab`.
model_char = loadGloveModel("glove.840B.300d-char.txt")

Loading Glove Model
Done. 2196016  words loaded!
Loading Glove Model
Done. 94  words loaded!
CPU times: user 5min, sys: 8.31 s, total: 5min 8s
Wall time: 5min 9s


In [None]:
# Character vocabulary: a character's position in this list is the id preprocess emits.
# list(...) keeps `vocab.index(...)` working on Python 3, where dict.keys() returns a
# view without .index(); on Python 2 the result is the same list as before.
vocab=list(model_char.keys())

In [None]:
"""Reduce a sentence to its word embedding, character embedding, a boolean vector which 
tells if any word is in GloVe dictionary or not and an integer array of start positions of every word"""
def preprocess(sentence):
    """Tokenize ``sentence`` and build its word-level and character-level features.

    Each token from ``word_tokenize`` is lemmatized, folded to ASCII and looked up
    in the module-level GloVe dictionary ``model``; its characters are mapped to
    ids through the module-level ``vocab`` list.

    Args:
        sentence: text to process (the callers pass strings decoded from JSON).

    Returns:
        A 4-tuple ``(word_emb, char_emb, word_in_glove, y)``:
            word_emb      -- array with one vector per token; a zero vector of
                             length ``word_embedding_dim`` when the token is not
                             in ``model``.
            char_emb      -- array with exactly ``max_word_len`` character ids
                             per token, padded with ``len(vocab)``.
            word_in_glove -- list of 1/0 flags, 1 when the token was in ``model``.
            y             -- list with the character offset in ``sentence`` at
                             which each token starts.
    """
    wnl = WordNetLemmatizer()
    tagged = pos_tag(word_tokenize(sentence))
    y = []
    word_emb = []
    char_emb = []
    word_in_glove = []
    pad_id = len(vocab)
    word_zeros = np.zeros(word_embedding_dim, dtype=float)
    count = 0
    for token, tag in tagged:
        # Skip any whitespace (spaces, newlines, tabs) so the recorded offset is the
        # token's first character; skipping only ' ' shifted every later offset.
        while count < len(sentence) and sentence[count].isspace():
            count += 1
        y.append(count)
        if token == u'``' or token == u"''":
            # The tokenizer rewrites a double quote as `` or ''; look it up as '"'.
            x = '"'
            # Usually the source holds the single '"' character, but advance by the
            # token length when the text literally contains the two-character form.
            count += len(token) if sentence.startswith(token, count) else 1
        else:
            # NOTE(review): pos_tag's default tags mark adjectives as 'JJ*', so the
            # 'a' case probably never matches -- confirm whether 'j' was intended.
            pos = tag[0].lower()
            if pos in ('a', 'n', 'v'):
                lemma = wnl.lemmatize(token, pos)
            else:
                lemma = wnl.lemmatize(token)
            # Fold to plain ASCII, then decode so the value is text (not bytes) and
            # matches the dictionary/vocabulary keys on Python 2 and Python 3 alike.
            x = unicodedata.normalize('NFKD', lemma).encode('ascii', 'ignore').decode('ascii')
            count += len(token)
        if x in model:
            word_emb.append(model[x])
            word_in_glove.append(1)
        else:
            word_emb.append(word_zeros)
            word_in_glove.append(0)
        char_ids = []
        for ch in x:
            try:
                char_ids.append(vocab.index(ch))
            except ValueError:
                # Characters outside the vocabulary are dropped (unchanged behaviour).
                pass
        # Truncate before padding so every row has exactly max_word_len ids;
        # otherwise an over-long word makes char_emb a ragged array.
        char_ids = char_ids[:max_word_len]
        char_ids += [pad_id] * (max_word_len - len(char_ids))
        char_emb.append(char_ids)
    char_emb = np.array(char_emb)
    word_emb = np.array(word_emb)
    return word_emb, char_emb, word_in_glove, y

In [None]:
"""Traverses over whole dataset and preprocesses every context and question, and builds an array of the dataset in
vectorized form"""
def _locate_answer(c_idx, answer):
    """Return the (start, end) token indices of ``answer`` within the context.

    ``start`` is the position in ``c_idx`` of the answer's character offset and
    ``end`` is ``start`` plus the number of tokens in the lower-cased answer text.
    Raises ValueError when the offset is not the start of a context token.
    """
    start = c_idx.index(answer['answer_start'])
    # Only the token count is needed, so the answer is tokenized but not POS-tagged.
    return start, start + len(word_tokenize(answer['text'].lower()))


def create_dataset(data,test=True):
    """Vectorize every (context, question) pair of a dataset and pickle the result.

    Articles are visited in a shuffled order. Contexts longer than ``c_words``
    tokens and questions longer than ``q_words`` tokens are skipped, as are
    questions with an empty answer list. Only the first (plausible) answer of a
    question is used.

    Args:
        data: parsed dataset dict; ``data['data']`` is a list of articles, each
            with a ``'title'`` and ``'paragraphs'`` holding ``'context'`` and
            ``'qas'`` entries.
        test: when ``False`` the records are written to ``train_data_unans.pkl``,
            otherwise to ``test_data_unans.pkl``.

    Returns:
        List of dicts, one per kept question. For answerable questions
        ``answer_start``/``answer_end`` hold the token span and the ``plausible_*``
        fields are 0; for unanswerable questions it is the other way round.
    """
    data_map = []
    errors = 0
    # Shuffle a copy so the caller's data['data'] list is not reordered in place.
    articles = list(data['data'])
    random.shuffle(articles)
    for t_count, article in enumerate(articles, 1):
        # One progress line per article, in a form valid on Python 2 and Python 3.
        print("%d. %s Errors=%d" % (t_count, article['title'], errors))
        for para in article['paragraphs']:
            c_word, c_char, c_bool, c_idx = preprocess(para['context'])
            if len(c_word) > c_words:
                continue
            for qa in para['qas']:
                q_word, q_char, q_bool, _ = preprocess(qa['question'])
                if len(q_word) > q_words:
                    continue
                impossible = qa['is_impossible']
                try:
                    if impossible:
                        # Unanswerable: keep the first plausible answer as the span.
                        candidates = qa['plausible_answers']
                        found, blank = 'plausible', 'answer'
                    else:
                        candidates = qa['answers']
                        found, blank = 'answer', 'plausible'
                    if len(candidates) == 0:
                        continue
                    first = candidates[0]
                    start, end = _locate_answer(c_idx, first)
                except (KeyError, ValueError):
                    # Missing field, or the answer offset does not line up with a
                    # token start: count it and move on. Other exceptions (and
                    # KeyboardInterrupt) propagate instead of being swallowed.
                    errors += 1
                    continue
                record = {
                    'title': article['title'],
                    'context': para['context'],
                    'contextword': c_word,
                    'contextchar': c_char,
                    'questionword': q_word,
                    'questionchar': q_char,
                    'contextbool': c_bool,
                    'questionbool': q_bool,
                    'is_impossible': impossible,
                    'context_indices': c_idx,
                    'answer': first['text'],
                }
                record[found + '_start'] = start
                record[found + '_end'] = end
                # The span that does not apply to this question is zeroed out.
                record[blank + '_start'] = 0
                record[blank + '_end'] = 0
                data_map.append(record)

    # `test == False` (not `not test`) keeps the original branch selection exactly.
    is_train = (test == False)
    out_path = "train_data_unans.pkl" if is_train else 'test_data_unans.pkl'
    # Pickle streams are binary: text mode ('w') fails on Python 3 and can corrupt
    # the file on platforms that translate newlines.
    with open(out_path, 'wb') as f:
        pickle.dump(data_map, f)
        if is_train:
            print("Dumped")
    print("Errors="+str(errors))
    return data_map

In [None]:
%%time
"""Preprocess every data item for training dataset"""
# test=False: create_dataset pickles the records to train_data_unans.pkl and returns them.
dm = create_dataset(data,False)

In [None]:
# Load the dev split; this rebinds `data`, replacing the training JSON loaded earlier.
with open('dev-v2.0.json') as f:
    data = json.load(f)

In [None]:
%%time
"""Preprocess every data item for testing dataset"""
# test=True: create_dataset pickles the records to test_data_unans.pkl.
# Note that `data` is rebound here from the raw JSON dict to the list of preprocessed
# records, so this cell cannot be re-run without first re-loading dev-v2.0.json.
data = create_dataset(data,True)