In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import json, re, unicodedata, string, typing, time
import torch.nn.functional as F
import spacy
from collections import Counter
import pickle
from nltk import word_tokenize
nlp = spacy.load('en_core_web_sm')
from preprocess import *

In [2]:
# Load the raw SQuAD-style JSON files for the train and test splits.
# load_json / parse_data are not defined in this notebook; presumably they come
# from the `from preprocess import *` line in the first cell — TODO confirm.

train_data = load_json('./dataset/squad_train.json')
test_data = load_json('./dataset/squad_test.json')

# Parse the JSON structure so each split becomes a list of dictionaries
# (one dictionary per example, per the original author's note).

train_list = parse_data(train_data)
test_list = parse_data(test_data)

print('Train list len: ',len(train_list))
print('Test list len: ',len(test_list))

# Convert the lists into DataFrames; later cells filter and extend these
# two frames.

train_df = pd.DataFrame(train_list)
test_df = pd.DataFrame(test_list)

def normalize_spaces(text):
    '''
    Maps every whitespace character in `text` (tab, newline, etc.) to a plain
    space, one for one. Runs of whitespace are NOT collapsed, so the length of
    the string and the position of every other character stay the same.
    '''
    return re.sub(r'\s', ' ', text)

# Replace each whitespace character in every context with a plain space
# (normalize_spaces substitutes one-for-one, so string lengths are unchanged).
train_df.context = train_df.context.apply(normalize_spaces)
test_df.context = test_df.context.apply(normalize_spaces)
# preview the first rows of the training frame
train_df.head()

Length of data:  442
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  University_of_Notre_Dame
Length of data:  48
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Super_Bowl_50
Train list len:  87599
Test list len:  34726


Unnamed: 0,id,context,question,label,answer
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"[515, 541]",Saint Bernadette Soubirous
1,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"[188, 213]",a copper statue of Christ
2,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"[279, 296]",the Main Building
3,5733be284776f41900661181,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,"[381, 420]",a Marian place of prayer and reflection
4,5733be284776f4190066117e,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,"[92, 126]",a golden statue of the Virgin Mary


In [3]:
# Find the outlier rows of each split, then rebind each frame to a copy
# without them (same rows removed as an in-place drop, but no mutation of the
# frame that an earlier cell displayed).

drop_ids_train = filter_large_examples(train_df)
drop_ids_test = filter_large_examples(test_df)

train_df = train_df.drop(index=list(drop_ids_train))
test_df = test_df.drop(index=list(drop_ids_test))

In [4]:
# row counts of the train / test frames at this point in the notebook
len(train_df), len(test_df)

(87335, 34439)

In [5]:
# Gather text to build vocabularies.
# Both the train and the test frame are passed in, so whatever vocabulary is
# built from `vocab_text` is drawn from both splits.

vocab_text = gather_text_for_vocab([train_df, test_df])
print("Number of sentences in dataset: ", len(vocab_text))
# peek at the first three collected texts
vocab_text[:3]

Number of sentences in dataset:  118441


['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 "As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued

In [6]:
# Build word-level and character-level vocabularies from the gathered text.
# build_word_vocab returns three objects (word->index map, index->word map and
# the vocabulary itself, going by the variable names — TODO confirm against
# the helper's definition); build_char_vocab returns two.
print("---------------word vocabulary-------------------")
word2idx, idx2word, word_vocab = build_word_vocab(vocab_text)
print("------------character vocabulary-----------------")
char2idx, char_vocab = build_char_vocab(vocab_text)

---------------word vocabulary-------------------
raw-vocab: 110472
vocab-length: 110474
word2idx-length: 110474
------------character vocabulary-----------------
raw-char-vocab: 1397
char-vocab-intersect: 230
char2idx-length: 232


In [7]:
# Persist the vocabulary artifacts as .npy files under ./dataset.
# NOTE(review): if these objects are plain dicts/lists (not confirmed from this
# notebook), np.save stores them as pickled object arrays, and reading them
# back needs np.load(..., allow_pickle=True) — verify at the load site.
np.save('./dataset/qa_word2idx.npy',word2idx)
np.save('./dataset/qa_idx2word.npy',idx2word)
np.save('./dataset/qa_word_vocab.npy',word_vocab)
np.save('./dataset/qa_char2idx.npy',char2idx)
np.save('./dataset/qa_char_vocab.npy',char_vocab)

In [8]:
# Numericalize contexts and questions for the training and testing sets.
# Each frame gains a 'context_ids' column followed by a 'question_ids' column,
# both computed with the shared word2idx mapping.

for frame in (train_df, test_df):
    frame['context_ids'] = frame.context.apply(context_to_ids, word2idx=word2idx)
    frame['question_ids'] = frame.question.apply(question_to_ids, word2idx=word2idx)

In [9]:
# Collect the row labels that get_error_indices flags for each split, then
# rebind each frame to a copy without those rows.

train_err = get_error_indices(train_df, idx2word)
test_err = get_error_indices(test_df, idx2word)

train_df = train_df.drop(index=train_err)
test_df = test_df.drop(index=test_err)

Number of error indices: 1000
Number of error indices: 428


In [10]:
# Compute the answer's start and end positions within the context for every
# row; these positions are the training labels for the QA models.

# training split
train_label_idx = train_df.apply(index_answer, axis=1, idx2word=idx2word)
train_df['label_idx'] = train_label_idx

# testing split
test_label_idx = test_df.apply(index_answer, axis=1, idx2word=idx2word)
test_df['label_idx'] = test_label_idx

In [11]:
# Dump the lookup tables and the processed frames to pickle files.
# `pickle` is imported once in the notebook's first cell, so it is not
# re-imported here.
with open('./dataset/qaw2id.pickle','wb') as handle:
    pickle.dump(word2idx, handle)

with open('./dataset/qac2id.pickle','wb') as handle:
    pickle.dump(char2idx, handle)

# the frames carry the id columns and label_idx added by the earlier cells
train_df.to_pickle('./dataset/qatrain.pkl')
test_df.to_pickle('./dataset/qatest.pkl')

In [12]:
def create_glove_matrix(path="./dataset/glove.840B.300d.txt", dim=300):
    '''
    Parses a GloVe word-vector text file and returns a dictionary with the words as
    keys and their respective pretrained word vectors as values.

    Every line is expected to look like ``<token> <v1> ... <v_dim>`` with single
    spaces between fields. The LAST `dim` fields are read as the vector and
    everything before them is re-joined as the token, so a token that itself
    contains spaces (reported for the 840B release, e.g. ". . .") is parsed
    instead of raising ValueError during float conversion.

    Args:
        path: location of the GloVe text file (defaults to the file this
            notebook has always read).
        dim: number of vector components on each line.

    Returns:
        dict mapping token (str) -> np.ndarray of shape (dim,), dtype float32.

    Raises:
        ValueError: if one of the last `dim` fields of a line is not a number.
    '''
    glove_dict = {}
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.rstrip("\r\n").split(' ')
            if len(values) <= dim:
                # blank or truncated line: there is no token + full vector to read
                continue
            word = ' '.join(values[:-dim])
            glove_dict[word] = np.asarray(values[-dim:], dtype="float32")

    return glove_dict
# parse the GloVe text file into a word -> vector dictionary
glove_dict = create_glove_matrix()

In [13]:
def create_word_embedding(glove_dict, vocab=None, dim=300):
    '''
    Creates a weight matrix of the words that are common in the GloVe vocab and
    the dataset's vocab. Initializes OOV words with a zero vector.

    Args:
        glove_dict: mapping word -> pretrained vector (as returned by
            create_glove_matrix).
        vocab: ordered iterable of vocabulary words; row i of the result
            belongs to the i-th word. Defaults to the notebook-level
            `word_vocab`, which is what this function always used.
        dim: length of each embedding vector.

    Returns:
        (weights_matrix, words_found): a float array of shape
        (len(vocab), dim) and the number of vocabulary words that had a
        pretrained vector.

    Raises:
        ValueError: if a pretrained vector does not have `dim` components.
            (The previous bare `except` silently left such rows at zero.)
    '''
    if vocab is None:
        # backward-compatible default: the vocabulary built earlier in the notebook
        vocab = word_vocab
    weights_matrix = np.zeros((len(vocab), dim))
    words_found = 0
    for i, word in enumerate(vocab):
        vector = glove_dict.get(word)
        if vector is None:
            # out-of-vocabulary for GloVe: the row keeps its zero initialisation
            continue
        weights_matrix[i] = vector
        words_found += 1
    return weights_matrix, words_found

In [14]:
# Build the embedding matrix for the dataset vocabulary and report how many
# vocabulary words had a pretrained GloVe vector.
weights_matrix, words_found = create_word_embedding(glove_dict)
print("Total words found in glove vocab: ", words_found)

Total words found in glove vocab:  91193


In [15]:
# persist the embedding weight matrix as a .npy file under ./dataset
np.save('./dataset/qaglove_vt.npy',weights_matrix)