## Import Libraries

In [1]:
import pandas as pd                                                        #data import
from nltk import word_tokenize                                             #word tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer                  #character tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences          #padding
from tqdm import tqdm                                                      #track progress
import numpy as np                                                         #for numpy operations
from sklearn.model_selection import train_test_split                       #to split the data into train and test sets
import pickle                                                              #to save tokenizers
from copy import deepcopy
from numpy import save, load

Using TensorFlow backend.


## Data Import

In [4]:
df = pd.read_pickle("answerable_records.pkl")

In [7]:
df.head()

Unnamed: 0,question,answer,context
0,When did Beyonce start becoming popular?,in the late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,What areas did Beyonce compete in when she was...,singing and dancing,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2,When did Beyonce leave Destiny's Child and bec...,2003,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3,In what city and state did Beyonce grow up?,"Houston, Texas",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4,In which decade did Beyonce become famous?,late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...


## Data Cleaning

In [8]:
#removing records that do not have an answer
df = df[df['answer'].notnull()]

## PTB Tokenizer to tokenize the words and get start and end indices of the answer span

In [9]:
#looping through records and getting unique words from the dataset using PTBTokenizer and mapping them to tokens
word_dictionary = {}

for i, row in tqdm(df.iterrows()):
    
    text = row['question'] + " " + row['context']
    tokens = word_tokenize(text)
    
    for j in tokens:
        if j not in word_dictionary.keys():
            word_dictionary[j] = len(word_dictionary)

82532it [01:13, 1128.11it/s]


In [10]:
print("Number of unique words in the total dataset(train + test): ",len(word_dictionary))

110624


In [11]:
def get_start_end_words(row):
    """Function that loops through the context to find out the start and end indices 
    of the answer span using PTB tokenized words"""    
    
    answer = word_tokenize(row['answer'])
    context = word_tokenize(row['context'])
    
    start_word=end_word=-1
    
    match=False
   
    for j in range(len(context)-len(answer)):
        if context[j]==answer[0]:
            match=True
            k=0
            for k in range(1, len(answer)):
                if context[j+k]!=answer[k]:
                    match=False
            if match==True:
                start_word=j
                end_word=j+k
                break
  
    row['start_word'] = start_word
    row['end_word'] = end_word

    return row

In [13]:
tqdm.pandas()
dataset =  df_answerable.progress_apply(get_start_end_words, axis=1)

  from pandas import Panel
100%|███████████████████████████████████████████████████████████████████████████| 82532/82532 [02:42<00:00, 508.79it/s]


In [14]:
#removing records for which the function was not able to find the answer span(some more cleaning)
dataset = dataset[dataset['start_word']!=-1]

In [16]:
dataset.head()

Unnamed: 0,question,answer,context,start_word,end_word
0,When did Beyonce start becoming popular?,in the late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,50,53
1,What areas did Beyonce compete in when she was...,singing and dancing,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,38,40
2,When did Beyonce leave Destiny's Child and bec...,2003,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,104,104
3,In what city and state did Beyonce grow up?,"Houston, Texas",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,30,32
4,In which decade did Beyonce become famous?,late 1990s,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,52,53


## Train Test Split

In [19]:
df_train, df_test = train_test_split(dataset, test_size = 0.2)

In [20]:
print(df_train.shape, df_test.shape)

(64769, 5) (16193, 5)


## Word and Character Tokenization of Question and Context

In [None]:
#building word tokenization dictionary of question and context
word_tokenizer = {'PAD':0,'UNK':1}

for i, row in tqdm(df_train.iterrows()):
    
    text = row['question'] + " " + row['context']
    tokens = word_tokenize(text)
    
    for j in tokens:
        if j not in word_tokenizer.keys():
            word_tokenizer[j] = len(word_tokenizer)

64769it [01:19, 809.96it/s]


In [None]:
#building character tokenization dictionary of question and context
char_tokenizer = Tokenizer(char_level=True, oov_token='UNK')
char_tokenizer.fit_on_texts(df_train['question'] + df_train['context'])

In [None]:
def tokenize_entries(row):
    """Function that tokenizes(both word and character level) the question and context of all records"""
    question = word_tokenize(row['question'])
    context = word_tokenize(row['context'])
    
    question_word_tokens = []
    context_word_tokens = []
    question_char_tokens = []
    context_char_tokens = []
    
    for i in question:
        if i in word_tokenizer.keys():
            question_word_tokens.append(word_tokenizer[i])
            question_char_tokens.append(char_tokenizer.texts_to_sequences([i])[0])
        else:
            question_word_tokens.append(word_tokenizer['UNK'])
            question_char_tokens.append(char_tokenizer.texts_to_sequences([i])[0])
            
    for i in context:
        if i in word_tokenizer.keys():
            context_word_tokens.append(word_tokenizer[i])
            context_char_tokens.append(char_tokenizer.texts_to_sequences([i])[0])
        else:
            context_word_tokens.append(word_tokenizer['UNK'])
            context_char_tokens.append(char_tokenizer.texts_to_sequences([i])[0])
            
    row['question_word_tokens'] = question_word_tokens
    row['context_word_tokens'] = context_word_tokens
    row['question_char_tokens'] = question_char_tokens
    row['context_char_tokens'] = context_char_tokens
    
    return row

In [None]:
tqdm.pandas()

In [None]:
df_train = df_train.progress_apply(tokenize_entries, axis=1)

100%|██████████| 64769/64769 [05:36<00:00, 192.35it/s]


In [None]:
df_test = df_test.progress_apply(tokenize_entries, axis=1)

100%|██████████| 16193/16193 [01:20<00:00, 201.90it/s]


In [None]:
#dataset after tokenization
df_train.head()

Unnamed: 0,question,answer,context,start_word,end_word,question_word_tokens,context_word_tokens,question_char_tokens,context_char_tokens
26862,What was replaced by the spinning wheel?,the traditional distaff,"In agriculture, the increased usage of sheep w...",26,28,"[2, 3, 4, 5, 6, 7, 8, 9]","[10, 11, 12, 6, 13, 14, 15, 16, 17, 18, 19, 20...","[[20, 11, 5, 4], [20, 5, 9], [10, 3, 18, 12, 5...","[[6, 7], [5, 19, 10, 6, 14, 15, 12, 4, 15, 10,..."
91742,Around how many camps were set up by the Germa...,45 camps,"Shortly after the end of the war in May 1945, ...",80,81,"[102, 103, 104, 105, 62, 106, 107, 5, 6, 108, ...","[111, 112, 6, 113, 15, 6, 114, 82, 115, 116, 1...","[[5, 10, 8, 15, 7, 13], [11, 8, 20], [16, 5, 7...","[[9, 11, 8, 10, 4, 12, 21], [5, 17, 4, 3, 10],..."
15093,Who recognized hydrogen gas as a discreet subs...,Henry Cavendish,"In 1671, Robert Boyle discovered and described...",29,30,"[182, 183, 184, 185, 49, 21, 186, 187, 9]","[10, 188, 12, 189, 190, 191, 88, 192, 6, 193, ...","[[20, 11, 8], [10, 3, 14, 8, 19, 7, 6, 39, 3, ...","[[6, 7], [27, 46, 45, 27], [23], [10, 8, 22, 3..."
22826,On what date did the Japanese land on Enewetak?,"September 29, 1914","In 1914, Japan joined the Entente during World...",24,27,"[247, 248, 249, 250, 6, 251, 252, 59, 253, 9]","[10, 254, 12, 255, 256, 6, 257, 109, 258, 259,...","[[8, 7], [20, 11, 5, 4], [13, 5, 4, 3], [13, 6...","[[6, 7], [27, 33, 27, 44], [23], [34, 5, 18, 5..."
63456,"How many ""voices"" did Montini's posters claim ...",1000,"During his period in Milan, Montini was known ...",65,65,"[312, 104, 206, 313, 209, 250, 314, 246, 315, ...","[145, 321, 322, 82, 318, 12, 314, 3, 323, 49, ...","[[11, 8, 20], [16, 5, 7, 21], [471, 471], [24,...","[[13, 15, 10, 6, 7, 19], [11, 6, 9], [18, 3, 1..."


## Computation of Max Word and Character lengths for Padding

In [None]:
question_word_token_lens = []

for i in df_train['question_word_tokens'].values:
    question_word_token_lens.extend([len(i)])
    
print("Max number of words in question: ",np.array(question_word_token_lens).max())
print("Mean number of words in question: ",np.array(question_word_token_lens).mean())

Max number of words in question:  32
Mean number of words in question:  11.133196436566877


In [None]:
context_word_token_lens = []

for i in df_train['context_word_tokens'].values:
    context_word_token_lens.extend([len(i)])
    
print("Max number of words in context: ",np.array(context_word_token_lens).max())
print("Mean number of words in context: ",np.array(context_word_token_lens).mean())

Max number of words in context:  340
Mean number of words in context:  131.99289783692817


In [None]:
question_max = np.array(question_word_token_lens).max()
context_max = np.array(context_word_token_lens).max()

In [None]:
char_max = 0
total_len = 0
total_words = 0
dist = []

for i, row in tqdm(df_train.iterrows()):
    for j in row['question_char_tokens']:
        dist.append(len(j))
        total_len += len(j)
        total_words += 1
        if len(j)>char_max:
            char_max = len(j)
            
            
    for k in row['context_char_tokens']:
        dist.append(len(k))
        total_len += len(j)
        total_words += 1
        if len(k)>char_max:
            char_max = len(k)

print("Maximum length of word: ", char_max)
print("Mean length of word: ", total_len/total_words)
print("99.9 percentage of words have a character length of : ", np.percentile(dist,99.9))

64769it [00:18, 3574.98it/s]


Maximum length of word:  37
Mean length of word:  1.3062994894198647
99.9 percentage of words have a character length of :  16.0


In [None]:
char_max = 15

In [None]:
#Number of unique words and characters in the train dataset
print(len(word_tokenizer))
print(len(char_tokenizer.word_index))

109302
1218


## Padding Word Sequences

In [None]:
def pad_word_sequences(row):
    """Function that does padding for word tokens"""
    
    question = row['question_word_tokens'].copy()
    for i in range(len(question), question_max):
        question.append(word_tokenizer['PAD'])
        
    context = row['context_word_tokens'].copy()
    for i in range(len(context), context_max):
        context.append(word_tokenizer['PAD'])
        
    question = np.array(question[:question_max], dtype=np.int32)
    context = np.array(context[:context_max], dtype=np.int32)
    
    row['question_word_padded'] = question
    row['context_word_padded'] = context
    
    return row

In [None]:
df_train = df_train.progress_apply(pad_word_sequences, axis=1)

100%|██████████| 64769/64769 [01:59<00:00, 541.27it/s]


In [None]:
df_test = df_test.progress_apply(pad_word_sequences, axis=1)

100%|██████████| 16193/16193 [00:29<00:00, 543.37it/s]


## Padding Character Sequences

In [None]:
def pad_char_sequences(row):
    """Function that does padding for character tokens"""
    
    question = deepcopy(row['question_char_tokens'])
    question_chars = []
    for i in question:
        for j in range(len(i), char_max):
            i.append(0)
        question_chars.append(np.array(i[:char_max], dtype=np.int32))
    
    for i in range(len(question_chars), question_max):
        question_chars.append(np.zeros(char_max, dtype=np.int32))
        
    context = deepcopy(row['context_char_tokens'])
    context_chars = []
    for i in context:
        for j in range(len(i), char_max):
            i.append(0)
        context_chars.append(np.array(i[:char_max], dtype=np.int32))
        
    for i in range(len(context_chars), context_max):
        context_chars.append(np.zeros(char_max, dtype=np.int32))
        
    question_chars = np.array(question_chars, dtype=np.int32)
    context_chars = np.array(context_chars, dtype=np.int32)
    
    row['question_char_padded'] = question_chars
    row['context_char_padded'] = context_chars
    
    return row

In [None]:
df_train = df_train.progress_apply(pad_char_sequences, axis=1)

100%|██████████| 64769/64769 [03:26<00:00, 314.23it/s]


In [None]:
df_test = df_test.progress_apply(pad_char_sequences, axis=1)

100%|██████████| 16193/16193 [00:51<00:00, 312.06it/s]


In [None]:
#dataset after padding
df_train.head()

Unnamed: 0,question,answer,context,start_word,end_word,question_word_tokens,context_word_tokens,question_char_tokens,context_char_tokens,question_word_padded,context_word_padded,question_char_padded,context_char_padded
87248,Who is the second largest business district em...,La Défense,The second-largest business district in terms ...,9,10,"[2, 3, 4, 5, 6, 7, 8, 9, 10]","[11, 12, 7, 8, 13, 14, 15, 16, 3, 17, 18, 19, ...","[[20, 11, 8], [6, 9], [4, 11, 3], [9, 3, 14, 8...","[[4, 11, 3], [9, 3, 14, 8, 7, 13, 31, 12, 5, 1...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0, 0, 0, 0,...","[11, 12, 7, 8, 13, 14, 15, 16, 3, 17, 18, 19, ...","[[20, 11, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[4, 11, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
49617,Which language tree groups Dutch with English?,Indo-European,"Within the Indo-European language tree, Dutch ...",2,2,"[73, 74, 75, 76, 77, 78, 79, 10]","[80, 4, 81, 74, 75, 19, 77, 3, 82, 83, 4, 84, ...","[[20, 11, 6, 14, 11], [12, 5, 7, 19, 15, 5, 19...","[[20, 6, 4, 11, 6, 7], [4, 11, 3], [6, 7, 13, ...","[73, 74, 75, 76, 77, 78, 79, 10, 0, 0, 0, 0, 0...","[80, 4, 81, 74, 75, 19, 77, 3, 82, 83, 4, 84, ...","[[20, 11, 6, 14, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[20, 6, 4, 11, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27058,What is the act of analyzing morphophones called?,morphophonology,"Since the early 1960s, theoretical linguists h...",49,49,"[124, 3, 4, 125, 15, 126, 127, 128, 10]","[129, 4, 130, 131, 19, 132, 133, 115, 134, 135...","[[20, 11, 5, 4], [6, 9], [4, 11, 3], [5, 14, 4...","[[9, 6, 7, 14, 3], [4, 11, 3], [3, 5, 10, 12, ...","[124, 3, 4, 125, 15, 126, 127, 128, 10, 0, 0, ...","[129, 4, 130, 131, 19, 132, 133, 115, 134, 135...","[[20, 11, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[9, 6, 7, 14, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
117665,Who is TrainOSE a subsidiary of?,the Hellenic Railways Organization,"Greece's rail network is estimated to be at 2,...",24,27,"[2, 3, 158, 66, 159, 15, 10]","[160, 103, 161, 162, 3, 163, 99, 151, 142, 164...","[[20, 11, 8], [6, 9], [4, 10, 5, 6, 7, 8, 9, 3...","[[19, 10, 3, 3, 14, 3], [37, 9], [10, 5, 6, 12...","[2, 3, 158, 66, 159, 15, 10, 0, 0, 0, 0, 0, 0,...","[160, 103, 161, 162, 3, 163, 99, 151, 142, 164...","[[20, 11, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[19, 10, 3, 3, 14, 3, 0, 0, 0, 0, 0, 0, 0, 0,..."
16521,What is a first person singular feature of th...,non-ending,"In the Balearic Islands, IEC's standard is use...",79,79,"[124, 3, 66, 210, 211, 212, 213, 15, 4, 214, 1...","[31, 4, 214, 215, 19, 216, 103, 177, 3, 217, 2...","[[20, 11, 5, 4], [6, 9], [5], [17, 6, 10, 9, 4...","[[6, 7], [4, 11, 3], [22, 5, 12, 3, 5, 10, 6, ...","[124, 3, 66, 210, 211, 212, 213, 15, 4, 214, 1...","[31, 4, 214, 215, 19, 216, 103, 177, 3, 217, 2...","[[20, 11, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]..."


## Copying the padded sequences to input data arrays

In [None]:
#creating train input arrays
train_context_word_padded = np.asarray(df_train['context_word_padded'].values.tolist(), dtype=np.int32)
train_question_word_padded = np.asarray(df_train['question_word_padded'].values.tolist(), dtype=np.int32)
test_context_word_padded = np.asarray(df_test['context_word_padded'].values.tolist(), dtype=np.int32)
test_question_word_padded = np.asarray(df_test['question_word_padded'].values.tolist(), dtype=np.int32)

In [None]:
#creating test input arrays
train_context_char_padded = np.asarray(df_train['context_char_padded'].values.tolist())
train_question_char_padded = np.asarray(df_train['question_char_padded'].values.tolist())
test_context_char_padded = np.asarray(df_test['context_char_padded'].values.tolist())
test_question_char_padded = np.asarray(df_test['question_char_padded'].values.tolist())

## Creating Output Arrays

In [None]:
#creating output arrays
num_classes = context_max
y_start_train = keras.utils.to_categorical(df_train['start_word'].values, num_classes)
y_end_train = keras.utils.to_categorical(df_train['end_word'].values, num_classes)

y_start_test = keras.utils.to_categorical(df_test['start_word'].values, num_classes)
y_end_test = keras.utils.to_categorical(df_test['end_word'].values, num_classes)

## Saving all the required variables to disk

In [None]:
#saving word tokenizer
with open('word_tokenizer.pickle', 'wb') as handle:
    pickle.dump(word_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
%cp word_tokenizer.pickle "drive/My Drive/Colab Notebooks/dataset/"

In [None]:
#saving character tokenizer
with open('char_tokenizer.pickle', 'wb') as handle:
    pickle.dump(char_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
%cp char_tokenizer.pickle "drive/My Drive/Colab Notebooks/dataset/"

In [None]:
#saving train input data
save("drive/My Drive/Colab Notebooks/dataset/train_arrays/train_context_word_padded.npy", train_context_word_padded)
save("drive/My Drive/Colab Notebooks/dataset/train_arrays/train_question_word_padded.npy", train_question_word_padded)
save("drive/My Drive/Colab Notebooks/dataset/train_arrays/train_context_char_padded.npy", train_context_char_padded)
save("drive/My Drive/Colab Notebooks/dataset/train_arrays/train_question_char_padded.npy", train_question_char_padded)

In [None]:
#saving test input data
save("drive/My Drive/Colab Notebooks/dataset/test_arrays/test_context_word_padded.npy", test_context_word_padded)
save("drive/My Drive/Colab Notebooks/dataset/test_arrays/test_question_word_padded.npy", test_question_word_padded)
save("drive/My Drive/Colab Notebooks/dataset/test_arrays/test_context_char_padded.npy", test_context_char_padded)
save("drive/My Drive/Colab Notebooks/dataset/test_arrays/test_question_char_padded.npy", test_question_char_padded)

In [None]:
#saving output data
save("drive/My Drive/Colab Notebooks/dataset/train_arrays/y_start_train.npy", y_start_train)
save("drive/My Drive/Colab Notebooks/dataset/train_arrays/y_end_train.npy", y_end_train)
save("drive/My Drive/Colab Notebooks/dataset/test_arrays/y_start_test.npy", y_start_test)
save("drive/My Drive/Colab Notebooks/dataset/test_arrays/y_end_test.npy", y_end_test)