<a href="https://colab.research.google.com/github/the-SQuAD-squad/IR-QA/blob/main/QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Init { form-width: "25%" }
import os
import random
import math
import numpy as np
import tensorflow as tf

# Fix the seeds of every random number generator used in the notebook (Python, NumPy, TensorFlow)
seed_value = 42 #@param {type:"integer"}

# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it here
# presumably only affects processes spawned afterwards, not this kernel - TODO confirm
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

tf.compat.v1.set_random_seed(seed_value)

# Limit TensorFlow to one thread for both intra-op and inter-op parallelism
# (presumably to avoid run-to-run differences caused by parallel execution) and
# register the resulting session as the one used by the Keras backend
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)


In [2]:
#@title df creation { form-width: "25%" }
import json
import pandas as pd

# dataset is copyed to public git repo for fast access within colab 
!wget 'https://raw.githubusercontent.com/the-SQuAD-squad/data/main/SQuAD/squad1.1.zip'
!unzip squad1.1.zip

# Load the training file. The encoding is passed explicitly because JSON text is UTF-8,
# while open() would otherwise fall back to the platform's locale encoding.
with open("training_set.json", "r", encoding="utf-8") as f:
    json_file = json.load(f)
data = json_file["data"]

# Flatten the nested documents -> paragraphs -> questions structure into one record per question
rows = []
for document in data:
  for par in document['paragraphs']:
    for qas in par['qas']:
      # only the first listed answer of each question is kept
      first_answer = qas['answers'][0]
      answer_start = first_answer['answer_start']
      rows.append({
        'id' : qas['id'],
        'title': document["title"],
        'passage': par['context'],
        'question' : qas['question'],
        # (start, end) pair: the dataset's answer_start and answer_start + length of the answer text
        'answer_idx' : (answer_start, answer_start + len(first_answer['text'])),
        'answer_text' : first_answer['text']
      })

df_original = pd.DataFrame(rows)

--2021-01-22 10:49:59--  https://raw.githubusercontent.com/the-SQuAD-squad/data/main/SQuAD/squad1.1.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7638822 (7.3M) [application/zip]
Saving to: ‘squad1.1.zip’


2021-01-22 10:50:00 (48.3 MB/s) - ‘squad1.1.zip’ saved [7638822/7638822]

Archive:  squad1.1.zip
  inflating: training_set.json       


In [3]:
# Preview of the raw dataframe (one row per question)
df_original

Unnamed: 0,id,title,passage,question,answer_idx,answer_text
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"(515, 541)",Saint Bernadette Soubirous
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"(188, 213)",a copper statue of Christ
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"(279, 296)",the Main Building
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,"(381, 420)",a Marian place of prayer and reflection
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,"(92, 126)",a golden statue of the Virgin Mary
...,...,...,...,...,...,...
87594,5735d259012e2f140011a09d,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,"(229, 235)",Oregon
87595,5735d259012e2f140011a09e,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,"(414, 421)",Rangoon
87596,5735d259012e2f140011a09f,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,"(476, 481)",Minsk
87597,5735d259012e2f140011a0a0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,"(199, 203)",1975


In [4]:
import nltk
import re 
import math

# The patterns are compiled once here instead of on every call of preprocess_text.
_NEWLINE_RE = re.compile(r"\n")
# we don't remove symbols, but just put a space before and after them. We did this because we noticed that Glove contains an embedding also for
# them, so, in this way, we are able to split these symbols from the text when computing sentence tokens
_SYMBOL_RE = re.compile(r"([(.;:!\'ˈ~?,\"(\[\])\\\/\-–\t```<>_#$€@%*+—°′″“”×’^₤₹‘])")
# we noticed that in the text sometimes we find numbers and the following word merged together (ex: 1980february),
# so we put a space between the number and the word
_DIGITS_THEN_LETTERS_RE = re.compile(r"(\d+)([a-z]+)")
# two or more consecutive whitespace characters (raw string: '\s' is not a valid escape in a normal string literal)
_MULTIPLE_WHITESPACE_RE = re.compile(r"\s{2,}")


def preprocess_text(text):
    """
    Given an iterable containing sentences, pre-process each sentence.

    Every sentence is lower-cased, its newlines are replaced by spaces, the symbols
    matched by _SYMBOL_RE are surrounded by spaces, a space is inserted between a
    number and the letters directly following it, and finally leading/trailing
    whitespace is stripped and runs of whitespace are collapsed to a single space.

    :param: 
        - text: list of text to be pre-processed (Iterable)
    :return:
        - text: pre-processed text (List)
    """
    processed = []
    for line in text:
        line = _NEWLINE_RE.sub(" ", line.lower())
        line = _SYMBOL_RE.sub(r' \1 ', line)
        line = _DIGITS_THEN_LETTERS_RE.sub(r'\1 \2', line)
        # replacing more than one consecutive blank spaces with only one of them
        line = _MULTIPLE_WHITESPACE_RE.sub(' ', line.strip())
        processed.append(line)

    return processed


# Work on a copy, so that df_original keeps the untouched data and the effect of the pre-processing can be inspected
df = df_original.copy()

# pre-process the passage, question and answer text columns
for column in ('passage', 'question', 'answer_text'):
    df[column] = preprocess_text(df_original[column])

In [5]:
import random as rand
# Show a few randomly picked rows before and after pre-processing
for _ in range(3):
    row = rand.randint(0,1000)
    for column, header in (('passage', 'ORIGINAL AND PREPROCESSED PASSAGE:'),
                           ('question', 'ORIGINAL AND PREPROCESSED QUESTION:')):
        print(header)
        print(df_original.iloc[row][column])
        print(df.iloc[row][column])
        print()


ORIGINAL AND PREPROCESSED PASSAGE:
In 2015 Beyoncé signed an open letter which the ONE Campaign had been collecting signatures for; the letter was addressed to Angela Merkel and Nkosazana Dlamini-Zuma, urging them to focus on women as they serve as the head of the G7 in Germany and the AU in South Africa respectively, which will start to set the priorities in development funding before a main UN summit in September 2015 that will establish new development goals for the generation.
in 2015 beyoncé signed an open letter which the one campaign had been collecting signatures for ; the letter was addressed to angela merkel and nkosazana dlamini - zuma , urging them to focus on women as they serve as the head of the g7 in germany and the au in south africa respectively , which will start to set the priorities in development funding before a main un summit in september 2015 that will establish new development goals for the generation .

ORIGINAL AND PREPROCESSED QUESTION:
An important UN summ

In [6]:
import numpy as np

def build_vocabulary(text):
    """
    Builds the vocabulary of a list of words, together with the two lookup tables between words and indices.

    :param: 
        - text: words from which the vocabulary is built, repetitions allowed (List)
    :return:
        - idx2word: index to word mapping (Dict)
        - word2idx: word to index mapping (Dict)
        - set_vocab: the unique words in sorted order, preceded by the '<PAD>' token (List)
    """

    # The padding token is placed first, so it gets index 0; duplicates are removed through a set
    set_vocab = ['<PAD>']
    set_vocab.extend(sorted(set(text)))

    # Both mappings follow the position of each word inside set_vocab
    idx2word = dict(enumerate(set_vocab))
    word2idx = dict(zip(set_vocab, range(len(set_vocab))))

    return idx2word, word2idx, set_vocab

# Collect all the words of the pre-processed passages and questions in one flat list
all_sentences = np.concatenate((df['passage'], df['question']))
text = ' '.join(all_sentences).split(' ')
# Displaying first 100 words
print(text[:100])
# Build the vocabulary and the two word <-> index mappings from that list
idx_to_word, word_to_idx, word_listing = build_vocabulary(text)

print('[Debug] Index -> Word vocabulary size: {}'.format(len(idx_to_word)))
print('[Debug] Word -> Index vocabulary size: {}'.format(len(word_to_idx)))
first_entries = [(idx_to_word[idx], idx) for idx in np.arange(100)]
print('[Debug] Some words: {}'.format(first_entries))

['architecturally', ',', 'the', 'school', 'has', 'a', 'catholic', 'character', '.', 'atop', 'the', 'main', 'building', "'", 's', 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'virgin', 'mary', '.', 'immediately', 'in', 'front', 'of', 'the', 'main', 'building', 'and', 'facing', 'it', ',', 'is', 'a', 'copper', 'statue', 'of', 'christ', 'with', 'arms', 'upraised', 'with', 'the', 'legend', '"', 'venite', 'ad', 'me', 'omnes', '"', '.', 'next', 'to', 'the', 'main', 'building', 'is', 'the', 'basilica', 'of', 'the', 'sacred', 'heart', '.', 'immediately', 'behind', 'the', 'basilica', 'is', 'the', 'grotto', ',', 'a', 'marian', 'place', 'of', 'prayer', 'and', 'reflection', '.', 'it', 'is', 'a', 'replica', 'of', 'the', 'grotto', 'at', 'lourdes', ',', 'france', 'where', 'the', 'virgin', 'mary']
[Debug] Index -> Word vocabulary size: 82471
[Debug] Word -> Index vocabulary size: 82471
[Debug] Some words: [('<PAD>', 0), ('!', 1), ('"', 2), ('#', 3), ('$', 4), ('%', 5), ('&', 6), ("'", 7)

In [7]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type, embedding_dimension=50):
    """
    Downloads/loads a pre-trained word embedding model through the gensim downloader.

    :params:
        - model_type: name of the word embedding model to load ('word2vec' or 'glove',
          case and surrounding whitespace are ignored).
        - embedding_dimension: size of the embedding space to consider (only used
          to compose the Glove model name)

    :return:
        - pre-trained word embedding model (gensim KeyedVectors object)

    :raises:
        - AttributeError: if model_type is neither word2vec nor glove
        - ValueError: re-raised from the downloader when the model name is not valid
    """

    requested = model_type.strip().lower()

    # gensim-data names of the supported models
    known_models = {
        'word2vec': "word2vec-google-news-300",
        'glove': "glove-wiki-gigaword-{}".format(embedding_dimension),
    }

    if requested not in known_models:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove")

    # Print a hint about the valid dimensions before propagating a failed lookup
    try:
        return gloader.load(known_models[requested])
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e


def check_OOV_terms(embedding_model, word_listing):
    """
    Finds the out-of-vocabulary (OOV) terms, i.e. the words of the dataset
    vocabulary for which the pre-trained embedding model has no entry.

    :params:
        - embedding_model: pre-trained word embedding model (gensim wrapper);
          only membership tests (`word in embedding_model`) are performed on it
        - word_listing: dataset specific vocabulary (list)

    :return:
        - list of OOV terms, in the same order as they appear in word_listing
    """
    return [word for word in word_listing if word not in embedding_model]


def build_embedding_matrix_w_random(embedding_model, embedding_dimension, word_to_idx, oov_terms):
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model.
    OOV terms get a random vector drawn with np.random.rand, every other word gets
    its vector from the embedding model.

    :params: 
        - embedding_model: pre-trained word embedding model (gensim wrapper)
        - embedding_dimension: length of the random vectors generated for OOV terms (int)
        - word_to_idx: vocabulary map (word -> index) (dict)
        - oov_terms: list of OOV terms (list)

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d).
          Rows follow the iteration order of word_to_idx, so this assumes the dict
          is ordered by index (as the one returned by build_vocabulary is).
    """
    # Membership tests on a set are constant time; testing against the list made
    # the loop quadratic in the vocabulary size
    oov_lookup = set(oov_terms)

    embedding_matrix = [
        np.random.rand(embedding_dimension) if word in oov_lookup else embedding_model[word]
        for word in word_to_idx
    ]
    return np.array(embedding_matrix)

# we used Glove with embedding dimension 100 for our final tests
embedding_model_type = "Glove"
embedding_dimension = 100
# Load the pre-trained vectors through the gensim downloader (see load_embedding_model)
embedding_model = load_embedding_model(embedding_model_type, embedding_dimension)



In [8]:
# Count the vocabulary terms that have no pre-trained embedding
oov_terms = check_OOV_terms(embedding_model, word_listing)
oov_percentage = len(oov_terms)/len(word_listing)*100

print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_terms), oov_percentage))

# One row per vocabulary word; OOV words receive a random vector
embedding_matrix = build_embedding_matrix_w_random(embedding_model, embedding_dimension, word_to_idx, oov_terms)

print("Embedding matrix shape: {}".format(embedding_matrix.shape))

Total OOV terms: 15888 (19.26%)
Embedding matrix shape: (82471, 100)


In [None]:
# Dump every OOV term: this was useful to understand if we could improve pre-processing.
# NOTE(review): the list holds thousands of terms - consider printing only a slice of it
print(oov_terms)   # this was useful to understand if we could improve pre-processing

['<PAD>', '0000222556', '0018', '0028', '0029670', '0042', '0043', '0054', '0065', '0071', '0079', '00794', '0243', '0307', '0358', '036', '042', '057', '058', '05946', '062', '064', '066', '067', '069', '073', '078', '079', '083', '085', '088', '096', '097', '10217', '10925', '10¢', '10−10', '10−12', '10−19', '10−2', '10−3', '10−34', '10−6', '10−8', '10−9', '1101010', '11092', '11114', '11172', '11246', '12232', '12291', '1234567', '12750', '131\u202f000', '13500', '13526', '13818', '14−17', '15408', '16041', '16384', '177847', '17½', '17\u202f000', '18578', '1881−82', '19˚n', '19˚s', '1\u202f800', '1⁄10', '1⁄14', '1⁄2', '1⁄3', '1⁄4', '1⁄8', '20011', '20012', '20015', '2002…', '2013−14', '20147', '20740', '20\u202f000', '21⁄32', '22300', '230000', '230873', '23456789', '236\u2009§\u20094', '236\u2009§\u20095', '236\u2009§\u20098', '2372', '237\u2009§\u200910', '237\u2009§\u200914', '238\u2009§\u200918', '24000', '24219', '24237', '2426', '2468', '24722', '2491', '2532', '2562', '2586'

In [9]:
#@title split { form-width: "25%" }

split_value = 0.1 #@param {type:"number"} 

# The split is done on titles: a fraction split_value of the distinct titles is
# drawn at random (without repetition) for validation
unique_titles = df['title'].unique()
val_dim = int(len(unique_titles) * split_value)

val_titles = np.random.choice(unique_titles, size=val_dim, replace=False)

In [None]:
# Questions whose pre-processed text has at most this many characters are discarded
SHORT_QUESTION_MAX_CHARS = 10

# Positions of the rows that have to be removed from the dataset
unwanted_rows = set()

# Flag the rows whose answer contains a token that is missing from the vocabulary
# (the vocabulary is built from passages and questions only). An explicit
# membership test is used instead of a bare `except`, which would also have
# swallowed unrelated errors such as KeyboardInterrupt.
for i,sentence in enumerate(df['answer_text']):
    for el in sentence.split():
        if el not in word_to_idx:
            print(df_original.iloc[i]) 
            print(df_original['passage'][i])
            print()
            unwanted_rows.add(i)

# Flag the rows whose question is too short
for i,question in enumerate(df['question']):
    if len(question)<=SHORT_QUESTION_MAX_CHARS:
        print(df_original.iloc[i]) 
        print(df_original['passage'][i])
        print()
        unwanted_rows.add(i)

In [41]:
# Preview of the pre-processed dataframe
df

Unnamed: 0,id,title,passage,question,answer_idx,answer_text
0,5733be284776f41900661182,University_of_Notre_Dame,"architecturally , the school has a catholic ch...",to whom did the virgin mary allegedly appear i...,"(515, 541)",saint bernadette soubirous
1,5733be284776f4190066117f,University_of_Notre_Dame,"architecturally , the school has a catholic ch...",what is in front of the notre dame main buildi...,"(188, 213)",a copper statue of christ
2,5733be284776f41900661180,University_of_Notre_Dame,"architecturally , the school has a catholic ch...",the basilica of the sacred heart at notre dame...,"(279, 296)",the main building
3,5733be284776f41900661181,University_of_Notre_Dame,"architecturally , the school has a catholic ch...",what is the grotto at notre dame ?,"(381, 420)",a marian place of prayer and reflection
4,5733be284776f4190066117e,University_of_Notre_Dame,"architecturally , the school has a catholic ch...",what sits on top of the main building at notre...,"(92, 126)",a golden statue of the virgin mary
...,...,...,...,...,...,...
87594,5735d259012e2f140011a09d,Kathmandu,"kathmandu metropolitan city ( kmc ) , in order...",in what us state did kathmandu first establish...,"(229, 235)",oregon
87595,5735d259012e2f140011a09e,Kathmandu,"kathmandu metropolitan city ( kmc ) , in order...",what was yangon previously known as ?,"(414, 421)",rangoon
87596,5735d259012e2f140011a09f,Kathmandu,"kathmandu metropolitan city ( kmc ) , in order...",with what belorussian city does kathmandu have...,"(476, 481)",minsk
87597,5735d259012e2f140011a0a0,Kathmandu,"kathmandu metropolitan city ( kmc ) , in order...",in what year did kathmandu create its initial ...,"(199, 203)",1975


In [None]:
# Remove the rows collected in unwanted_rows and renumber the index from 0
df_clean = df.drop(index=list(unwanted_rows)).reset_index(drop=True)
df_clean

In [70]:
# Rows whose title was drawn for validation go to df_val, all the others to df_train
is_val_row = df_clean['title'].isin(val_titles)
df_val = df_clean[is_val_row].reset_index(drop=True)
df_train = df_clean[~is_val_row].reset_index(drop=True)

In [75]:
# PADDING
# Sentences are encoded as lists of vocabulary indices and padded at the end up to a fixed length.
# The maximum lengths are measured in tokens (not characters), because maxlen in pad_sequences
# counts sequence elements; they are taken over df_clean (train + val), so no sequence is truncated.

def _encode_sentences(sentences):
    """Maps every whitespace-separated token of every sentence to its vocabulary index."""
    return [[word_to_idx[el] for el in sentence.split()] for sentence in sentences]


def _pad_sequences(sequences, max_length):
    """Pads the index sequences at the end up to max_length elements."""
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')


MAX_LENGTH_PASSAGE = max(len(sentence.split()) for sentence in df_clean['passage'])
MAX_LENGTH_QUESTION = max(len(sentence.split()) for sentence in df_clean['question'])

train_passage = _encode_sentences(df_train['passage'])  # train passages as index lists
train_passage_pad = _pad_sequences(train_passage, MAX_LENGTH_PASSAGE)

train_question = _encode_sentences(df_train['question'])  # train questions as index lists
train_question_pad = _pad_sequences(train_question, MAX_LENGTH_QUESTION)

val_passage = _encode_sentences(df_val['passage'])  # val passages as index lists
val_passage_pad = _pad_sequences(val_passage, MAX_LENGTH_PASSAGE)

val_question = _encode_sentences(df_val['question'])  # val questions as index lists
val_question_pad = _pad_sequences(val_question, MAX_LENGTH_QUESTION)

In [None]:
# Problem: the answer text can occur more than once in the passage, so matching it against the passage does not identify a unique answer span.
# train_answer = [[word_to_idx[el] for el in sentence.split()] for sentence in df_train['answer_text']]


# for i,answer in enumerate(train_answer):
#     answer = np.array(answer)
#     passage = np.array(train_passage[i])
#     start_index = [i for i in range(len(passage)-len(answer)+1) if (answer==passage[i:i+len(answer)]).all()]
#     if len(start_index)> 1:
#         print(start_index)
#         for el in start_index:
#             print(idx_to_word[passage[el]])
#     label = (start_index[0],start_index[0]+len(answer))
#     df_train['answer_idx'][i] = label

In [None]:
# Encode every training answer as a list of vocabulary indices
train_answer = [[word_to_idx[el] for el in sentence.split()] for sentence in df_train['answer_text']]

# NOTE(review): this loop only counts the spaces of each passage and discards the result,
# and answer_idx is never used - it looks unfinished (presumably the start of a
# conversion of answer_idx into token positions) - TODO confirm or remove
for i,answer_idx in enumerate(df_train['answer_idx']):
    df_train['passage'][i].count(' ')

In [45]:
# Encode every validation answer as a list of vocabulary indices
val_answer = [[word_to_idx[token] for token in answer.split()] for answer in df_val['answer_text']]