<a href="https://colab.research.google.com/github/the-SQuAD-squad/IR/blob/bert/IR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install wikipedia-api

In [None]:
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('en')
cat = wiki_wiki.page("Category:Physics")
print("Category members: Category:Physics")
for p in cat.categorymembers.values():
  if p.namespace == wikipediaapi.Namespace.CATEGORY:
    # it is category, so you have to make decision
    # if you want to fetch also text from pages that belong
    # to this category
    print(p)
  elif p.namespace == wikipediaapi.Namespace.MAIN:
    # it is page => we can get text
    print(p)
    print(p.text)

In [None]:
def print_categorymembers(categorymembers, level=0, max_level=3):
        for c in categorymembers.values():
            print("%s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
            if c.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
                print_categorymembers(c.categorymembers, level=level + 1, max_level=max_level)

cat = wiki_wiki.page("Category:Artificial intelligence")
print_categorymembers(cat.categorymembers)

In [None]:
wiki_wiki = wikipediaapi.Wikipedia('en')

page_py = wiki_wiki.page('Python_(programming_language)')
print(page_py.text)

In [1]:
#@title Init { form-width: "25%" }
import os
import random
import math
import numpy as np
import tensorflow as tf
import json
import pandas as pd
import re
import string
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers

!pip install tokenizers
from tokenizers import BertWordPieceTokenizer

pd.set_option('display.max_colwidth', -1)

# fix random seeds
seed_value = 42 #@param {type:"integer"}

os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

tf.compat.v1.set_random_seed(seed_value)

session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

# BERT params
max_seq_length = 512
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)


Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 4.1MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.1




In [2]:
#@title df creation { form-width: "25%" }

# the official dataset is identical to the provided one
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O training_set.json

with open("training_set.json", "r") as f:
    json_file = json.load(f)
data = json_file["data"]

rows = []
for document in data:
  for par in document['paragraphs']:
    for qas in par['qas']:
      rows.append({
        'id' : qas['id'],
        'title': document["title"],
        'passage': par['context'],
        'question' : qas['question'],
        'answer_idx' : (qas['answers'][0]['answer_start'], 
                    qas['answers'][0]['answer_start'] + len(qas['answers'][0]['text'])),
        'answer_text' : qas['answers'][0]['text']
      })

df_original = pd.DataFrame(rows)

--2021-02-17 11:11:39--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30288272 (29M) [application/json]
Saving to: ‘training_set.json’


2021-02-17 11:11:40 (53.3 MB/s) - ‘training_set.json’ saved [30288272/30288272]



In [None]:
#@title preprocessing { form-width: "25%" }

import nltk
import re 
import math
import random as rand

def preprocess_text(text):
    """
    Given an iterable containing sentences, pre-process each sentence.

    :param: 
        - text: list of text to be pre-processed (Iterable)
    :return:
        - text: pre-processed text (List)
    """

    REPLACE_WITH_SPACE = re.compile(r"\n") 
    text = [REPLACE_WITH_SPACE.sub(" ", line) for line in text]

    # we don't remove symbols, but just put a space before and after them. We did this because we noticed that Glove contains an embedding also for
    # them, so, in this way, we are able to split these symbols from the text when computing sentence tokens
    text = [re.sub(r"([(.;:!\'ˈ~?,\"(\[\])\\\/\-–\t```<>_#$€@%*+—°′″“”×’^₤₹‘])", r'', line) for line in text]

    # we noticed that in the text sometimes we find numbers and the following word merged together (ex: 1980february),
    # so we put a space between the number and the word
    text = [re.sub(r"(\d+)([a-z]+)", r'\1 \2', line) for line in text] 
    text = [re.sub('\s{2,}', ' ', line.strip()) for line in text]   # replacing more than one consecutive blank spaces with only one of them

    return text


# Creating a copy of the original dataframe (we do this because we want to be able to compare the results of our processing with the original data)
df = df_original.copy()

# pre-process passage and question text
df['passage'] = preprocess_text(df_original['passage'])
df['question'] = preprocess_text(df_original['question'])
df['answer_text'] = preprocess_text(df_original['answer_text'])

# Comparing Original and Pre-Processed
for i in range(3):
    a = rand.randint(0,1000)
    print('ORIGINAL AND PREPROCESSED PASSAGE:')
    print(df_original.iloc[a]['passage'])
    print(df.iloc[a]['passage'])
    
    print()
    print('ORIGINAL AND PREPROCESSED QUESTION:')
    print(df_original.iloc[a]['question'])
    print(df.iloc[a]['question'])
    print()

In [17]:
df["passage"]=df["passage"].str.lower()
df["question"]=df["question"].str.lower()
df["answer_text"]=df["answer_text"].str.lower()

In [64]:
#@title clean dataset { form-width: "25%" }

!gcloud config set project feisty-mechanic-221914
!gsutil cp gs://squad_squad/error_IDs.txt ./error_IDs.txt

with open("error_IDs.txt", "r") as f:
    unwanted_id = f.read()

unwanted_id = unwanted_id.split("\n")[:-1]
df_clean = df.set_index('id')
df_clean = df_clean.drop(unwanted_id)

df_original_clean = df_original.set_index('id')
df_original_clean = df_original_clean.drop(unwanted_id)

Updated property [core/project].
Copying gs://squad_squad/error_IDs.txt...
/ [1 files][  5.7 KiB/  5.7 KiB]                                                
Operation completed over 1 objects/5.7 KiB.                                      


In [65]:
#@title split { form-width: "25%" }

split_value = 0.1 #@param {type:"number"} 
val_dim = int(len(df_clean['title'].unique()) * split_value)
test_titles = np.random.choice(df_clean['title'].unique(), size=val_dim, replace=False)

# creating train and val sets
df_test = df_clean[df_clean['title'].isin(test_titles)]
df_train = df_clean[~(df_clean['title'].isin(test_titles))]

df_original_test = df_original_clean[df_original_clean['title'].isin(test_titles)]
df_original_train = df_original_clean[~(df_original_clean['title'].isin(test_titles))]


In [67]:
from sklearn.feature_extraction.text import  TfidfVectorizer

#passage_dict = {passage : id for passage, id in zip(df_train['passage'],df_train.index)}

passage_dict_prep_train = list(set(df_train['passage'].tolist()))
passage_dict_train = list(set(df_original_train['passage'].tolist()))

vectorizer =  TfidfVectorizer()
# tokenization and creation of Bag of Words representation
passage_tf_idf = vectorizer.fit_transform(passage_dict_prep_train)


In [73]:
passage_dict_prep_test = list(set(df_test['passage'].tolist()))
passage_dict_test = list(set(df_original_test['passage'].tolist()))
passage_dict = {passage : id for passage, id in enumerate(df_test['passage'])}

In [68]:
passage_test = vectorizer.transform(df_test['passage'].tolist())

In [69]:
question_tf_idf = vectorizer.transform(df_test['question'].tolist())

In [45]:
from sklearn.metrics.pairwise import cosine_similarity
results = cosine_similarity(passage_tf_idf,question_tf_idf[1000,:])

In [46]:
np.argmax(results)

994

In [23]:
print(X_tf_idf.shape)

(87364, 95582)


In [71]:
df_clean

Unnamed: 0_level_0,title,passage,question,answer_idx,answer_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5733be284776f41900661182,University_of_Notre_Dame,architecturally the school has a catholic character atop the main buildings gold dome is a golden statue of the virgin mary immediately in front of the main building and facing it is a copper statue of christ with arms upraised with the legend venite ad me omnes next to the main building is the basilica of the sacred heart immediately behind the basilica is the grotto a marian place of prayer and reflection it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858 at the end of the main drive and in a direct line that connects through 3 statues and the gold dome is a simple modern stone statue of mary,to whom did the virgin mary allegedly appear in 1858 in lourdes france,"(515, 541)",saint bernadette soubirous
5733be284776f4190066117f,University_of_Notre_Dame,architecturally the school has a catholic character atop the main buildings gold dome is a golden statue of the virgin mary immediately in front of the main building and facing it is a copper statue of christ with arms upraised with the legend venite ad me omnes next to the main building is the basilica of the sacred heart immediately behind the basilica is the grotto a marian place of prayer and reflection it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858 at the end of the main drive and in a direct line that connects through 3 statues and the gold dome is a simple modern stone statue of mary,what is in front of the notre dame main building,"(188, 213)",a copper statue of christ
5733be284776f41900661180,University_of_Notre_Dame,architecturally the school has a catholic character atop the main buildings gold dome is a golden statue of the virgin mary immediately in front of the main building and facing it is a copper statue of christ with arms upraised with the legend venite ad me omnes next to the main building is the basilica of the sacred heart immediately behind the basilica is the grotto a marian place of prayer and reflection it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858 at the end of the main drive and in a direct line that connects through 3 statues and the gold dome is a simple modern stone statue of mary,the basilica of the sacred heart at notre dame is beside to which structure,"(279, 296)",the main building
5733be284776f41900661181,University_of_Notre_Dame,architecturally the school has a catholic character atop the main buildings gold dome is a golden statue of the virgin mary immediately in front of the main building and facing it is a copper statue of christ with arms upraised with the legend venite ad me omnes next to the main building is the basilica of the sacred heart immediately behind the basilica is the grotto a marian place of prayer and reflection it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858 at the end of the main drive and in a direct line that connects through 3 statues and the gold dome is a simple modern stone statue of mary,what is the grotto at notre dame,"(381, 420)",a marian place of prayer and reflection
5733be284776f4190066117e,University_of_Notre_Dame,architecturally the school has a catholic character atop the main buildings gold dome is a golden statue of the virgin mary immediately in front of the main building and facing it is a copper statue of christ with arms upraised with the legend venite ad me omnes next to the main building is the basilica of the sacred heart immediately behind the basilica is the grotto a marian place of prayer and reflection it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to saint bernadette soubirous in 1858 at the end of the main drive and in a direct line that connects through 3 statues and the gold dome is a simple modern stone statue of mary,what sits on top of the main building at notre dame,"(92, 126)",a golden statue of the virgin mary
...,...,...,...,...,...
5735d259012e2f140011a09d,Kathmandu,kathmandu metropolitan city kmc in order to promote international relations has established an international relations secretariat irc kmcs first international relationship was established in 1975 with the city of eugene oregon united states this activity has been further enhanced by establishing formal relationships with 8 other cities motsumoto city of japan rochester of the usa yangon formerly rangoon of myanmar xian of the peoples republic of china minsk of belarus and pyongyang of the democratic republic of korea kmcs constant endeavor is to enhance its interaction with saarc countries other international agencies and many other major cities of the world to achieve better urban management and developmental programs for kathmandu,in what us state did kathmandu first establish an international relationship,"(229, 235)",oregon
5735d259012e2f140011a09e,Kathmandu,kathmandu metropolitan city kmc in order to promote international relations has established an international relations secretariat irc kmcs first international relationship was established in 1975 with the city of eugene oregon united states this activity has been further enhanced by establishing formal relationships with 8 other cities motsumoto city of japan rochester of the usa yangon formerly rangoon of myanmar xian of the peoples republic of china minsk of belarus and pyongyang of the democratic republic of korea kmcs constant endeavor is to enhance its interaction with saarc countries other international agencies and many other major cities of the world to achieve better urban management and developmental programs for kathmandu,what was yangon previously known as,"(414, 421)",rangoon
5735d259012e2f140011a09f,Kathmandu,kathmandu metropolitan city kmc in order to promote international relations has established an international relations secretariat irc kmcs first international relationship was established in 1975 with the city of eugene oregon united states this activity has been further enhanced by establishing formal relationships with 8 other cities motsumoto city of japan rochester of the usa yangon formerly rangoon of myanmar xian of the peoples republic of china minsk of belarus and pyongyang of the democratic republic of korea kmcs constant endeavor is to enhance its interaction with saarc countries other international agencies and many other major cities of the world to achieve better urban management and developmental programs for kathmandu,with what belorussian city does kathmandu have a relationship,"(476, 481)",minsk
5735d259012e2f140011a0a0,Kathmandu,kathmandu metropolitan city kmc in order to promote international relations has established an international relations secretariat irc kmcs first international relationship was established in 1975 with the city of eugene oregon united states this activity has been further enhanced by establishing formal relationships with 8 other cities motsumoto city of japan rochester of the usa yangon formerly rangoon of myanmar xian of the peoples republic of china minsk of belarus and pyongyang of the democratic republic of korea kmcs constant endeavor is to enhance its interaction with saarc countries other international agencies and many other major cities of the world to achieve better urban management and developmental programs for kathmandu,in what year did kathmandu create its initial international relationship,"(199, 203)",1975
