In [None]:
# Upload the SQuAD 1.1 archive (squad1.1.zip) into the Colab runtime.
from google.colab import files
uploaded = files.upload()

Saving squad1.1.zip to squad1.1.zip


In [None]:
!unzip squad1.1.zip

Archive:  squad1.1.zip
  inflating: dev-v1.1.json           
  inflating: train-v1.1.json         


# Loading and inspecting files

In [None]:
import pandas as pd

# Each JSON file becomes a frame with one row per article: `data` holds the
# article dict (title + paragraphs) and `version` the dataset version.
dev_df = pd.read_json('/content/dev-v1.1.json')
train_df = pd.read_json('/content/train-v1.1.json')

In [None]:
# Peek at the first rows of the dev split.
dev_df.head()

Unnamed: 0,data,version
0,"{'title': 'Super_Bowl_50', 'paragraphs': [{'co...",1.1
1,"{'title': 'Warsaw', 'paragraphs': [{'context':...",1.1
2,"{'title': 'Normans', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Nikola_Tesla', 'paragraphs': [{'con...",1.1
4,"{'title': 'Computational_complexity_theory', '...",1.1


In [None]:
# Peek at the first rows of the train split.
train_df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [None]:
# NOTE: DataFrame.size counts cells (rows x columns), not rows. With the two
# columns of these frames (`data`, `version`), the number of articles is half
# of each printed figure.
print(f'Size of train df: {train_df.size}\nSize of dev df: {dev_df.size}')

Size of train df: 884
Size of dev df: 96


In [None]:
# First article of the training split: the `data` cell of row 0, fetched with
# a single .loc lookup instead of chained indexing.
first_entry_dict = train_df.loc[0, 'data']

In [None]:
# Paragraphs of the first article; each one carries a `context` and its `qas`.
first_entry_paragraph_list = first_entry_dict['paragraphs']
len(first_entry_paragraph_list)

55

In [None]:
# Inspect one paragraph to see the context / question / answer layout.
first_entry_paragraph_list[2]

{'context': 'The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.',
 'qas': [{'answers': [{'answer_start': 119, 'text': 'Rome'}],
   'question': 'Where is the headquarters of the Congregation of the Holy Cross?',
   'id': '5733bed24776f41900661188'},
  {'answers': [{'answer_start': 145, 'text': 'Moreau Seminary'}],
   'question': 'What is the pri

# First attempt
Simply using an off-the-shelf package with default settings.

Working with `dev_df`, which is smaller.

Looping over the paragraphs of the first entry and collecting their contexts (the documents) in a list.

In [None]:
# Collect the context of every paragraph in the first dev article.
# The `.iloc[:1]` slice restricts the corpus to that single article.
docs = [
    paragraph['context']
    for article in dev_df['data'].iloc[:1]
    for paragraph in article['paragraphs']
]

In [None]:
# Use the first question of the first paragraph as a smoke-test query.
first_paragraph = dev_df.loc[0, 'data']['paragraphs'][0]
test_query = first_paragraph['qas'][0]['question']
print(test_query)

Which NFL team represented the AFC at Super Bowl 50?


In [None]:
# Spot-check one of the collected contexts.
docs[1]

'The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona Cardinals 49–15 in the NFC Championship Game and advanced to their second Super Bowl appearance since the franchise was founded in 1995. The Broncos finished the regular season with a 12–4 record, and denied the New England Patriots a chance to defend their title from Super Bowl XLIX by defeating them 20–18 in the AFC Championship Game. They joined the Patriots, Dallas Cowboys, and Pittsburgh Steelers as one of four teams that have made eight appearances in the Super Bowl.'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    Score a single query against every document with TF-IDF cosine similarity.

    vectorizer: already-fitted TfidfVectorizer, used to embed the query
    docs_tfidf: TF-IDF matrix of the documents, one row per document
    query: query text

    return: 1-D array of cosine similarities, one entry per row of docs_tfidf
    """
    # transform() expects an iterable of documents, hence the one-element list.
    query_vector = vectorizer.transform([query])
    # The (1, n_docs) similarity matrix is flattened into a plain vector.
    return cosine_similarity(query_vector, docs_tfidf).flatten()

# Fit TF-IDF on the collected contexts with default settings, then score the
# test query against each of them.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
similarities = get_tf_idf_query_similarity(vectorizer, X, test_query)
similarities

array([0.27543024, 0.1759565 , 0.18249404, 0.20133135, 0.09559901,
       0.02674949, 0.1145666 , 0.15295553, 0.1686911 , 0.11432257,
       0.06915032, 0.13653447, 0.17279905, 0.05323222, 0.14545621,
       0.03609297, 0.12941624, 0.04075817, 0.13262051, 0.11675527,
       0.19683775, 0.10714899, 0.22965394, 0.12058915, 0.18740012,
       0.15565143, 0.12038791, 0.10284747, 0.12416753, 0.15133524,
       0.14467454, 0.15670718, 0.13241497, 0.05208188, 0.04342778,
       0.05745362, 0.11678462, 0.04991092, 0.0380399 , 0.0132084 ,
       0.08025308, 0.02072934, 0.13530058, 0.00702467, 0.07187037,
       0.02921069, 0.14580041, 0.03538003, 0.06730245, 0.04144487,
       0.09109213, 0.02710087, 0.02734329, 0.21146647])

In [None]:
# Index of the context most similar to the test query.
# Uses the array's own argmax() because `numpy` is only imported in a later
# cell; `np.argmax` here raises NameError on a fresh Restart & Run All.
similarities.argmax()

0

To test this approach, I want to check if the information retrieval is better than random.

In [None]:
# Random-guess baseline: picking one of the N contexts uniformly at random is
# correct with probability 1/N.
num_docs = X.shape[0]
print(f"There are {num_docs} documents.")
print(f"To be better than random, the algo has to match at least {1/num_docs * 100:.2f}% of the questions to the correct document.")

There are 54 documents.
To be better than random, the algo has to match at least 1.85% of the questions to the correct document.


In [None]:
import numpy as np

In [None]:
# Gather every question of the first dev article. A question's label is the
# index of the paragraph it belongs to, which matches that paragraph's
# position in `docs`.
questions, labels = [], []
for article in dev_df['data'].iloc[:1]:
    for paragraph_idx, paragraph in enumerate(article['paragraphs']):
        paragraph_questions = [qa['question'] for qa in paragraph['qas']]
        questions.extend(paragraph_questions)
        labels.extend([paragraph_idx] * len(paragraph_questions))
labels = np.asarray(labels)

In [None]:
# For each question, predict the index of the context with the highest
# TF-IDF cosine similarity.
output = [
    np.argmax(get_tf_idf_query_similarity(vectorizer, X, question))
    for question in questions
]

In [None]:
# Convert to an array so predictions can be compared element-wise with `labels`.
output = np.asarray(output)

In [None]:
# Top-1 retrieval accuracy: share of questions whose best-scoring context is
# the paragraph the question was written for.
np.count_nonzero(output == labels) / len(output)

0.5703703703703704