In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

In [21]:
from scipy.sparse import csr_matrix

In [2]:
from google.colab import files
uploaded = files.upload()

In [3]:
!unzip squad1.1.zipn

Archive:  squad1.1.zip
replace dev-v1.1.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# Loading and inspecting files

In [23]:
import pandas as pd

dev_df = pd.read_json('/content/dev-v1.1.json')
train_df = pd.read_json('/content/train-v1.1.json')

In [4]:
dev_df.head()

Unnamed: 0,data,version
0,"{'title': 'Super_Bowl_50', 'paragraphs': [{'co...",1.1
1,"{'title': 'Warsaw', 'paragraphs': [{'context':...",1.1
2,"{'title': 'Normans', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Nikola_Tesla', 'paragraphs': [{'con...",1.1
4,"{'title': 'Computational_complexity_theory', '...",1.1


In [5]:
train_df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [None]:
print(f'Size of train df: {train_df.size}\nSize of dev df: {dev_df.size}')

Size of train df: 884
Size of dev df: 96


In [None]:
first_entry_dict = train_df.loc[0]['data']

In [None]:
first_entry_paragraph_list = first_entry_dict['paragraphs']
len(first_entry_paragraph_list)

55

In [None]:
first_entry_paragraph_list[2]

{'context': 'The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.',
 'qas': [{'answers': [{'answer_start': 119, 'text': 'Rome'}],
   'question': 'Where is the headquarters of the Congregation of the Holy Cross?',
   'id': '5733bed24776f41900661188'},
  {'answers': [{'answer_start': 145, 'text': 'Moreau Seminary'}],
   'question': 'What is the pri

### Utilities

In [10]:
def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosine_similarities

# First attempt
Simply using off the shelf package with default settings.

Working with `dev df` which is smaller.

Looping over entries and collecting all documents to a list.

In [None]:
docs = []

# Take all contexts from the first entry
for row in dev_df.iterrows():
    for context_container in row[1]['data']['paragraphs']:
        docs.append(context_container['context'])
    break

In [None]:
docs[1]

'The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona Cardinals 49–15 in the NFC Championship Game and advanced to their second Super Bowl appearance since the franchise was founded in 1995. The Broncos finished the regular season with a 12–4 record, and denied the New England Patriots a chance to defend their title from Super Bowl XLIX by defeating them 20–18 in the AFC Championship Game. They joined the Patriots, Dallas Cowboys, and Pittsburgh Steelers as one of four teams that have made eight appearances in the Super Bowl.'

In [None]:
test_query = dev_df.loc[0]['data']['paragraphs'][0]['qas'][0]['question']
print(test_query)

Which NFL team represented the AFC at Super Bowl 50?


In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
similarities = get_tf_idf_query_similarity(vectorizer, X, test_query)
similarities

array([0.27543024, 0.1759565 , 0.18249404, 0.20133135, 0.09559901,
       0.02674949, 0.1145666 , 0.15295553, 0.1686911 , 0.11432257,
       0.06915032, 0.13653447, 0.17279905, 0.05323222, 0.14545621,
       0.03609297, 0.12941624, 0.04075817, 0.13262051, 0.11675527,
       0.19683775, 0.10714899, 0.22965394, 0.12058915, 0.18740012,
       0.15565143, 0.12038791, 0.10284747, 0.12416753, 0.15133524,
       0.14467454, 0.15670718, 0.13241497, 0.05208188, 0.04342778,
       0.05745362, 0.11678462, 0.04991092, 0.0380399 , 0.0132084 ,
       0.08025308, 0.02072934, 0.13530058, 0.00702467, 0.07187037,
       0.02921069, 0.14580041, 0.03538003, 0.06730245, 0.04144487,
       0.09109213, 0.02710087, 0.02734329, 0.21146647])

In [None]:
np.argmax(similarities)

0

To test this approach, I want to check if the information retrieval is better than random.

In [None]:
num_docs = X.shape[0]
print(f"There are {num_docs} documents.")
print(f"To be better than random, the algo has to match at least {1/num_docs * 100:.2f}% of the questions to the correct document.")

There are 54 documents.
To be better than random, the algo has to match at least 1.85% of the questions to the correct document.


In [None]:
# Take all questions from all contexts of the first entry
questions, labels = [], []
for row in dev_df.iterrows():
    for label, paragraph in enumerate(row[1]["data"]['paragraphs']):
        for qa_block in paragraph['qas']:
            labels.append(label)
            questions.append(qa_block['question'])
    break
# questions = np.asarray(questions)
labels = np.asarray(labels)

In [None]:
output = []
for question in questions:
    similarities_to_all_contexts = get_tf_idf_query_similarity(vectorizer, X, question)
    most_similar_context = np.argmax(similarities_to_all_contexts)
    output.append(most_similar_context)

In [None]:
output = np.asarray(output)

In [None]:
np.count_nonzero(np.equal(output, labels)) / output.size

0.5703703703703704

# Scaling implementation to entire dataset
Steps:
1. Collect all documents in a list
2. Create TF-IDF matrix from doc list
3. Collect all questions and their matching label to two lists
4. Compute the vector representations for all questions
5. Compute cosine similarities between each question representation and the document tf-idf matrix
6. Take the index of the max value for each row of similarities
7. Compute accuracies

In [24]:
docs = []

df = train_df

# 1.
for row in df.iterrows():
    for context_container in row[1]['data']['paragraphs']:
        docs.append(context_container['context'])

In [25]:
# 2.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

In [26]:
# 3.
questions, labels = [], []
# Each row contains a set of paragraphs.
# The outer for loop iterates through the rows
for row_num, row in enumerate(df.iterrows()):
    # This for iterates through the paragraphs contained in a single row
    for paragraph_num, paragraph in enumerate(row[1]["data"]['paragraphs']):
        # Each paragraph has multiple associated questions.
        # The for loop iterates through them and collects them into a list.
        # For each question, the number of the paragraph is saved to another list
        for qa_block in paragraph['qas']:
            labels.append(row_num + paragraph_num)
            questions.append(qa_block['question'])
labels = np.asarray(labels)

In [None]:
# 4. 5. 6.
output = []
for question in tqdm(questions):
    similarities_to_all_contexts = get_tf_idf_query_similarity(vectorizer, X, question)
    most_similar_context = np.argmax(similarities_to_all_contexts)
    output.append(most_similar_context)

The above looping would take 1h 30 mins to find the best matching result for all 80.000 questions contained in the training set.
A simple heuristic could speed up the process: group all paragraphs from the same entry into a single text. When the right text has been found, discriminate the right paragraph within that text.

Maybe tuning the parameters of the tf idf implementation can help. Let's check the performance increase from stopword removal.

In [27]:
vectorizer_without_stop_words = TfidfVectorizer(stop_words='english')
X_without_stop_words = vectorizer_without_stop_words.fit_transform(docs)

Let's compare the sizes of the two TF IDF matrices.

In [28]:
print(f"Original tf idf matrix size:{X.shape}\nTf idf without stop words:{X_without_stop_words.shape}")

Original tf idf matrix size:(18896, 78537)
Tf idf without stop words:(18896, 78230)


## Optimizing what works
The default english stop word parameter does not seem to make a big difference. Only about 300 words got removed.

Since we are working with matrices, we could leverage linear algebra in place of slow for loops to speed up computations.

In [None]:
def get_tf_idf_bulk_similarities(vectorizer, docs_tfidf, queries_list):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform(queries_list)
    cosine_similarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosine_similarities

In [27]:
vectorized_questions = vectorizer.transform(questions)

In [8]:
print(f"Vectorized questions shape:{vectorized_questions.shape}\nVectorized paragraphs shape:{X.shape}")

Vectorized questions shape:(87599, 78537)
Vectorized paragraphs shape:(18896, 78537)


In [28]:
vectorized_questions[0:1000].shape

(1000, 78537)

In [30]:
csr_matrix.argmax(cosine_similarity(vectorized_questions[:1000], X, dense_output=False), axis=1).shape

(1000, 1)

In [32]:
outputs = []
step = int(1e4)
stop = int(9e4)
for i in tqdm(range(0, stop, step)):
    outputs.append(csr_matrix.argmax(cosine_similarity(vectorized_questions[i:i+step], X, dense_output=False), axis=1))

100%|██████████| 9/9 [02:23<00:00, 15.93s/it]


In [38]:
output = np.vstack(outputs)

In [49]:
output.shape

(87599, 1)

In [52]:
labels.reshape(-1,1).shape

(87599, 1)

In [53]:
np.equal(output, labels.reshape(-1,1)).shape

(87599, 1)

In [54]:
np.count_nonzero(np.equal(output, labels.reshape(-1,1))) / output.size

0.000993162022397516

# Testing implementation with stop word removal