## **1. Download MS MARCO Dataset**

In [None]:
!pip install datasets==2.13.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==2.13.1
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.13.1)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.13.1)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets==2.13.1)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m16.5 MB/s

In [None]:
from datasets import load_dataset

# Load MS MARCO v1.1 through the `datasets` library; the recorded run below
# generated validation, train and test splits.
dataset = load_dataset('ms_marco', 'v1.1')

Downloading builder script:   0%|          | 0.00/8.52k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.15k [00:00<?, ?B/s]

Downloading and preparing dataset ms_marco/v1.1 to /root/.cache/huggingface/datasets/ms_marco/v1.1/1.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/111M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

Dataset ms_marco downloaded and prepared to /root/.cache/huggingface/datasets/ms_marco/v1.1/1.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Only the test split is used for the rest of the notebook.
subset = dataset['test']

In [None]:
# Accumulators filled while scanning the split:
#   queries_infos - one dict per kept query (id, text, relevant corpus indices)
#   queries       - the kept query strings, aligned with queries_infos
#   corpus        - flat list of passage texts from every kept query
queries_infos = []
queries = []
corpus = []

for sample in subset:
    # Keep 'entity' queries only.
    if sample['query_type'] != 'entity':
        continue

    query_id = sample['query_id']
    query_str = sample['query']
    passages_dict = sample['passages']

    # Passages of this sample will be appended at the current end of the
    # corpus, so a passage's global index is that offset plus its local position.
    offset = len(corpus)
    relevant_docs = [
        offset + position
        for position, flag in enumerate(passages_dict['is_selected'])
        if flag == 1
    ]

    # Queries without any selected passage are dropped, and their passages
    # are not added to the corpus either.
    if not relevant_docs:
        continue

    queries.append(query_str)
    queries_infos.append({
        'query_id': query_id,
        'query': query_str,
        'relevant_docs': relevant_docs,
    })
    corpus.extend(passages_dict['passage_text'])

## **2. Text Normalization**

In [None]:
def tokenize(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stopword corpus so stopwords.words() below can read it.
nltk.download('stopwords')
# English stopword list; text_normalize drops tokens found in it.
english_stopwords = stopwords.words('english')
# Characters stripped from text: everything in string.punctuation.
remove_chars = string.punctuation
# Porter stemmer applied to each token that survives stopword filtering.
stemmer = PorterStemmer()

def text_normalize(text):
    """Normalize raw text for indexing and matching.

    Steps, in order: lowercase the text, delete every character listed in
    ``remove_chars``, split on whitespace, drop tokens found in
    ``english_stopwords``, and stem the remaining tokens with ``stemmer``.

    Args:
        text: Raw input string.

    Returns:
        str: The surviving stemmed tokens joined by single spaces.
    """
    # A single translate() pass deletes all punctuation characters at once,
    # instead of one full-string replace() per punctuation character.
    cleaned = text.lower().translate(str.maketrans('', '', remove_chars))

    # Filter and stem in one pass; tokens contain no whitespace, so this is
    # equivalent to joining the filtered tokens and tokenizing them again.
    stemmed = [
        stemmer.stem(word)
        for word in tokenize(cleaned)
        if word not in english_stopwords
    ]

    return ' '.join(stemmed)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **3. Create dictionary**

In [None]:
def create_dictionary(corpus):
    """Build the vocabulary of distinct normalized tokens found in ``corpus``.

    Args:
        corpus: Iterable of raw document strings.

    Returns:
        list: Every distinct token exactly once, in order of first appearance.
    """
    # A dict gives O(1) membership checks while preserving first-seen order.
    # Testing `token not in <list>` for every token made the build quadratic
    # in the vocabulary size.
    seen = {}
    for doc in corpus:
        for token in tokenize(text_normalize(doc)):
            seen.setdefault(token, None)

    return list(seen)

In [None]:
%%time
# Build the vocabulary over every passage collected in `corpus`.
dictionary = create_dictionary(corpus)

CPU times: user 28.3 s, sys: 86.2 ms, total: 28.4 s
Wall time: 33.1 s


## **4. Create Doc-Term Matrix**

In [None]:
def vectorize(text, dictionary):
    """Turn ``text`` into a term-count vector aligned with ``dictionary``.

    Args:
        text: Raw text; it is normalized with ``text_normalize`` before counting.
        dictionary: Iterable of vocabulary tokens. The result holds one count
            per distinct token, in the order the tokens appear here.

    Returns:
        list: Occurrence count of each vocabulary token in the normalized
        text. Tokens that are not in ``dictionary`` are ignored.
    """
    word_count_dict = dict.fromkeys(dictionary, 0)
    for token in tokenize(text_normalize(text)):
        # Skip out-of-vocabulary tokens explicitly. The previous bare
        # `except: pass` also swallowed unrelated errors and KeyboardInterrupt.
        if token in word_count_dict:
            word_count_dict[token] += 1

    return list(word_count_dict.values())

In [None]:
def create_doc_term_matrix(corpus, dictionary):
    """Vectorize every document of ``corpus`` against ``dictionary``.

    Returns:
        dict: Maps ``(document_text, position_in_corpus)`` to the vector
        returned by ``vectorize`` for that document. The position is part of
        the key, so identical texts at different positions get separate entries.
    """
    return {
        (text, position): vectorize(text, dictionary)
        for position, text in enumerate(corpus)
    }

In [None]:
%%time
# Vectorize every passage in `corpus` against `dictionary`.
doc_term_matrix = create_doc_term_matrix(corpus, dictionary)

CPU times: user 28.6 s, sys: 1.04 s, total: 29.6 s
Wall time: 29.8 s


## **5. Ranking**

In [None]:
from scipy import spatial

def similarity(a, b):
    """Cosine similarity between two 1-D numeric vectors.

    Args:
        a: First vector (list or array of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        float: ``1 - cosine_distance(a, b)``, or ``0.0`` when either vector is
        all zeros.
    """
    import numpy as np  # local import keeps this cell self-contained

    a = np.asarray(a)
    b = np.asarray(b)
    # Cosine distance divides by the product of the vector norms, so an
    # all-zero vector (e.g. text with no in-vocabulary terms) can yield NaN.
    # NaN scores make any later sort order unreliable, so treat "no overlap
    # possible" as similarity 0.
    if not a.any() or not b.any():
        return 0.0
    return 1 - spatial.distance.cosine(a, b)

In [None]:
def ranking(query, dictionary, doc_term_matrix):
    """Score every document against ``query`` and return them best-first.

    Args:
        query: Raw query string; vectorized with ``vectorize``.
        dictionary: Vocabulary passed through to ``vectorize``.
        doc_term_matrix: Mapping of document info to document vector.

    Returns:
        list: ``(score, doc_info)`` tuples sorted in descending order.
    """
    query_vec = vectorize(query, dictionary)
    scored = [
        (similarity(query_vec, doc_vec), doc_info)
        for doc_info, doc_vec in doc_term_matrix.items()
    ]
    return sorted(scored, reverse=True)

In [None]:
# Demo: print the top_k highest-scoring passages for each query.
query_lst = ['what is the official language in Fiji']
top_k = 10
for query in query_lst:
    scores = ranking(query, dictionary, doc_term_matrix)
    print(f'Query: {query}')
    print('=== Relevant docs ===')
    # Slice rather than index over range(top_k), so a corpus with fewer than
    # top_k documents prints what is available instead of raising IndexError.
    for rank, (doc_score, doc_info) in enumerate(scores[:top_k], start=1):
        # doc_info is the (document_text, corpus_index) key of doc_term_matrix.
        print(f'Top {rank}; Score: {doc_score:.4f}')
        print(doc_info[0])
        print('\n')

Query: what is the official language in Fiji
=== Relevant docs ===
Top 1; Score: 0.6556
The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.


Top 2; Score: 0.6556
The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.


Top 3; Score: 0.5715
The official languages. Fiji’s 1997 Constitution established Fijian as one of the official languages of the country. Fijian is an Austronesian language, a grouping that includes thousands of other languages spanning the globe. The language is of the Malayo-Polynesian family, not too diff