In [None]:
%pip install datasets==2.13.1

Collecting datasets==2.13.1
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
                                              0.0/486.2 kB ? eta -:--:--
                                              0.0/486.2 kB ? eta -:--:--
                                              0.0/486.2 kB ? eta -:--:--
     ----                                     51.2/486.2 kB ? eta -:--:--
     ----                                    61.4/486.2 kB 1.6 MB/s eta 0:00:01
     -----                                 71.7/486.2 kB 975.2 kB/s eta 0:00:01
     -----                                 71.7/486.2 kB 975.2 kB/s eta 0:00:01
     -----                                 71.7/486.2 kB 975.2 kB/s eta 0:00:01
     -------                               92.2/486.2 kB 435.7 kB/s eta 0:00:01
     -------                               92.2/486.2 kB 435.7 kB/s eta 0:00:01
     --------                             112.6/486.2 kB 363.1 kB/s eta 0:00:02
     --------                             112.6/486.2 kB 363.1 

In [None]:
from datasets import load_dataset

# Load configuration 'v1.1' of the 'ms_marco' dataset through the `datasets` library.
dataset = load_dataset('ms_marco','v1.1')

In [None]:
# Keep only the 'test' entry of the loaded dataset; the corpus below is built from it.
subset = dataset['test']

In [None]:
# Build three structures from the 'entity' queries found in `subset`:
#   corpus        - flat list with the passage texts of every kept query
#   queries       - the kept query strings
#   queries_infos - one dict per kept query holding its id, its text and the
#                   indices (into `corpus`) of the passages marked as selected
queries_infos, queries, corpus = [], [], []

for record in subset:
    # Only queries typed as 'entity' are of interest.
    if record['query_type'] != 'entity':
        continue

    passages = record['passages']
    selected_flags = passages['is_selected']
    passage_texts = passages['passage_text']

    # This query's passages would be appended at the current end of the
    # corpus, so a passage's corpus index is that offset plus its position.
    offset = len(corpus)
    relevant = [
        offset + position
        for position, flag in enumerate(selected_flags)
        if flag == 1
    ]

    # A query with no selected passage is dropped together with its passages.
    if not relevant:
        continue

    queries.append(record['query'])
    queries_infos.append({
        'query_id': record['query_id'],
        'query': record['query'],
        'relevant_docs': relevant,
    })
    corpus.extend(passage_texts)

# Text Normalization

In [None]:
def tokenize(text: str) -> list:
    """Split ``text`` into tokens on runs of whitespace.

    Uses ``str.split()`` without arguments, so leading, trailing and repeated
    whitespace never produces empty tokens; an empty string gives ``[]``.
    """
    tokens = text.split()
    return tokens

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
english_stopwords = stopwords.words('english')
remove_chars = string.punctuation
stemmer = PorterStemmer()

# Lookup structures derived once from the settings above so text_normalize
# does not redo the work on every call: a frozenset gives constant-time
# stop-word tests, and the translation table deletes every character of
# `remove_chars` in a single pass over the text. Re-run this cell if
# `english_stopwords` or `remove_chars` is changed.
_stopword_set = frozenset(english_stopwords)
_remove_chars_table = str.maketrans('', '', remove_chars)

def text_normalize(text):
    """Return a normalized, space-joined version of ``text``.

    Steps, in order: lowercase the text, delete every character listed in
    ``remove_chars``, split on whitespace, drop tokens found in
    ``english_stopwords``, and stem each remaining token with ``stemmer``.

    Args:
        text: The raw string to normalize.

    Returns:
        A string with the stemmed tokens joined by single spaces (empty when
        no token survives).
    """
    # NOTE(review): characters are stripped before the stop-word test, so any
    # stop-word entry that itself contains one of `remove_chars` can never
    # match - confirm whether that is intended.
    text = text.lower().translate(_remove_chars_table)

    return ' '.join(
        stemmer.stem(word)
        for word in tokenize(text)
        if word not in _stopword_set
    )

# Create Dictionary

In [None]:
def create_dictionary(corpus):
    """Collect the distinct normalized tokens that occur in ``corpus``.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A list of unique tokens in order of first appearance. Each document
        is passed through ``text_normalize`` and ``tokenize`` first.
    """
    dictionary = []
    # Companion set for constant-time "already seen?" tests; checking the
    # list itself makes the build quadratic in the vocabulary size.
    seen = set()
    for doc in corpus:
        for token in tokenize(text_normalize(doc)):
            if token not in seen:
                seen.add(token)
                dictionary.append(token)

    return dictionary

In [None]:
%%time
# Build the vocabulary (distinct normalized tokens) from every passage in `corpus`.
dictionary = create_dictionary(corpus)

# Create Doc-Term Matrix

In [None]:
def vectorize(text, dictionary):
    """Turn ``text`` into a term-count vector aligned with ``dictionary``.

    Args:
        text: Raw string; it is normalized with ``text_normalize`` and split
            with ``tokenize`` before counting.
        dictionary: Sequence of vocabulary tokens. Position ``i`` of the
            result counts occurrences of ``dictionary[i]`` (repeated entries
            in ``dictionary`` collapse into one position).

    Returns:
        A list of integer counts, one per distinct dictionary token, in
        dictionary order. Tokens that are not in the dictionary are ignored.
    """
    word_count_dict = dict.fromkeys(dictionary, 0)
    for token in tokenize(text_normalize(text)):
        # Explicit membership test instead of a bare `except`, which would
        # also hide unrelated errors (and even KeyboardInterrupt).
        if token in word_count_dict:
            word_count_dict[token] += 1

    return list(word_count_dict.values())

In [None]:
def create_doc_term_matrix(corpus, dictionary):
    """Vectorize every document of ``corpus`` against ``dictionary``.

    Args:
        corpus: Iterable of document strings.
        dictionary: Vocabulary passed through to ``vectorize``.

    Returns:
        A dict mapping ``(document text, position in corpus)`` to the
        document's term-count vector.
    """
    return {
        (doc, position): vectorize(doc, dictionary)
        for position, doc in enumerate(corpus)
    }

# Ranking

In [None]:
from scipy import spatial

def similarity(a, b):
    """Return the cosine similarity between two 1-D numeric vectors.

    Args:
        a: First vector (e.g. a list of term counts).
        b: Second vector, same length as ``a``.

    Returns:
        ``1 - cosine_distance(a, b)``, or ``0.0`` when either vector contains
        only zeros.
    """
    # An all-zero vector has no direction, so the cosine is undefined (0/0)
    # and the library result is not a usable score. Treat "no terms in
    # common with the vocabulary" as zero similarity so sorting stays sane.
    if not any(a) or not any(b):
        return 0.0
    return 1 - spatial.distance.cosine(a, b)

In [None]:
def ranking(query, dictionary, doc_term_matrix):
    """Score every document against ``query`` and sort best-first.

    Args:
        query: Raw query string; vectorized with ``vectorize``.
        dictionary: Vocabulary used to vectorize the query.
        doc_term_matrix: Mapping of document info to term-count vector, as
            produced by ``create_doc_term_matrix``.

    Returns:
        A list of ``(similarity, doc_info)`` tuples in descending order
        (tuples are compared whole, so ties fall back to ``doc_info``).
    """
    query_vec = vectorize(query, dictionary)
    return sorted(
        (
            (similarity(query_vec, doc_vec), doc_info)
            for doc_info, doc_vec in doc_term_matrix.items()
        ),
        reverse=True,
    )

In [None]:
# `ranking` needs the document-term matrix, and no cell visible earlier in
# this notebook builds it, so construct it here; otherwise a fresh
# "Restart & Run All" stops with a NameError on `doc_term_matrix`.
doc_term_matrix = create_doc_term_matrix(corpus, dictionary)

query_lst = ['what is the official language in Fiji']
top_k = 10
for query in query_lst:
    scores = ranking(query, dictionary, doc_term_matrix)
    print(f'Query: {query}')
    print('=== Relevant docs ===')
    # Slice rather than index 0..top_k-1 so that fewer than `top_k` scored
    # documents prints what is available instead of raising IndexError.
    for idx, (doc_score, doc_info) in enumerate(scores[:top_k]):
        # doc_info is the (document text, corpus index) key of the matrix.
        doc_content = doc_info[0]

        print(f'Top {idx + 1}; Score: {doc_score:.4f}')
        print(doc_content)
        print('\n')

# Text Retrieval with Sentence Transformers

In [None]:
!pip install datasets sentence_transformers

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by the name 'all-MiniLM-L6-v2'.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every passage in `corpus` once up front; convert_to_tensor=True asks
# for the embeddings as a tensor, which the ranking code below relies on.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [None]:
from sentence_transformers import util

def similarity(query_embeddings, corpus_embeddings):
    """Cosine-score one query embedding against all corpus embeddings.

    Note: this redefinition replaces the scipy-based ``similarity`` defined
    earlier in the notebook.

    Args:
        query_embeddings: Embedding of the query.
        corpus_embeddings: Embeddings of the corpus documents.

    Returns:
        The first row of ``util.cos_sim(query_embeddings, corpus_embeddings)``
        (assumed to hold one score per corpus document - TODO confirm).
    """
    score_matrix = util.cos_sim(query_embeddings, corpus_embeddings)
    first_row = score_matrix[0]
    return first_row

In [None]:
def ranking(query, top_k=10):
    """Return the ``top_k`` best-scoring corpus entries for ``query``.

    Note: this redefinition replaces the bag-of-words ``ranking`` defined
    earlier in the notebook.

    Args:
        query: Query string, encoded with the module-level ``model``.
        top_k: Maximum number of results wanted. It is capped at the number
            of available scores, so a small corpus yields fewer results
            instead of an error.

    Returns:
        The result of ``torch.topk`` on the cosine scores: a pair whose
        first item holds the scores and whose second holds the indices.
    """
    query_embeddings = model.encode(query, convert_to_tensor=True)
    cos_scores = similarity(query_embeddings, corpus_embeddings)

    # torch.topk searches the last dimension by default and raises when k is
    # larger than that dimension, so clamp k to what is actually there.
    k = min(top_k, cos_scores.shape[-1])
    top_results = torch.topk(cos_scores, k=k)

    return top_results

In [None]:
# Run each demo query through the embedding-based `ranking` and print the
# best matches together with their scores.
custom_queries = ['what is the official language in Fiji']

# Never ask for more results than there are documents.
top_k = min(5, len(corpus))
for query in custom_queries:
    top_results = ranking(query, top_k)

    print("Query:", query)
    print("\n======================")
    print(f"Top {top_k} most similar sentences in corpus:\n")

    # Item 0 of the result holds the scores, item 1 the matching corpus indices.
    best_scores = top_results[0]
    best_indices = top_results[1]
    for rank, (score, doc_idx) in enumerate(zip(best_scores, best_indices), start=1):
        print(f'Document rank {rank}:')
        print(corpus[doc_idx], f'\n(Score: {score:.4f})', '\n')