In [None]:
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings (e.g. a 0/0
# division in the similarity code below) — consider narrowing the filter.
warnings.filterwarnings("ignore")

import pandas as pd

In [None]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive; every dataset path read below lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Document collection: one row per document with its `doc_id` and `document` text.
# NOTE(review): the file also carries a saved index that shows up as an
# 'Unnamed: 0' column; passing index_col=0 would drop it if it is not needed.
docs = pd.read_csv('/content/drive/My Drive/dataset/hw2_docs.csv')
docs

Unnamed: 0,doc_id,document
0,2p7qrgx0,"Since 2007, many cases of fever, thrombocytope..."
1,25dcnext,BACKGROUND: Respiratory viral (RV) outbreaks i...
2,2jq626ye,A novel coronavirus (2019-nCoV) originating in...
3,270msv5l,• Several studies suggested Baricitinib as a p...
4,14x4uqq7,Evidence from the 2003 SARS epidemic and 2009 ...
...,...,...
745,1ebkagvv,OBJECTIVE: To retrospectively analyze the ches...
746,80dfqjql,Summary The novel human coronavirus SARS-CoV-2...
747,0fzwwluc,Objectives: Patients with novel coronavirus di...
748,105q161g,"A number of virological, epidemiological and e..."


In [None]:
# Query set: one row per query with its `query_id` and `query` text.
# NOTE(review): the saved index appears as an 'Unnamed: 0' column; index_col=0 would drop it.
queries = pd.read_csv("/content/drive/My Drive/dataset/hw2_queries.csv")
queries

Unnamed: 0,query_id,query
0,1,what is the origin of COVID-19
1,2,how does the coronavirus respond to changes in...
2,3,will SARS-CoV2 infected people develop immunit...
3,4,what causes death from Covid-19?
4,5,what drugs have been active against SARS-CoV o...
5,6,what types of rapid testing for Covid-19 have ...
6,7,are there serological tests that detect antibo...
7,8,how has lack of testing availability led to un...
8,9,how has COVID-19 affected Canada
9,10,has social distancing had an impact on slowing...


In [None]:
# Relevance judgments: each row pairs a `query_id` with a `doc_id`; the
# evaluation functions below treat these pairs as the gold relevant documents.
# NOTE(review): the saved index appears as an 'Unnamed: 0' column; index_col=0 would drop it.
qrels = pd.read_csv("/content/drive/My Drive/dataset/hw2_qrels.csv")
qrels

Unnamed: 0,query_id,doc_id
0,1,005b2j4b
1,1,0chuwvg6
2,1,0t2a5500
3,1,0y34yxlb
4,1,105q161g
...,...,...
745,50,xhm97wy2
746,50,xieqswct
747,50,y87tq9wu
748,50,ygwdldae


In [None]:
def precision_at_k(gold, retrieved, k):
    '''Return precision at rank ``k`` averaged over the judged queries.

    gold: DataFrame with ``query_id`` and ``doc_id`` columns; every row marks
        one document as relevant for one query.
    retrieved: dict mapping a query id to a list of ``(doc_id, score)`` pairs.
    k: rank cut-off, a positive integer.

    Queries without any gold document are skipped. A query that returned
    fewer than ``k`` documents is scored as ``hits / k``, i.e. the missing
    ranks count as misses. Returns 0.0 when no query has gold documents.

    Raises ValueError if ``k`` is smaller than 1.
    '''
    if k < 1:
        raise ValueError("k must be a positive integer")

    sum_precision_at_k = 0
    count_q = 0
    for q in retrieved:
        # A set makes each membership test O(1).
        gold_qs = set(gold.loc[gold['query_id'] == q].doc_id)
        if not gold_qs:
            continue
        count_q += 1

        # Rank by descending score and keep exactly the first k results, so
        # the cut-off follows k instead of a fixed top-ten window.
        top_k = sorted(retrieved[q], key=lambda item: item[1], reverse=True)[:k]
        hits = sum(1 for result_id, _score in top_k if result_id in gold_qs)
        sum_precision_at_k += hits / k

    if count_q == 0:
        return 0.0
    return sum_precision_at_k / count_q

In [None]:
def mean_average_precision(gold, retrieved):
    '''Return the mean over judged queries of average precision in the top ten.

    gold: DataFrame with ``query_id`` and ``doc_id`` columns; every row marks
        one document as relevant for one query.
    retrieved: dict mapping a query id to a list of ``(doc_id, score)`` pairs.

    For each query the ten highest-scoring results are inspected. Average
    precision is the mean of precision@rank over the ranks at which a relevant
    document appears; it is normalised by the number of relevant documents
    found in the top ten, not by the total number of relevant documents.
    Queries without gold documents are skipped; a judged query with no
    relevant result contributes 0. Returns 0.0 when no query is judged.
    '''
    sum_average_precision = 0
    count_q = 0
    for q in retrieved:
        # A set makes each membership test O(1).
        gold_qs = set(gold.loc[gold['query_id'] == q].doc_id)
        if not gold_qs:
            continue
        count_q += 1

        top_ten = sorted(retrieved[q], key=lambda item: item[1], reverse=True)[:10]

        sum_precision = 0
        sum_correct = 0
        for rank, (result_id, _score) in enumerate(top_ten, start=1):
            if result_id in gold_qs:
                sum_correct += 1
                # Precision at the rank of this relevant hit.
                sum_precision += sum_correct / rank

        if sum_correct:
            sum_average_precision += sum_precision / sum_correct

    if count_q == 0:
        return 0.0
    return sum_average_precision / count_q

In [None]:
def mean_reciprocal_rank(gold, retrieved):
    '''Return the mean reciprocal rank over the judged queries.

    For all evaluation methods, the retrieved dictionary must have queries as
    keys and a list of (doc id, p) pairs as values, where p is the score used
    for ranking. ``gold`` is a DataFrame with ``query_id`` and ``doc_id``
    columns listing the relevant documents of each query.

    Only the ten highest-scoring results of a query are inspected; a judged
    query with no relevant document among them contributes 0. Queries without
    gold documents are skipped. Returns 0.0 when no query is judged.
    '''
    sum_reciprocal_rank = 0
    count_q = 0
    for q in retrieved:
        # Gold (relevant) doc ids for this query; a set makes lookups O(1).
        gold_qs = set(gold.loc[gold['query_id'] == q].doc_id)

        # Ignore queries that have no relevance judgments at all.
        if not gold_qs:
            continue
        count_q += 1

        # Top ten results for this query, ordered by descending score.
        top_ten = sorted(retrieved[q], key=lambda item: item[1], reverse=True)[:10]

        # The first relevant result determines the reciprocal rank.
        for rank, (result_id, _score) in enumerate(top_ten, start=1):
            if result_id in gold_qs:
                sum_reciprocal_rank += 1 / rank
                break

    if count_q == 0:
        return 0.0
    # Average of the reciprocal ranks.
    return sum_reciprocal_rank / count_q

## Word2Vec — Skip-Gram

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np

In [None]:
import gensim.downloader as api

# Download pretrained Google word2vec model
# return_path=True makes api.load return the location of the downloaded file
# rather than loading the vectors into memory.
# NOTE(review): `path` is reassigned in the following cell to a copy stored on
# Drive; confirm this download is still needed.
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

In [None]:
import os

# Prefer the copy of the vectors stored on Google Drive when it is present.
# If it is missing, keep whatever `path` already points to (e.g. the file
# fetched by gensim's downloader) instead of overwriting it with a location
# that does not exist. When `path` was never set, fall back to the Drive
# location as before.
_drive_copy = "/content/drive/MyDrive/word2vec-google-news-300.gz"
if os.path.exists(_drive_copy) or "path" not in globals():
    path = _drive_copy

In [None]:
import gensim

# Load the pretrained vectors at `path` (binary word2vec format) into the
# global `model`, which the document-vector helpers below look words up in.
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
def calculate_doc_vector(tokens):
    """Average the embeddings of the tokens that the global ``model`` knows.

    tokens: iterable of word strings.

    Returns the element-wise mean of the known tokens' vectors, or None when
    none of the tokens is present in ``model``.
    """
    known_vectors = []
    for token in tokens:
        if token in model:
            known_vectors.append(model[token])

    # Nothing to average: signal it with None so callers can skip the text.
    if len(known_vectors) == 0:
        return None

    return np.mean(known_vectors, axis=0)

In [None]:
# Map each doc_id to the mean word vector of its lower-cased,
# whitespace-split text (None when no token is known to the model).
dict_voc = {
    doc_id: calculate_doc_vector(text.lower().split())
    for doc_id, text in zip(docs['doc_id'], docs['document'])
}

In [None]:
import math
from numpy import dot
from numpy.linalg import norm

def compute_cosine_similarity(v1,v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Returns 0.0 when either vector has zero norm: the cosine is undefined
    there, and the resulting 0/0 would otherwise produce nan, which cannot
    be ordered reliably when results are sorted by score.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# For every query, score each document that has a vector by its cosine
# similarity to the query vector. A query with no known token keeps an
# empty result list.
result = {}

for q_id, q_text in zip(queries['query_id'], queries['query']):
    q_vector = calculate_doc_vector(q_text.lower().split())

    scored_docs = []
    if q_vector is not None:
        for d_id in docs['doc_id']:
            d_vector = dict_voc[d_id]
            # Documents without any known token have no vector and are skipped.
            if d_vector is None:
                continue
            scored_docs.append((d_id, compute_cosine_similarity(q_vector, d_vector)))

    result[q_id] = scored_docs

In [None]:

# Evaluate the current `result` ranking against the relevance judgments.
mrr = mean_reciprocal_rank(qrels, result)
# Stored as `mean_ap` rather than `map` so the built-in `map` is not shadowed.
mean_ap = mean_average_precision(qrels, result)
p_at_5 = precision_at_k(qrels, result, 5)
p_at_10 = precision_at_k(qrels, result, 10)
print(" MRR:{:.2f} --- MAP:{:.2f} --- P@5:{:.2f} --- P@10:{:.2f}\n".format(mrr, mean_ap,
                                                                           p_at_5, p_at_10))

 MRR:0.60 --- MAP:0.53 --- P@5:0.37 --- P@10:0.29


## Word2Vec (TF-IDF weighted)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit TF-IDF on the document texts; the fitted `idf_` and `vocabulary_` are
# what calculate_weighted_document_vector reads. Despite its name,
# `tokenized_documents` holds the raw strings — the vectorizer tokenizes them.
vectorizer = TfidfVectorizer()
tokenized_documents = list(docs['document'])
tfidf_matrix = vectorizer.fit_transform(tokenized_documents)

In [None]:
def calculate_weighted_document_vector(words, vectorizer):
    """Return the IDF-weighted average of the word vectors of ``words``.

    words: iterable of word strings; a repeated word contributes once per
        occurrence, so its total weight grows with its term frequency.
    vectorizer: fitted TfidfVectorizer supplying ``vocabulary_`` and ``idf_``.

    Only words present in both the global ``model`` and the vectorizer's
    vocabulary are used. Returns None when no such word exists.

    The IDF weight is applied exactly once, through ``np.average``; scaling
    the vectors by IDF beforehand as well would weight each word by IDF
    squared.
    """
    vocabulary = vectorizer.vocabulary_
    idf = vectorizer.idf_

    # Filter once so the vectors and the weights are guaranteed to line up.
    known_words = [word for word in words if word in model and word in vocabulary]
    if not known_words:
        return None

    word_vectors = np.array([model[word] for word in known_words])
    weights = np.array([idf[vocabulary[word]] for word in known_words])

    weighted_avg = np.average(word_vectors, axis=0, weights=weights)
    return weighted_avg


In [None]:
# Rebuild the doc_id -> vector map, this time with the weighted average of
# the word vectors (None when a document has no usable token).
dict_voc = {
    doc_id: calculate_weighted_document_vector(text.lower().split(), vectorizer)
    for doc_id, text in zip(docs['doc_id'], docs['document'])
}

In [None]:
# Rank every document for every query by the cosine similarity between the
# weighted query vector and the weighted document vector.
result = {}

for q_ind in queries.index:

    q_id = queries['query_id'][q_ind]
    q_doc = queries['query'][q_ind]
    result[q_id] = []

    # Lower-case the query exactly like the documents were: the TF-IDF
    # vocabulary is lower-cased, so capitalised tokens would otherwise be
    # dropped from the query vector.
    q_words = q_doc.lower().split()

    q_vector = calculate_weighted_document_vector(q_words, vectorizer)

    # A query without any usable token keeps an empty result list.
    if q_vector is not None:

        for d_ind in docs.index:

            d_id = docs['doc_id'][d_ind]

            # Skip documents for which no vector could be built.
            if dict_voc[d_id] is not None:

                p = compute_cosine_similarity(q_vector, dict_voc[d_id])

                result[q_id].append((d_id, p))

In [None]:
# Evaluate the current `result` ranking against the relevance judgments.
mrr = mean_reciprocal_rank(qrels, result)
# Stored as `mean_ap` rather than `map` so the built-in `map` is not shadowed.
mean_ap = mean_average_precision(qrels, result)
p_at_5 = precision_at_k(qrels, result, 5)
p_at_10 = precision_at_k(qrels, result, 10)
print(" MRR:{:.2f} --- MAP:{:.2f} --- P@5:{:.2f} --- P@10:{:.2f}\n".format(mrr, mean_ap,
                                                                           p_at_5, p_at_10))

 MRR:0.66 --- MAP:0.57 --- P@5:0.38 --- P@10:0.31


## BERT

In [None]:
!pip install transformers requests beautifulsoup4 pandas numpy



In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116


In [None]:
from transformers import BertModel, BertTokenizer


# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE: this rebinds the global `model`, which until now held the word2vec
# KeyedVectors. calculate_doc_vector and calculate_weighted_document_vector
# read that global, so they cannot be re-run after this cell without
# reloading the word2vec vectors.
model =BertModel.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import torch
import numpy as np

def get_bert_embedding(text):
    """Embed ``text`` as the mean of BERT's last hidden states.

    Uses the global ``tokenizer`` and ``model``. The text is tokenized with
    truncation enabled, run through the model without gradient tracking, and
    ``last_hidden_state`` is averaged over dimension 1 (the token axis for a
    (batch, tokens, hidden) output — TODO confirm), squeezed, and returned as
    a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded)

    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

In [None]:
# BERT embedding of every document, keyed by doc_id.
document_embeddings = {
    doc_id: get_bert_embedding(text)
    for doc_id, text in zip(docs['doc_id'], docs['document'])
}

In [None]:
# BERT embedding of every query, keyed by query_id.
query_embeddings = {
    query_id: get_bert_embedding(text)
    for query_id, text in zip(queries['query_id'], queries['query'])
}

In [None]:
import math
from numpy import dot
from numpy.linalg import norm

def compute_cosine_similarity(v1,v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Returns 0.0 when either vector has zero norm: the cosine is undefined
    there, and the resulting 0/0 would otherwise produce nan, which cannot
    be ordered reliably when results are sorted by score.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Rank every document for every query by the cosine similarity of their
# BERT embeddings.
result = {}

for q_ind in queries.index:

    q_id = queries['query_id'][q_ind]
    result[q_id] = []

    # The query row vector is the same for every document: reshape it once.
    q_embedding = query_embeddings[q_id].reshape(1, -1)

    for d_ind in docs.index:

        d_id = docs['doc_id'][d_ind]

        # cosine_similarity returns a 1x1 matrix for two row vectors; keep the
        # scalar so each (doc_id, score) pair carries a plain float.
        p = float(cosine_similarity(q_embedding, document_embeddings[d_id].reshape(1, -1))[0, 0])

        result[q_id].append((d_id, p))

In [None]:
# Evaluate the current `result` ranking against the relevance judgments.
mrr = mean_reciprocal_rank(qrels, result)
# Stored as `mean_ap` rather than `map` so the built-in `map` is not shadowed.
mean_ap = mean_average_precision(qrels, result)
p_at_5 = precision_at_k(qrels, result, 5)
p_at_10 = precision_at_k(qrels, result, 10)
print(" MRR:{:.2f} --- MAP:{:.2f} --- P@5:{:.2f} --- P@10:{:.2f}\n".format(mrr, mean_ap,
                                                                           p_at_5, p_at_10))

 MRR:0.42 --- MAP:0.35 --- P@5:0.18 --- P@10:0.18
