This file is going to use BEIR to download a dataset and create embeddings 

# Part 1. embeddings, saving & loading

In [1]:
# constants
DATASET = "scifact"
sbert_model_name = "msmarco-distilbert-base-tas-b"
device = "cpu" # cuda for gpu usage
#  using full dataset
# k_queries = 10000
# k_documents = 522931

In [2]:
# import libraries
from time import time
from beir import util
from beir_reengineered import NewSentenceBERT
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
import os, json, random
import pickle
import json

  from tqdm.autonotebook import tqdm
2024-05-01 08:03:40.899796: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 08:03:40.899852: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 08:03:40.899909: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-01 08:03:40.910076: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#### Download nfcorpus.zip dataset and unzip the dataset
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(DATASET)
out_dir = "datasets"
data_path = util.download_and_unzip(url, out_dir)

datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

In [5]:
#### Provide the data path where nfcorpus has been downloaded and unzipped to the data loader
# data folder would contain these files:
# (1) nfcorpus/corpus.jsonl  (format: jsonlines)
# (2) nfcorpus/queries.jsonl (format: jsonlines)
# (3) nfcorpus/qrels/test.tsv (format: tsv ("\t"))

corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")


  0%|          | 0/5183 [00:00<?, ?it/s]

In [6]:
print(len(corpus), len(queries))

5183 300


In [7]:
#### Dense Retrieval using SBERT (Sentence-BERT) ####
#### Provide any pretrained sentence-transformers model
#### The model was fine-tuned using cosine-similarity.
#### Complete list - https://www.sbert.net/docs/pretrained_models.html

beir_sbert = NewSentenceBERT(sbert_model_name, device=device)
model = DRES(beir_sbert, batch_size=256, corpus_chunk_size=512*9999)

In [9]:
# # Create sub-sample
# subset_of_queries = random.sample(queries.keys(), k_queries)
# queries = {qid: queries[qid] for qid in subset_of_queries}
# qrels = {qid: qrels[qid] for qid in subset_of_queries}
# true_documents = set([docid for qid in qrels for docid in qrels[qid]])
# false_documents = set(random.sample(list(set([docid for docid in corpus if docid not in true_documents])), k_documents))
# subset_of_corpus = true_documents | false_documents
# corpus = {docid: corpus[docid] for docid in subset_of_corpus}

In [10]:
# qrels

In [8]:
# Save qrels to a JSON file
with open("datasets/scifact/qrels_full.json", "w") as f:
    json.dump(qrels, f)

# with open("datasets/scifact/qrels_half_new.json", "r") as f:
#     qrels_json = json.load(f)
# qrels_json


In [9]:
# Encode queries
queries_l = [queries[qid] for qid in queries]
query_embeddings = model.model.encode_queries(
    queries_l,
    batch_size=model.batch_size,
    show_progress_bar=model.show_progress_bar,
    convert_to_tensor=model.convert_to_tensor
).cpu().numpy()

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Encode documents
corpus_ids = sorted(corpus, key=lambda k: len(corpus[k].get("title", "") + corpus[k].get("text", "")), reverse=True)
corpus_l = [corpus[cid] for cid in corpus_ids]
sub_corpus_embeddings = model.model.encode_corpus(
    corpus_l,
    batch_size=model.batch_size,
    show_progress_bar=model.show_progress_bar,
    convert_to_tensor=model.convert_to_tensor
).cpu().numpy()

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

In [None]:
# Save as new dataset
os.makedirs("datasets/scifact/qrels", exist_ok=True)
with open("datasets/scifact/queries.jsonl", "w") as f:
    f.writelines([json.dumps({"_id": qid, "text": queries[qid], "metadata":{}})+"\n" for qid in queries])
with open("datasets/scifact/corpus.jsonl", "w") as f:
    f.writelines([json.dumps({"_id": docid, "title": corpus[docid].get("title"), "text": corpus[docid].get("text"), "metadata":{}})+"\n" for docid in corpus])
with open("datasets/scifact/qrels/test.tsv", "w") as f:
    f.write("query-id\tcorpus-id\tscore\n")
    for qid in qrels:
        for docid in qrels[qid]:
            f.write("{}\t{}\t{}\n".format(qid, docid, qrels[qid][docid]))

In [None]:
# # Save embeddings
corpus_embeddings_dict = dict(zip(corpus_ids, sub_corpus_embeddings))
query_embeddings_dict = dict(zip(queries.keys(), query_embeddings))
import pickle

with open("datasets/scifact/corpus_embeddings_full.pkl", "wb") as f:
    pickle.dump(corpus_embeddings_dict, f)
with open("datasets/scifact/query_embeddings_full.pkl", "wb") as f:
    pickle.dump(query_embeddings_dict, f)

10th of data
k_queries = 1000
k_documents = 52293

quater of data
k_queries = 2500
k_documents = 130732

half of data 
k_queries = 5000
k_documents = 261465

# Part 2. (only sometimes) round embeddings

In [20]:
from beir.retrieval.search import BaseSearch # type: ignore beir/retrieval/search/dense/exact_search.py
from beir.util import cos_sim
import torch # type: ignore
import numpy as np
from typing import Dict
import heapq
import logging
logger = logging.getLogger(__name__)

In [21]:
#TODO: refactor to have base class with rounding and noise

# ExperiementRetrievalExactSearch is parent class for any model we are using for our experiement that can be used for retrieval
# Abstract class is BaseSearch
class ExperiementRetrievalExactSearch(BaseSearch):
    def __init__(
            self,
            model,
            path_corpus_embeddings: str,
            path_query_embeddings: str,
            **kwargs):
        #model is class should do nothing
        self.model = model
        self.path_corpus_embeddings = path_corpus_embeddings
        self.path_query_embeddings = path_query_embeddings
        self.show_progress_bar = kwargs.get("show_progress_bar", True)
        self.convert_to_tensor = kwargs.get("convert_to_tensor", True)
        self.results = {}

        logger.info("Load in Encoded Queries and Corpus from Pickle...")
        # Verify file existence
        if not os.path.exists(self.path_corpus_embeddings):
            raise FileNotFoundError(f"File '{self.path_corpus_embeddings}' not found.")
        if not os.path.exists(self.path_query_embeddings):
            raise FileNotFoundError(f"File '{self.path_query_embeddings}' not found.")

        with open(self.path_query_embeddings, "rb") as f:
            self.query_embeddings = pickle.load(f)

        with open(self.path_corpus_embeddings, "rb") as f:
            self.corpus_embeddings = pickle.load(f)



    def add_rounding(self, rounding_decimal: int) -> None:
        # rounding decimal
        if rounding_decimal < 12:
            logger.info("Rounding decimal places of Queries and Corpus...")
            for key, value in self.query_embeddings.items():
                self.query_embeddings[key] = np.round(value, decimals=rounding_decimal)

            for key, value in self.corpus_embeddings.items():
                self.corpus_embeddings[key] = np.round(value, decimals=rounding_decimal)

    def add_noise(self, rounding_decimal: int) -> None:
        logger.info("Adding Noise to Queries and Corpus...")
        for key, value in self.query_embeddings.items():
            self.query_embeddings[key] += np.random.random() / 10**rounding_decimal

        for key, value in self.corpus_embeddings.items():
            self.corpus_embeddings[key] += np.random.random() / 10**rounding_decimal


    def search(self,
               corpus: Dict[str, Dict[str, str]],
               queries: Dict[str, str],
               top_k: int,
               score_function: str,
               return_sorted: bool = False,
               **kwargs) -> Dict[str, Dict[str, float]]:
        # Runs semantic search against the corpus embeddings
        # Returns a ranked list with the corpus ids

        query_ids = list(self.query_embeddings.keys())
        self.results = {qid: {} for qid in query_ids}

        # print("Sorting Corpus by document length (Longest first)...")
        logger.info("Sorting Corpus by document length (Longest first)...")
        corpus_ids = sorted(list(self.corpus_embeddings.keys()), reverse=True)

        result_heaps = {qid: [] for qid in query_ids}  # Keep only the top-k docs for each query

        # Convert dictionary values to PyTorch tensors
        corpus_tensors = [torch.tensor(embedding) for embedding in self.corpus_embeddings.values()]
        query_tensors = [torch.tensor(embedding) for embedding in self.query_embeddings.values()]
        # Stack tensors along a new dimension (batch dimension)
        corpus_embeddings_tensor = torch.stack(corpus_tensors)
        query_embeddings_tensor = torch.stack(query_tensors)

        # print("Compute similarites using  cosine-similarity")
        # Compute similarites using  cosine-similarity
        cos_scores = cos_sim(query_embeddings_tensor, corpus_embeddings_tensor)
        cos_scores[torch.isnan(cos_scores)] = -1

        # print("Get top-k values")
        # Get top-k values
        cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, min(top_k+1, len(cos_scores[1])), dim=1, largest=True, sorted=return_sorted)
        cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
        cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()

        # print("build heap")
        for query_itr in range(len(query_embeddings_tensor)):
            query_id = query_ids[query_itr]
            for sub_corpus_id, score in zip(cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]):
                corpus_id = corpus_ids[sub_corpus_id]
                if corpus_id != query_id:
                    if len(result_heaps[query_id]) < top_k:
                        # Push item on the heap
                        heapq.heappush(result_heaps[query_id], (score, corpus_id))
                    else:
                        # If item is larger than the smallest in the heap, push it on the heap then pop the smallest element
                        heapq.heappushpop(result_heaps[query_id], (score, corpus_id))

        print("get results heaps")
        for qid in result_heaps:
            for score, corpus_id in result_heaps[qid]:
                self.results[qid][corpus_id] = score

        return self.results

In [22]:
# #### Load the SBERT model and retrieve using cosine-similarity
# model = ExperiementRetrievalExactSearch(beir_sbert)
# retriever = EvaluateRetrieval(model, score_function="cos_sim") # or "cos_sim" for cosine similarity
# results = retriever.retrieve(corpus, queries)



In [23]:
# #### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000]
# ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
# print(ndcg, _map, recall, precision )

In [24]:
beir_sbert = NewSentenceBERT(sbert_model_name, device=device)

In [25]:
model = ExperiementRetrievalExactSearch(
    beir_sbert,
    path_corpus_embeddings = "datasets/scifact/corpus_embeddings_full.pkl",
    path_query_embeddings= "datasets/scifact/query_embeddings_full.pkl") # This can go outside the loop

print("here")
with open("datasets/scifact/qrels_full.json", "r") as f:
    qrels = json.load(f)

with open("datasets/scifact/query_embeddings_full.pkl", "rb") as f:
    query_embeddings = pickle.load(f)

with open("datasets/scifact/corpus_embeddings_full.pkl", "rb") as f:
    corpus_embeddings = pickle.load(f)


rounding_recalls = []
rounding_decimals = range(12,-1,-1)
for decimal in rounding_decimals:
    print(decimal, type(decimal))
    model.add_rounding(decimal)
    print("after add_rounding")
    retriever = EvaluateRetrieval(model, score_function="cos_sim")
    print("after EvaluateRetrieval")
    # print(model.corpus_embeddings, model.query_embeddings)
    # issue maybe you have to pass the correct corpus and queries in
    results = retriever.retrieve(corpus_embeddings, query_embeddings)
    print("after retrieve")
    ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
    print("decimal_places:", decimal , ndcg, _map, recall, precision )
    #recall@100 matters mostr, save to array for plotting
    rounding_recalls.append(recall)


here
12 <class 'int'>
after add_rounding
after EvaluateRetrieval
get results heaps
after retrieve
decimal_places: 12 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 1e-05, 'NDCG@100': 5e-05, 'NDCG@1000': 0.00027} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 1e-05, 'MAP@1000': 1e-05} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 2e-05, 'Recall@100': 0.00023, 'Recall@1000': 0.00193} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 1e-05, 'P@100': 0.0, 'P@1000': 0.0}
11 <class 'int'>
after add_rounding
after EvaluateRetrieval
get results heaps
after retrieve
decimal_places: 11 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 1e-05, 'NDCG@100': 5e-05, 'NDCG@1000': 0.00027} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 1e-05, 'MAP@1000': 1e-05} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 2e-05, 'Recall@100': 0.00023, 'Recall@1000': 0.00193} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 1e-05, 

In [26]:
noise_recalls = []
noise_decimals = range(12,-1,-1)
for decimal in range(12,-1,-1):
    # print(decimal, type(decimal))
    model.add_noise(decimal)
    # model.noise()
    retriever = EvaluateRetrieval(model, score_function="cos_sim")
    results = retriever.retrieve(model.corpus_embeddings, model.query_embeddings)
    ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
    print("noise decimal:", decimal , ndcg, _map, recall, precision )
    noise_recalls.append(recall)

get results heaps
noise decimal: 12 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 1e-05, 'NDCG@100': 8e-05, 'NDCG@1000': 0.00027} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 1e-05, 'MAP@1000': 1e-05} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 2e-05, 'Recall@100': 0.00038, 'Recall@1000': 0.00185} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 1e-05, 'P@100': 1e-05, 'P@1000': 0.0}
get results heaps
noise decimal: 11 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 1e-05, 'NDCG@100': 8e-05, 'NDCG@1000': 0.00027} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 1e-05, 'MAP@1000': 1e-05} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 2e-05, 'Recall@100': 0.00038, 'Recall@1000': 0.00185} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 1e-05, 'P@100': 1e-05, 'P@1000': 0.0}
get results heaps
noise decimal: 10 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 1e-05, 'NDCG@100': 8e-05, 'NDCG

In [27]:
import csv
# Headers for CSV file
headers = ['Rounding_Decimals', 'Recall@100', 'Recall@1000']

# Create and write data to CSV file
with open('results/scifact/rounding_recalls_full.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()

    for rounding_decimal, rounding_recall in zip(rounding_decimals, rounding_recalls):
        row_data = {
            'Rounding_Decimals': rounding_decimal,
            'Recall@100': rounding_recall['Recall@100'],
            'Recall@1000': rounding_recall['Recall@1000']
        }
        writer.writerow(row_data)

In [28]:
# Headers for CSV file
headers = ['Rounding_Decimals', 'Recall@100', 'Recall@1000']

# Create and write data to CSV file
with open('results/scifact/noise_recall_full.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()

    for rounding_decimal, noise_recall in zip(rounding_decimals, noise_recalls):
        row_data = {
            'Rounding_Decimals': rounding_decimal,
            'Recall@100': noise_recall['Recall@100'],
            'Recall@1000': noise_recall['Recall@1000']
        }
        writer.writerow(row_data)

In [31]:
# # PSIRetrievalExactSearch is parent class for any model we are using for our experiement that can be used for retrieval
# # Abstract class is BaseSearch
# class PSIRetrievalExactSearch(BaseSearch):
#     def __init__(
#             self,
#             model,
#             path_corpus_embeddings: str = "datasets/subquora/corpus_embeddings.pkl",
#             path_query_embeddings: str = "datasets/subquora/query_embeddings.pkl",
#             **kwargs):
#         #model is class should do nothing
#         self.model = model
#         self.path_corpus_embeddings = path_corpus_embeddings
#         self.path_query_embeddings = path_query_embeddings
#         self.show_progress_bar = kwargs.get("show_progress_bar", True)
#         self.convert_to_tensor = kwargs.get("convert_to_tensor", True)
#         self.results = {}

#         logger.info("Load in Encoded Queries and Corpus from Pickle...")
#         # Verify file existence
#         if not os.path.exists(self.path_corpus_embeddings):
#             raise FileNotFoundError(f"File '{self.path_corpus_embeddings}' not found.")
#         if not os.path.exists(self.path_query_embeddings):
#             raise FileNotFoundError(f"File '{self.path_query_embeddings}' not found.")

#         with open(self.path_query_embeddings, "rb") as f:
#             self.query_embeddings = pickle.load(f)

#         with open(self.path_corpus_embeddings, "rb") as f:
#             self.corpus_embeddings = pickle.load(f)

#     def add_rounding(self, rounding_decimal: int) -> None:
#         # rounding decimal
#         if rounding_decimal < 12:
#             logger.info("Rounding decimal places of Queries and Corpus...")
#             for key, value in self.query_embeddings.items():
#                 self.query_embeddings[key] = np.round(value, decimals=rounding_decimal)

#             for key, value in self.corpus_embeddings.items():
#                 self.corpus_embeddings[key] = np.round(value, decimals=rounding_decimal)

#     def add_noise(self, rounding_decimal: int) -> None:
#         logger.info("Adding Noise to Queries and Corpus...")
#         for key, value in self.query_embeddings.items():
#             self.query_embeddings[key] += np.random.random() / 10**rounding_decimal

#         for key, value in self.corpus_embeddings.items():
#             self.corpus_embeddings[key] += np.random.random() / 10**rounding_decimal

#     def search(self,
#                corpus: Dict[str, Dict[str, str]],
#                queries: Dict[str, str],
#                top_k: int,
#                score_function: str,
#                return_sorted: bool = False,
#                **kwargs) -> Dict[str, Dict[str, float]]:

#         query_ids = list(self.query_embeddings.keys())
#         self.results = {qid: {} for qid in query_ids}

#         corpus_set = set(self.corpus_embeddings)
#         query_set = set(self.query_embeddings)
#         overlap = query_set.intersection(corpus_set)


#         for qid in overlap:
#             for score, corpus_id in overlap[qid]:
#                 self.results[qid][corpus_id] = 1

#         return self.results

In [32]:
model = PSIRetrievalExactSearch(
    beir_sbert,
    path_corpus_embeddings = "datasets/subquora/corpus_embeddings_full.pkl",
    path_query_embeddings= "datasets/subquora/query_embeddings_full.pkl") # This can go outside the loop

print("here")
with open("datasets/subquora/qrels_full.json", "r") as f:
    qrels = json.load(f)

with open("datasets/subquora/query_embeddings_full.pkl", "rb") as f:
    query_embeddings = pickle.load(f)

with open("datasets/subquora/corpus_embeddings_full.pkl", "rb") as f:
    corpus_embeddings = pickle.load(f)


rounding_recalls_PSI = []
rounding_decimals = range(12,-1,-1)
for decimal in rounding_decimals:
    print(decimal, type(decimal))
    model.add_rounding(decimal)
    print("after add_rounding")
    retriever = EvaluateRetrieval(model, score_function="cos_sim")
    print("after EvaluateRetrieval")
    # print(model.corpus_embeddings, model.query_embeddings)
    # issue maybe you have to pass the correct corpus and queries in
    results = retriever.retrieve(corpus_embeddings, query_embeddings)
    print("after retrieve")
    ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
    print("decimal_places:", decimal , ndcg, _map, recall, precision )
    #recall@100 matters mostr, save to array for plotting
    rounding_recalls_PSI.append(recall)


here
12 <class 'int'>
after add_rounding
after EvaluateRetrieval
after retrieve
decimal_places: 12 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 0.0, 'NDCG@100': 0.0, 'NDCG@1000': 0.0} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 0.0, 'MAP@1000': 0.0} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 0.0, 'Recall@100': 0.0, 'Recall@1000': 0.0} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1000': 0.0}
11 <class 'int'>
after add_rounding
after EvaluateRetrieval
after retrieve
decimal_places: 11 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 0.0, 'NDCG@100': 0.0, 'NDCG@1000': 0.0} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 0.0, 'MAP@1000': 0.0} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 0.0, 'Recall@100': 0.0, 'Recall@1000': 0.0} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1000': 0.0}
10 <class 'int'>
after add_rounding
after EvaluateRetri

In [33]:
noise_recalls_PSI = []
noise_decimals = range(12,-1,-1)
for decimal in range(12,-1,-1):
    # print(decimal, type(decimal))
    model.add_noise(decimal)
    # model.noise()
    retriever = EvaluateRetrieval(model, score_function="cos_sim")
    results = retriever.retrieve(model.corpus_embeddings, model.query_embeddings)
    ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
    print("noise decimal:", decimal , ndcg, _map, recall, precision )
    noise_recalls_PSI.append(recall)

noise decimal: 12 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 0.0, 'NDCG@100': 0.0, 'NDCG@1000': 0.0} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 0.0, 'MAP@1000': 0.0} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 0.0, 'Recall@100': 0.0, 'Recall@1000': 0.0} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1000': 0.0}
noise decimal: 11 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 0.0, 'NDCG@100': 0.0, 'NDCG@1000': 0.0} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 0.0, 'MAP@1000': 0.0} {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 0.0, 'Recall@100': 0.0, 'Recall@1000': 0.0} {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'P@100': 0.0, 'P@1000': 0.0}
noise decimal: 10 {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 0.0, 'NDCG@100': 0.0, 'NDCG@1000': 0.0} {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 0.0, 'MAP@1000': 0.0} {'Reca

In [34]:
import csv
# Headers for CSV file
headers = ['Rounding_Decimals', 'Recall@100', 'Recall@1000']

# Create and write data to CSV file
with open('results/PSI_rounding_recalls_full.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()

    for rounding_decimal, rounding_recall in zip(rounding_decimals, rounding_recalls_PSI):
        row_data = {
            'Rounding_Decimals': rounding_decimal,
            'Recall@100': rounding_recall['Recall@100'],
            'Recall@1000': rounding_recall['Recall@1000']
        }
        writer.writerow(row_data)

In [None]:
# Headers for CSV file
headers = ['Rounding_Decimals', 'Recall@100', 'Recall@1000']

# Create and write data to CSV file
with open('results/PSI_noise_recall_full.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=headers)
    writer.writeheader()

    for rounding_decimal, noise_recall in zip(rounding_decimals, noise_recalls_PSI):
        row_data = {
            'Rounding_Decimals': rounding_decimal,
            'Recall@100': noise_recall['Recall@100'],
            'Recall@1000': noise_recall['Recall@1000']
        }
        writer.writerow(row_data)