## import packages

In [2]:
import pandas as pd
from tqdm import tqdm
from app.services.evaluator import compute_recall,compute_precision,compute_average_precision,compute_precision_at_k,compute_mrr
from app.services.tfidf_service import VectorSpaceModel
import os


dataset_name1='nano-beir/arguana'
dataset_name2='beir/webis-touche2020/v2'
dataset_name3='beir/quora/test'
dataset_name4='antique/test'

datasetname=dataset_name4
name=datasetname.replace("/", "-").replace("\\", "_").strip()

## Load queries and qrels files

In [3]:
if not datasetname:
    raise ValueError("datasetname variable is not defined")

qrels_df = pd.read_csv(f"data/{name}/qrels.tsv", sep="\t", names=["query_id", "doc_id", "relevance"])
# print(qrels_df)
queries_df = pd.read_csv(f"data/{name}/queries.tsv", sep="\t", names=["query_id", "text"])


# make instance from VSM

In [4]:
vsm=VectorSpaceModel(datasetname)
vsm.load()


🔍 Loading TF-IDF vectorizer, matrix, inverted index...


## Evaluation 

In [5]:



all_avg_precisions = []
all_prec_at_10 = []
all_mrr_ranks = []
all_recall = []

for _, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
    query_id = row["query_id"]
    query_text = row["text"]

    # Use your VSM search function
    search_results = vsm.search_tfidf(query_text,top_k=100000)
     # Access the list of result dicts
    result_items = search_results["results"]
    retrieved_docs = [str(doc["doc_id"]) for doc in result_items]

    # Step 2: Convert both retrieved and relevant doc_ids to strings
    relevant_docs = set(qrels_df[qrels_df["query_id"] == query_id]["doc_id"].astype(str))


    # retrieved_docs = [doc["doc_id"] for doc in search_results]

    # Compute metrics
    avg_precision = compute_average_precision(relevant_docs, retrieved_docs)
    prec_at_10 = compute_precision_at_k(relevant_docs, retrieved_docs, k=10)
    recall=compute_recall(relevant_docs, retrieved_docs)

    # Compute rank of first relevant doc for MRR
    rank = 0
    for i, doc_id in enumerate(retrieved_docs):
        if doc_id in relevant_docs:
            rank = i + 1
            break

    all_avg_precisions.append(avg_precision)
    all_prec_at_10.append(prec_at_10)
    all_mrr_ranks.append(rank)
    all_recall.append(recall)

# Final scores
map_score = sum(all_avg_precisions) / len(all_avg_precisions)
mean_prec_at_10 = sum(all_prec_at_10) / len(all_prec_at_10)
mrr_score = compute_mrr(all_mrr_ranks)

print(f"MAP: {map_score:.4f}")
print(f"MRR: {mrr_score:.4f}")
print(f"All Recall values: {all_recall}")
print(f"All Precision@10 values: {all_prec_at_10}")


100%|██████████| 200/200 [00:38<00:00,  5.22it/s]

MAP: 0.1834
MRR: 0.6814
All Recall values: [0.9444444444444444, 0.8666666666666667, 0.8260869565217391, 0.7804878048780488, 0.9032258064516129, 0.6388888888888888, 0.8787878787878788, 0.8620689655172413, 0.9032258064516129, 0.3333333333333333, 0.7297297297297297, 0.782608695652174, 0.92, 0.9354838709677419, 1.0, 0.7741935483870968, 0.9565217391304348, 0.6136363636363636, 0.5, 0.9666666666666667, 0.525, 0.7333333333333333, 0.8918918918918919, 0.9230769230769231, 0.9, 0.7352941176470589, 0.7647058823529411, 0.5862068965517241, 0.9117647058823529, 0.825, 0.75, 0.8148148148148148, 0.88, 0.717948717948718, 0.9655172413793104, 0.8928571428571429, 0.7142857142857143, 0.8, 0.6571428571428571, 0.9655172413793104, 0.6428571428571429, 0.9, 1.0, 0.8888888888888888, 0.9166666666666666, 0.8529411764705882, 0.8666666666666667, 0.8285714285714286, 0.8461538461538461, 0.9666666666666667, 0.9666666666666667, 0.8064516129032258, 0.5, 0.8048780487804879, 0.8285714285714286, 0.75, 0.625, 0.7857142857142857




## save result

In [10]:
summary = {
    "Mean Average Precision": map_score,
    "Mean Reciprocal Rank": mrr_score
}
import json

output_dir = os.path.join("results", "TF-IDF", name)
output_path = os.path.join(output_dir,"evaluation_summary.json")

with open(output_path, "w") as f:
    json.dump(summary, f)