## import packages

In [5]:
import pandas as pd
from tqdm import tqdm
from app.services.evaluation.evaluator import compute_map,compute_recall
from app.services.tfidf_service import VectorSpaceModel
import os


dataset_name1='nano-beir/arguana'
dataset_name2='beir/webis-touche2020/v2'
dataset_name3='beir/quora/test'

datasetname=dataset_name3
name=datasetname.replace("/", "-").replace("\\", "_").strip()

## Load queries and qrels files

In [6]:
if not datasetname:
    raise ValueError("datasetname variable is not defined")

qrels_df = pd.read_csv(f"data/{datasetname}/qrels.tsv", sep="\t", names=["query_id", "doc_id", "relevance"])
# print(qrels_df)
queries_df = pd.read_csv(f"data/{datasetname}/queries.tsv", sep="\t", names=["query_id", "text"])


# make instance from VSM

In [7]:
vsm=VectorSpaceModel(datasetname)
vsm.load()


🔍 Loading TF-IDF vectorizer, matrix, inverted index...


## compute MAP

In [None]:
map_scores = []
query_ids = []

for _, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
    query_id = row["query_id"]
    query_text = row["text"]
    
    # Get top_k docs from your system
    search_results = vsm.search_with_inverted_index(query_text, top_k=10000)


    # Access the list of result dicts
    result_items = search_results["results"]
    retrieved_doc_ids = [str(doc["doc_id"]) for doc in result_items]

    map_score = compute_map(retrieved_doc_ids, qrels_df, query_id)
    print(f"\n map_score: {map_score}")
    map_scores.append(map_score)
    query_ids.append(query_id)
    # print(f"\n✅ Mean Average Precision (MAP): {map_score:.4f}")
    map_df = pd.DataFrame({
    "query_id": query_ids,
    "map_score": map_scores
})

output_dir = os.path.join("results", "MAP", "TF-IDF", name)
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, f"{name}_map_scores.csv")
try:
    map_df.to_csv(output_path, index=False)
except Exception as e:
    print(f"Error saving file to {output_path}: {str(e)}")
    raise


# Also print overall MAP
overall_map = sum(map_scores) / len(map_scores)
print(f"\n📈 Overall MAP: {overall_map:.4f}")


## RECALL

In [4]:
recall_scores = []

for i, row in queries_df.iterrows():
    query_id = str(row["query_id"])
    query_text = row["text"]

    # Get ALL retrieved docs (no top_k cutoff)
    results = vsm.search_with_inverted_index(query_text, top_k=10000)  # or async call with await
    retrieved_doc = results["matched_count"]
    print(f"retrieved_doc: {retrieved_doc}")

    recall = compute_recall(retrieved_doc, qrels_df, query_id)
    print(f"recall: {recall}")
    recall_scores.append({"query_id": query_id, "recall": recall})



recall_df = pd.DataFrame(recall_scores)
output_dir = f"results/Recall/TF-IDF"
os.makedirs(output_dir, exist_ok=True)
recall_df.to_csv(f"{output_dir}/{name}_recall_scores.csv", index=False)


🔍 Searching with inverted index...
🔍 Loading TF-IDF vectorizer, matrix, inverted index...
🔍 Building query inverted index...
retrieved_doc: 9681
recall: 0.0
🔍 Searching with inverted index...
🔍 Loading TF-IDF vectorizer, matrix, inverted index...
🔍 Building query inverted index...
retrieved_doc: 1239
recall: 0.0
🔍 Searching with inverted index...
🔍 Loading TF-IDF vectorizer, matrix, inverted index...
🔍 Building query inverted index...
retrieved_doc: 32426
recall: 0.0
🔍 Searching with inverted index...
🔍 Loading TF-IDF vectorizer, matrix, inverted index...
🔍 Building query inverted index...
retrieved_doc: 11913
recall: 0.0
🔍 Searching with inverted index...
🔍 Loading TF-IDF vectorizer, matrix, inverted index...
🔍 Building query inverted index...
retrieved_doc: 5813
recall: 0.0
🔍 Searching with inverted index...
🔍 Loading TF-IDF vectorizer, matrix, inverted index...
🔍 Building query inverted index...


KeyboardInterrupt: 