## Setup

## Import Libraries

In [103]:
import numpy as np
import pandas as pd

from typing import List, Dict, Tuple, Optional,  Sequence
from tqdm import tqdm
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

### Load and Sample the Dataset

In [104]:
# Passage collection: a two-column TSV read without a header row (passage id, passage text).
# Both columns are forced to str so that ids are never parsed as integers.
collection_df = pd.read_csv(
    "common_dataset_80k.tsv",
    sep="\t",
    header=None,
    names=["pid", "text"],
    dtype={"pid": str, "text": str},
)
collection_df.head(10)

Unnamed: 0,pid,text
0,448,A postal code (also known locally in various E...
1,466,"Therefore, all pathologists must have complete..."
2,646,Obesity is a complex disorder involving an exc...
3,1212,Which president appointed FBI Director James C...
4,1213,"Comey was confirmed by the Senate on July 29, ..."
5,1816,An aneurysm results from weakness in the tissu...
6,2147,Broken or bruised ribs. Introduction. Broken (...
7,3653,United States Office of War Information. The U...
8,3760,Your best bet is to get the biggest computer m...
9,3809,An adiabatic process is any process occurring ...


In [105]:
# Dev queries: a two-column TSV (query id, query text), both kept as strings.
# header=None is spelled out (pandas already implies it when `names` is given) so this
# load reads the same way as the collection load and nobody has to recall the implicit rule.
queries_df = pd.read_csv(
    "queries.dev.tsv",
    sep="\t",
    header=None,
    names=["qid", "query"],
    dtype={"qid": str, "query": str},
)
queries_df.head(10)

Unnamed: 0,qid,query
0,1048578,cost of endless pools/swim spa
1,1048579,what is pcnt
2,1048580,what is pcb waste
3,1048581,what is pbis?
4,1048582,what is paysky
5,1048583,what is paydata
6,1048584,what is pay range for warehouse specialist in ...
7,1048585,what is paula deen's brother
8,1048586,what is paul gum disease
9,1048587,what is patron


## Methodology

### TF-IDF

#### Build TF-IDF Model

In [106]:
def build_tfidf_index(
    texts: List[str],
    stopwords: Optional[str] = "english",
    ngram_range: Tuple[int, int] = (1, 2),  # unigrams + bigrams
    min_df: int | float = 2,
    max_df: int | float = 0.8,
    max_features: Optional[int] = None,
    sublinear_tf: bool = True,
    use_idf: bool = True,
    norm: Optional[str] = "l2",
) -> Tuple[TfidfVectorizer, sparse.csr_matrix]:
    """Fit a TF-IDF vectorizer on ``texts`` and return it with the document matrix.

    Parameters
    ----------
    texts : list[str]
        Raw document texts; row ``i`` of the returned matrix corresponds to ``texts[i]``.
    stopwords : str | None
        Forwarded as ``stop_words``: "english" for the built-in list, None to keep all tokens.
    ngram_range : (int, int)
        Smallest and largest n-gram length to extract.
    min_df : int | float
        Minimum document frequency for a term, as a count (int) or proportion (float).
    max_df : int | float
        Maximum document frequency for a term, as a count (int) or proportion (float).
    max_features : int | None
        If set, cap the vocabulary size (most frequent terms kept).
    sublinear_tf : bool
        Use 1 + log(tf) instead of raw tf (can help long docs dominate less).
    use_idf : bool
        Apply inverse-document-frequency weighting; if False only (normalized) tf remains.
    norm : str | None
        Row normalization applied to the output vectors.

    Returns
    -------
    (TfidfVectorizer, scipy.sparse.csr_matrix)
        The fitted vectorizer and the float32 document-term matrix in CSR format.
    """
    # Tokenisation choices that are fixed for this notebook.
    fixed_options = dict(
        input="content",
        lowercase=True,
        strip_accents="unicode",
        token_pattern=r"(?u)\b\w\w+\b",  # word tokens of two or more characters
        dtype=np.float32,
    )
    # Settings the caller controls through the function arguments.
    tunable_options = dict(
        stop_words=stopwords,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        norm=norm,
        sublinear_tf=sublinear_tf,
        use_idf=use_idf,
    )

    fitted = TfidfVectorizer(**fixed_options, **tunable_options)
    weights = fitted.fit_transform(texts)
    return fitted, weights.tocsr()

In [107]:
# Fit the index over every passage text, using build_tfidf_index's default settings.
print("Building TF‑IDF (stopwords='English')...")
vectorizer, doc_matrix = build_tfidf_index(collection_df["text"])
print(f"TF‑IDF matrix shape: {doc_matrix.shape} (docs x terms)")

Building TF‑IDF (stopwords='English')...


TF‑IDF matrix shape: (80000, 275959) (docs x terms)


In [108]:
def retrieve_topN_for_queries(
    vectorizer: TfidfVectorizer,
    doc_matrix: sparse.csr_matrix,
    doc_ids: Sequence[str],
    queries_df: pd.DataFrame,
    topN: int = 10
) -> Dict[str, List[Tuple[str, float]]]:
    """Score every document against each query and keep the ``topN`` best per query.

    The score is the dot product between a document row and the query vector. This
    equals cosine similarity only when both are L2-normalized (i.e. the vectorizer
    was built with ``norm="l2"``).

    Parameters
    ----------
    vectorizer : TfidfVectorizer
        Fitted vectorizer used to project the query texts into the document term space.
    doc_matrix : scipy.sparse.csr_matrix
        Document-term matrix, one row per document (D x V).
    doc_ids : Sequence[str]
        Document ids aligned with the rows of ``doc_matrix``; converted to ``str``.
    queries_df : pd.DataFrame
        Must contain the columns ``qid`` and ``query``; both are converted to ``str``.
    topN : int
        Number of documents to keep per query (capped at the number of documents).

    Returns
    -------
    dict[str, list[tuple[str, float]]]
        ``{qid: [(pid, score), ...]}`` sorted by descending score. Documents with a
        score of 0.0 can appear when fewer than ``topN`` documents share a term with
        the query. The order among equal scores is not specified. If a qid occurs
        more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If ``topN`` is negative or ``doc_ids`` does not have one entry per matrix row.
    """
    if topN < 0:
        raise ValueError(f"topN must be non-negative, got {topN}")

    doc_ids = [str(doc_id) for doc_id in doc_ids]
    n_docs = doc_matrix.shape[0]
    if len(doc_ids) != n_docs:
        # A misaligned id list would otherwise yield an IndexError or, worse, wrong pids.
        raise ValueError(
            f"doc_ids has {len(doc_ids)} entries but doc_matrix has {n_docs} rows"
        )

    qids = queries_df["qid"].astype(str).tolist()
    qtexts = queries_df["query"].astype(str).tolist()

    results: Dict[str, List[Tuple[str, float]]] = {}
    if not qids:
        # Nothing to retrieve; also avoids handing an empty batch to the vectorizer.
        return results

    # Vectorize all queries in one call instead of once per loop iteration.
    query_matrix = vectorizer.transform(qtexts)          # (Q x V)
    keep = min(topN, n_docs)

    for row, qid in tqdm(enumerate(qids), total=len(qids), desc="Retrieving"):
        qvec = query_matrix[row:row + 1]                 # (1 x V), slice keeps it 2-D
        scores = (doc_matrix @ qvec.T).toarray().ravel() # (D,)

        if keep == 0:
            idx = np.empty(0, dtype=np.intp)
        elif keep == n_docs:
            # Every document is requested: a full sort is needed anyway.
            idx = np.argsort(-scores)
        else:
            # Select the best `keep` candidates in O(D), then sort only those.
            idx_part = np.argpartition(-scores, keep - 1)[:keep]
            idx = idx_part[np.argsort(-scores[idx_part])]

        results[qid] = [(doc_ids[i], float(scores[i])) for i in idx]

    return results

#### Metric for TF-IDF

In [109]:
from eval_metrics import ndcg_at_k, average_precision_at_k, recall_at_k

def evaluate_run_at_k_df(
    run: Dict[str, List[Tuple[str, float]]],   # {qid: [(pid, score), ...]}
    qrels_df: pd.DataFrame,                    # columns: qid, pid, rel
    k: int
) -> dict:
    """Macro nDCG@k / MAP@k / Recall@k over queries present in BOTH run & qrels.

    Queries that appear in only one of ``run`` / ``qrels_df`` are ignored (common IR
    eval convention); to penalize omissions, explicitly inject empty lists into ``run``
    before calling.

    Query and passage ids are compared as strings on both sides, so a qrels frame
    loaded with integer ids still matches a run keyed by strings.

    Parameters
    ----------
    run : dict
        ``{qid: [(pid, score), ...]}`` with each list already in ranked order.
    qrels_df : pd.DataFrame
        Relevance judgments with the columns ``qid``, ``pid`` and ``rel``.
        If a (qid, pid) pair occurs more than once, the last row wins.
    k : int
        Rank cutoff forwarded to every metric.

    Returns
    -------
    dict
        Keys ``ndcg@{k}``, ``map@{k}``, ``recall@{k}`` (means over evaluated queries)
        and ``num_queries`` (how many queries were evaluated). When no query is shared
        between ``run`` and ``qrels_df`` the three metric values are NaN.
    """
    def _macro(vals: List[float]) -> float:
        # np.mean([]) would emit a RuntimeWarning before returning NaN; return NaN quietly.
        return float(np.mean(vals)) if vals else float("nan")

    # Normalize id dtypes so lookups cannot silently miss because of int-vs-str ids.
    qrels_str = qrels_df.assign(
        qid=qrels_df["qid"].astype(str),
        pid=qrels_df["pid"].astype(str),
    )
    run_str = {str(qid): ranking for qid, ranking in run.items()}

    # Build relevance dicts: graded labels for nDCG, the rel > 0 pid set for MAP / Recall.
    rel_dict_by_q: Dict[str, Dict[str, int]] = {}
    rel_set_by_q: Dict[str, set] = {}
    for qid, g in qrels_str.groupby("qid", sort=False):
        rel_dict = dict(zip(g["pid"], g["rel"]))
        rel_dict_by_q[qid] = rel_dict
        rel_set_by_q[qid] = {pid for pid, rel in rel_dict.items() if rel > 0}

    eval_qids = [qid for qid in run_str if qid in rel_dict_by_q]

    ndcg_vals, map_vals, rcl_vals = [], [], []
    for qid in eval_qids:
        ranked_pids = [str(pid) for pid, _ in run_str[qid]]
        ndcg_vals.append(ndcg_at_k(ranked_pids, rel_dict_by_q[qid], k=k))
        map_vals.append(average_precision_at_k(ranked_pids, rel_set_by_q[qid], k=k))
        rcl_vals.append(recall_at_k(ranked_pids, rel_set_by_q[qid], k=k))

    return {
        f"ndcg@{k}": _macro(ndcg_vals),
        f"map@{k}": _macro(map_vals),
        f"recall@{k}": _macro(rcl_vals),
        "num_queries": len(eval_qids)
    }

In [112]:
# NOTE(review): with `names=` and no header row expected, pandas treats the first line of
# the file as data. The recorded run shows 1001 queries for a file named "..._1k.tsv" --
# confirm the file does not start with a header line (such a row would be retrieved for,
# then ignored at evaluation time if its qid is absent from the qrels).
sampled_queries = pd.read_csv(
    "sampled_queries_1k.tsv",
    sep="\t",
    header=None,
    names=["qid", "query"],
    dtype={"qid": str, "query": str},
)
# The qrels file supplies its own header row, so no `names` are passed here.
qrels = pd.read_csv("qrels_for_eval.tsv", sep="\t", dtype={"qid": str, "pid": str, "rel": int})

# One cutoff drives both retrieval depth and the metrics, so raising k can never
# evaluate against rankings that were truncated at a smaller depth.
k = 10

run_sub = retrieve_topN_for_queries(
    vectorizer, doc_matrix, collection_df["pid"], sampled_queries, topN=k
)
metrics = evaluate_run_at_k_df(run_sub, qrels_df=qrels, k=k)

print("\n=== Results (subset-consistent) ===")
print(f"ndcg@{k}:   {metrics[f'ndcg@{k}']:.4f}")
print(f"map@{k}:    {metrics[f'map@{k}']:.4f}")
print(f"recall@{k}: {metrics[f'recall@{k}']:.4f}")



Retrieving:   1%|          | 7/1001 [00:00<00:14, 68.64it/s]

Retrieving: 100%|██████████| 1001/1001 [00:12<00:00, 78.89it/s]



=== Results (subset-consistent) ===
ndcg@10:   0.5588
map@10:    0.5062
recall@10: 0.7188
