# Part 2: Indexing and Evaluation

In [16]:
import os
import pandas as pd
import math
from collections import Counter, defaultdict
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download("punkt")
nltk.download("stopwords")
import IPython.display 
IPython.display.clear_output()
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 100)

## Part I: Indexing

1. Build inverted index: After having pre-processed the data, you can then create the 
inverted index.

In [17]:
input_directory = "../IRWA-2025-data/"
input_name = "fashion_products_dataset_processed.csv"
input_path = os.path.join(input_directory, input_name)
df_fashion_products = pd.read_csv(input_path, sep=",")

print(df_fashion_products.sample(5)["document"])

10696    fit true brown blend women cotton slim bottomwear bl solid western cloth casual kapsonsretailpvt...
17258    mc colour home waist elast cotton solid bottomwear life cloth trackpant pocket white hand wear m...
21818    natur color cloth design arbo wear properli stripe polycotton regular new fit sdm arbor india ha...
10356    true polyest blend women print nehru bl wear sleeveless cloth western jacket kapsonsretailpvtltd...
1754       accessori polyest solid dark bottomwear reeb track cloth adi india pvt pant men blue sport fs ltd
Name: document, dtype: object


In [4]:
# toy dataset
data = {
    "pid": [1, 2, 3],
    "doc_text": [
        "nike running shoes",
        "adidas running jacket",
        "nike jacket blue"
    ]
}

df = pd.DataFrame(data)
print(df)

   pid               doc_text
0    1     nike running shoes
1    2  adidas running jacket
2    3       nike jacket blue


In [5]:
# invert index
def build_inverted_index(df):
    index = {}

    for _, row in df.iterrows():
        doc_id = row["pid"]
        terms = row["doc_text"].split() # tokenize
        for term in set(terms): # duplicates
            if term not in index:
                index[term] = [doc_id]
            else:
                index[term].append(doc_id)

    return index

In [6]:
index = build_inverted_index(df)

for term, docs in index.items():
    print(f"{term}: {docs}")

running: [1, 2]
shoes: [1]
nike: [1, 3]
adidas: [2]
jacket: [2, 3]
blue: [3]


In [7]:
# search
def search_and(query, index):
    terms = build_terms(query)
    result = None

    for term in terms:
        if term in index:
            docs = set(index[term])
            result = docs if result is None else result & docs
        else:
            return []

    return sorted(result)

2. Propose test queries: Define five queries that will be used to evaluate your search 
engine (Be creative).

In [8]:
queries = [
    "nike running shoes",
    "adidas jacket",
    "women cotton dress",
    "black leather bag",
    "discount sport shoes"
]

3. Rank your results: Implement the TF-IDF algorithm and provide ranking-based results.

$$tf_{t,d} = \dfrac{N_{t,d}}{||D||}\tag{1}, \qquad idf_t = log\dfrac{N}{df_t} \qquad w_{t,d}=tf_{t,d}\cdot idf_t$$

In [9]:
# weights
def build_idf(df): # idf
    N = len(df) # total number of docs in a collection
    idf = {} # dict to save idf(t)

    for text in df["doc_text"]:
        words = set(text.split())
        for w in words:
            idf[w] = idf.get(w, 0) + 1

    for w in idf: # idf(t) = log(N / df(t))
        idf[w] = math.log(N / idf[w])

    return idf

def build_tfidf(df, idf): # tf-idf
    docs = {}

    for _, row in df.iterrows():
        pid = row["pid"]
        terms = build_terms(row["doc_text"])
        tf = Counter(terms)
        L = len(terms) or 1
        vec = {t: (tf[t]/L) * idf.get(t, 0.0) for t in tf}
        norm = math.sqrt(sum(v*v for v in vec.values())) or 1.0
        docs[pid] = {t: v/norm for t, v in vec.items()}

    return docs

In [10]:
# ranking
def query_vector(query, idf):
    terms = build_terms(query)
    tf = Counter(terms)
    L = len(terms) or 1
    vec = {t: (tf[t]/L) * idf.get(t, 0.0) for t in tf}
    norm = math.sqrt(sum(v*v for v in vec.values())) or 1.0

    return {t: v/norm for t, v in vec.items()}

def score_cosine(doc_vec, q_vec): # cosine similarity
    s = 0.0

    for t, wq in q_vec.items():
        wd = doc_vec.get(t)
        if wd:
            s += wd * wq

    return s

def rank_tfidf(docs, idf, query):
    qv = query_vector(query, idf)
    scores = [(pid, score_cosine(vec, qv)) for pid, vec in docs.items()]
    scores.sort(key=lambda x: x[1], reverse=True)

    return [(pid, round(s, 4)) for pid, s in scores if s > 0]

In [25]:
def print_rankings(df, queries):
    if isinstance(queries, str):
        queries = [queries]
    idf  = build_idf(df)
    docs = build_tfidf(df, idf)

    for qi, q in enumerate(queries, 1):
        res = rank_tfidf(docs, idf, q)
        print(f"\nquery={qi}: {q}")
        if not res:
            print("no results")
            continue
        for i, (pid, score) in enumerate(res, 1):
            text = df.loc[df["pid"] == pid, "doc_text"].values[0]
            print(f"{i}. document={pid} | score={score:.4f} | text={text}")

In [20]:
df

Unnamed: 0,pid,doc_text
0,1,nike running shoes
1,2,adidas running jacket
2,3,nike jacket blue


In [26]:
print_rankings(df, queries)


query=1: nike running shoes
1. document=1 | score=1.0000 | text=nike running shoes
2. document=3 | score=0.3272 | text=nike jacket blue

query=2: adidas jacket
1. document=2 | score=1.0000 | text=adidas running jacket
2. document=3 | score=0.3272 | text=nike jacket blue

query=3: women cotton dress
no results

query=4: black leather bag
no results

query=5: discount sport shoes
no results


## Part II: Evaluation

### 1. Evaluation metrics

- Precision@K (P@K): Of the K items returned, what fraction are relevant?
- Recall@K (R@K): Of all the relevant items that exist, what fraction did I retrieve in the top K?
- Average Precision@K (AP@K): Average of the precisions at the positions where a relevant item appears (up to K).
- F1@K: Harmonic mean between P@K and R@K.
- MAP: Average (over queries) of AP (or AP@K).
- MRR: Average (over queries) of 1/(position of the first relevant item).
- NDCG@K: Higher scores are given to those who place relevant items higher in the ranking. DCG normalized by the maximum possible (IDCG).

In [13]:
# p@k
def precision_at_k(results, relevant_docs, k):
  if k == 0:
      return 0.0
  results_at_k = results[:k]
  # Count how many of the top K results are in the relevant set
  num_relevant_in_k = len([doc for doc in results_at_k if doc in relevant_docs])
  return num_relevant_in_k / k

In [14]:
# r@k
def recall_at_k(results, relevant_docs, k):
  total_relevant = len(relevant_docs)
  if total_relevant == 0:
      return 0.0  # Recall is 0 if there are no relevant docs

  results_at_k = results[:k]
  # Count how many of the top K results are in the relevant set
  num_relevant_in_k = len([doc for doc in results_at_k if doc in relevant_docs])
  return num_relevant_in_k / total_relevant

In [15]:
# AP
def average_precision_at_k(results, relevant_docs, k):
  relevant_docs_set = set(relevant_docs)
  total_relevant = len(relevant_docs_set)
  if total_relevant == 0:
      return 0.0

  results_at_k = results[:k]
  sum_of_precisions = 0.0
  num_relevant_found = 0

  for i, doc_id in enumerate(results_at_k, 1):
      if doc_id in relevant_docs_set:
          num_relevant_found += 1
          precision_at_i = num_relevant_found / i
          sum_of_precisions += precision_at_i

  # Denominator is the total number of relevant documents
  return sum_of_precisions / total_relevant

In [16]:
# F1@k
def f1_at_k(results, relevant_docs, k):
  p = precision_at_k(results, relevant_docs, k)
  r = recall_at_k(results, relevant_docs, k)

  if (p + r) == 0:
      return 0.0

  return 2 * (p * r) / (p + r)

In [17]:
# MAP@k
def map_at_k(all_results, all_relevant_docs, k):
  if not all_results:
      return 0.0

  num_queries = len(all_results)
  sum_of_ap = 0.0

  for results, relevant_docs in zip(all_results, all_relevant_docs):
      sum_of_ap += average_precision_at_k(results, relevant_docs, k)

  return sum_of_ap / num_queries

In [18]:
# MRR
def MRR_at_k(all_results, all_relevant_docs, k):
  if not all_results:
      return 0.0

  num_queries = len(all_results)
  sum_of_rr = 0.0

  for results, relevant_docs in zip(all_results, all_relevant_docs):
      relevant_docs_set = set(relevant_docs)
      results_at_k = results[:k]
      for i, doc_id in enumerate(results_at_k, 1):
          if doc_id in relevant_docs_set:
              sum_of_rr += (1.0 / i)
              break  # Found the first relevant item, move to next query

  return sum_of_rr / num_queries

In [19]:
# nDCG@K
def ndcg_at_k(results, relevant_docs, k):
  relevant_docs_set = set(relevant_docs)
  results_at_k = results[:k]

  # Calculate DCG@k
  dcg = 0.0
  for i, doc_id in enumerate(results_at_k, 1):
      if doc_id in relevant_docs_set:
          gain = 1.0
          dcg += gain / math.log2(i + 1) # discount factor

  # Calculate IDCG@K
  idcg = 0.0
  # Ideal ranking has all relevant docs first, up to k
  num_relevant_in_ideal_list = min(len(relevant_docs_set), k)
  for i in range(1, num_relevant_in_ideal_list + 1):
      gain = 1.0
      idcg += gain / math.log2(i + 1)

  if idcg == 0:
      return 0.0  # No relevant docs, or k=0

  return dcg / idcg