<a href="https://colab.research.google.com/github/vinupk/Information-Retrieval/blob/main/CA6005_Information_retrieval_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install xmltodict
! pip install pytrec_eval

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0
Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308109 sha256=95f744adf42ee2773ecc9175c34cd5b02b9d565cb5e26aa25e2dd33fbef15e67
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


In [None]:
import xml.etree.ElementTree as ET
import re
import xmltodict
import string
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from math import log
from pytrec_eval import RelevanceEvaluator
import pandas as pd
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## **Read Cranfield XML file**

In [None]:
def read_cranfield_xml(xml_file):
  with open(xml_file) as f:
      xml = f.read()
  _dict = xmltodict.parse(xml, attr_prefix="")
  return _dict

# **Read text file**

In [None]:
def get_documents_collection(doc):
  for root_key, root_value in doc.items():
    if root_key == "root" and isinstance(root_value, dict):
            for doc_key, doc_value in root_value.items():
                if doc_key == "doc" and isinstance(doc_value, list):
                  dids, docs = [item['docno'] for item in doc_value], [str(item['title']) + " " + str(item['text'])  for item in doc_value]
    return dids,docs

In [None]:
def get_query_collection(doc):
  for root_key, root_value in doc.items():
    if root_key == "xml" and isinstance(root_value, dict):
            for doc_key, doc_value in root_value.items():
                if doc_key == "top" and isinstance(doc_value, list):
                  dids, docs = [item['num'] for item in doc_value], [str(item['title'])  for item in doc_value]
    return dids,docs

In [None]:
def read_qrels_file(file_path):
    ground_truth = defaultdict(dict)
    with open(file_path, 'r') as file:
        for line in file:
            query_id, _, doc_id, relevance = line.strip().split()
            ground_truth[query_id][doc_id] = int(relevance)
    return ground_truth

# **Preprocessing text**

In [None]:
def preprocess_text(docs):
  #Step 1: join text
  text = ' '.join(map(str,docs))

  #Step 2: Normalize to lowercase
  text = text.lower()

  #Step 3: Tokanize
  tokens = word_tokenize(text)

  # remove punctuation from each word
  punc = str.maketrans('', '', string.punctuation)
  non_punc = [w.translate(punc) for w in tokens]

  #Step 4:  remove non alphabetic tokens
  words = [word for word in non_punc if word.isalpha()]

  #Step 5: remove stop words from tokens
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if not w in stop_words]

  #Step 6: stemming of tokens
  stemmer = PorterStemmer()
  stemmed_text = [stemmer.stem(word) for word in words]

  #Step 7: Lemma from stemmed tokens
  lemmatizer = WordNetLemmatizer()
  lemma = [lemmatizer.lemmatize(stem) for stem in stemmed_text]

  return lemma

In [None]:
def calc_term_freequency(tokens):
    tf_score = defaultdict(int)
    for token in tokens:
        tf_score[token] = tokens.count(token)
    return tf_score

In [None]:
def build_inverted_index(dids, documents):
    inverted_index = defaultdict(dict)
    doc_lengths = {}
    for doc_id, doc_text in zip(dids, documents):
        tokens = preprocess_text(doc_text)
        doc_lengths[doc_id] = len(tokens)

        #Find Term freequency
        term_freqs = defaultdict(int)
        term_freqs = calc_term_freequency(tokens)

        #Find inverted term freequency
        for term, freq in term_freqs.items():
            inverted_index[term][doc_id] = freq

    return inverted_index, doc_lengths

In [None]:
# Function to calculate inverse document frequency (IDF)
def calculate_tf_idf(tf, df, num_docs):
    return (1 + log(tf)) * log(num_docs / df)

In [None]:
def build_tf_idf_index(inverted_index, doc_lengths, num_docs):
    tf_idf_index = defaultdict(dict)

    for term, postings in inverted_index.items():
        df = len(postings)
        for doc_id, tf in postings.items():
            tf_idf_index[term][doc_id] = calculate_tf_idf(tf, df, num_docs) / doc_lengths[doc_id]

    return tf_idf_index

In [None]:
def query_processing_VSM(query, inverted_index, tf_idf_index, doc_lengths, num_docs):
    query_terms = preprocess_text(query)
    query_vector = defaultdict(int)

    for term in query_terms:
        query_vector[term] += 1

    scores = defaultdict(float)

    for term, query_tf in query_vector.items():
        if term in inverted_index:
            for doc_id, doc_tf_idf in tf_idf_index[term].items():
                scores[doc_id] += query_tf * doc_tf_idf

    # Normalize scores by document length
    for doc_id, score in scores.items():
        scores[doc_id] = score / doc_lengths[doc_id]

    # Rank documents based on scores
    ranked_docs = defaultdict(float, {doc_id: score for doc_id, score in sorted(scores.items(), key=lambda x: x[1], reverse=True)})

    return ranked_docs

In [None]:
def calculate_bm25(tf, df, doc_length, avg_doc_length,num_docs, k1, b):
    # BM25 parameters
    k1 = k1
    b = b

    # Calculate IDF
    idf = log((num_docs - df + 0.5) / (df + 0.5) + 1.0)

    # Calculate BM25 score
    bm25 = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_length / avg_doc_length)))

    return bm25

In [None]:
def build_bm25_index(inverted_index, doc_lengths, num_docs, k1=1.5, b=0.75):
    bm25_index = defaultdict(dict)
    avg_doc_length = sum(doc_lengths.values())/num_docs
    for term, postings in inverted_index.items():
        df = len(postings)
        for doc_id, tf in postings.items():
            doc_length = doc_lengths[doc_id]
            bm25_index[term][doc_id] = calculate_bm25(tf, df, doc_length, avg_doc_length ,num_docs, k1, b)

    return bm25_index

In [None]:
def query_processing_bm25(query, inverted_index, doc_lengths, num_docs):
    query_terms = preprocess_text(query)
    scores = defaultdict(float)
    bm25_index = build_bm25_index(inverted_index, doc_lengths, num_docs)
    for term in query_terms:
        if term in inverted_index:
            for doc_id, doc_bm25 in bm25_index[term].items():
                scores[doc_id] += doc_bm25

    # Rank documents based on scores
    ranked_docs = defaultdict(float, {doc_id: score for doc_id, score in sorted(scores.items(), key=lambda x: x[1], reverse=True)})

    return ranked_docs

 Language Model with Dirichlet Smoothing as another language model for ranking the documents. The Dirichlet Smoothing technique is commonly used to address the issue of unseen terms in the query-document retrieval process

In [None]:
def calculate_language_model_prob(term, doc_id, inverted_index, doc_lengths, mu=1000):
    # Calculate the probability of the term in the language model using Dirichlet Smoothing
    total_terms = sum(inverted_index[term].values())
    doc_length = doc_lengths[doc_id]
    background_prob = total_terms / sum(doc_lengths.values())
    term_freq = inverted_index[term].get(doc_id, 0)

    return (term_freq + mu * background_prob) / (doc_length + mu)

In [None]:
def query_processing_language_model(query, inverted_index, doc_lengths):
    query_terms = preprocess_text(query)
    scores = defaultdict(float)

    for doc_id in doc_lengths:
        score = 1.0
        for term in query_terms:
            prob = calculate_language_model_prob(term, doc_id, inverted_index, doc_lengths)
            score *= prob
        scores[doc_id] = score

    # Rank documents based on scores
    ranked_docs = defaultdict(float, {doc_id: score for doc_id, score in sorted(scores.items(), key=lambda x: x[1], reverse=True)})

    return ranked_docs

**Step 1: Replace Symbols and Whitespaces**



In [36]:
#Read Document cranfield file
cranfield_collection_path = '/content/cranfield/cran.all.1400.xml'
documents = read_cranfield_xml(cranfield_collection_path)
dids, docs  = get_documents_collection(documents)

# Build the inverted index and calculate document lengths
inverted_index, doc_lengths = build_inverted_index(dids, docs)

# Calculate the number of documents
num_docs = len(documents)

#Read query file
cranfield_collection_path = '/content/cranfield/cran.qry.xml'
query = read_cranfield_xml(cranfield_collection_path)
qids, querys = get_query_collection(query)

# Build the TF-IDF VSM index
tf_idf_index = build_tf_idf_index(inverted_index, doc_lengths, num_docs)

qrels_file_path = '/content/cranfield/cranqrel.trec.txt'  # Replace with the actual path to the qrels.text file
ground_truth = read_qrels_file(qrels_file_path)

# Update the data type of the collections to defaultdict(dict)
ranked_docs_vsm_collection = defaultdict(dict)
ranked_docs_bm25_collection = defaultdict(dict)
ranked_docs_language_model_collection = defaultdict(dict)

for qid, query in zip(qids, querys):
    # Ranking using Vector Space Model
    ranked_docs_vsm_collection[qid] = query_processing_VSM(query, inverted_index, tf_idf_index, doc_lengths, num_docs)

    # Process the query and get ranked documents using BM25
    ranked_docs_bm25_collection[qid] = query_processing_bm25(query, inverted_index, doc_lengths, num_docs)

    # Process the query and get ranked documents using the Language Model with Dirichlet Smoothing
    ranked_docs_language_model_collection[qid] = query_processing_language_model(query, inverted_index, doc_lengths)

# Convert the ranked documents to the required format for evaluation
ranked_docs_vsm_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_vsm_collection[qid].items(), 1)} for qid in ranked_docs_vsm_collection}
ranked_docs_bm25_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_bm25_collection[qid].items(), 1)} for qid in ranked_docs_bm25_collection}
ranked_docs_language_model_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_language_model_collection[qid].items(), 1)} for qid in ranked_docs_language_model_collection}

# Prepare the ground truth data for evaluation
qrels = {}
for qid, tuples in ground_truth.items():
    qrels[qid] = {}
    for doc_id, relevance in tuples.items():
        qrels[qid][doc_id] = relevance

# Create the evaluator instance
evaluator = RelevanceEvaluator(qrels, {'map', 'P_5', 'ndcg'})

# Evaluate the ranking models
trec_eval_results_vsm = evaluator.evaluate(ranked_docs_vsm_trec)
trec_eval_results_bm25 = evaluator.evaluate(ranked_docs_bm25_trec)
trec_eval_results_language_model = evaluator.evaluate(ranked_docs_language_model_trec)

# Print the evaluation results
print("TREC Evaluation Results:")
print("+---------+---------+---------+---------+-----------------------+")
print("| Model   | QueryID | DocID   | Rank    | Similarity (Score)    |")
print("+---------+---------+---------+---------+-----------------------+")

# Write the evaluation results to a file
with open("evaluation_results.txt", "w") as f:
    f.write("TREC Evaluation Results:\n")
    f.write("+---------+---------+---------+---------+-----------------------+\n")
    f.write("| Model   | QueryID | DocID   | Rank    | Similarity (Score)    |\n")
    f.write("+---------+---------+---------+---------+-----------------------+\n")

    for query_id in trec_eval_results_vsm.keys():
        for doc_id, rank in ranked_docs_vsm_trec[query_id].items():
            similarity = trec_eval_results_vsm[query_id]['P_5']
            line = f"| VSM     | Q{query_id}  | {doc_id}  | {rank}  | {similarity:0.4f}          |\n"
            print(line, end="")
            f.write(line)

    for query_id in trec_eval_results_bm25.keys():
        for doc_id, rank in ranked_docs_bm25_trec[query_id].items():
            similarity = trec_eval_results_bm25[query_id]['P_5']
            line = f"| BM25    | Q{query_id}  | {doc_id}  | {rank}  | {similarity:0.4f}          |\n"
            print(line, end="")
            f.write(line)

    for query_id in trec_eval_results_language_model.keys():
        for doc_id, rank in ranked_docs_language_model_trec[query_id].items():
            similarity = trec_eval_results_language_model[query_id]['P_5']
            line = f"| Language| Q{query_id}  | {doc_id}  | {rank}  | {similarity:0.4f}          |\n"
            print(line, end="")
            f.write(line)

    f.write("+---------+---------+---------+---------+-----------------------+\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| Language| Q219  | 154  | 601  | 0.0000          |
| Language| Q219  | 771  | 602  | 0.0000          |
| Language| Q219  | 954  | 603  | 0.0000          |
| Language| Q219  | 754  | 604  | 0.0000          |
| Language| Q219  | 832  | 605  | 0.0000          |
| Language| Q219  | 484  | 606  | 0.0000          |
| Language| Q219  | 310  | 607  | 0.0000          |
| Language| Q219  | 901  | 608  | 0.0000          |
| Language| Q219  | 334  | 609  | 0.0000          |
| Language| Q219  | 1005  | 610  | 0.0000          |
| Language| Q219  | 892  | 611  | 0.0000          |
| Language| Q219  | 1220  | 612  | 0.0000          |
| Language| Q219  | 699  | 613  | 0.0000          |
| Language| Q219  | 883  | 614  | 0.0000          |
| Language| Q219  | 815  | 615  | 0.0000          |
| Language| Q219  | 1026  | 616  | 0.0000          |
| Language| Q219  | 444  | 617  | 0.0000          |
| Language| Q219  | 466  | 618  | 0.0000        

In [None]:
# Update the data type of the collections to defaultdict(dict)
ranked_docs_vsm_collection = defaultdict(dict)
ranked_docs_bm25_collection = defaultdict(dict)
ranked_docs_language_model_collection = defaultdict(dict)

for qid, query in zip(qids, querys):
    # Ranking using Vector Space Model
    ranked_docs_vsm_collection[qid] = query_processing_VSM(query, inverted_index, tf_idf_index, doc_lengths, num_docs)

    # Process the query and get ranked documents using BM25
    ranked_docs_bm25_collection[qid] = query_processing_bm25(query, inverted_index, doc_lengths, num_docs)

    # Process the query and get ranked documents using the Language Model with Dirichlet Smoothing
    ranked_docs_language_model_collection[qid] = query_processing_language_model(query, inverted_index, doc_lengths)

# Convert the ranked documents to the required format for evaluation
ranked_docs_vsm_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_vsm_collection[qid].items(), 1)} for qid in ranked_docs_vsm_collection}
ranked_docs_bm25_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_bm25_collection[qid].items(), 1)} for qid in ranked_docs_bm25_collection}
ranked_docs_language_model_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_language_model_collection[qid].items(), 1)} for qid in ranked_docs_language_model_collection}

# Prepare the ground truth data for evaluation
qrels = {}
for qid, tuples in ground_truth.items():
    qrels[qid] = {}
    for doc_id, relevance in tuples.items():
        qrels[qid][doc_id] = relevance

# Create the evaluator instance
evaluator = RelevanceEvaluator(qrels, {'map', 'P_5', 'ndcg'})

# Evaluate the ranking models
trec_eval_results_vsm = evaluator.evaluate(ranked_docs_vsm_trec)
trec_eval_results_bm25 = evaluator.evaluate(ranked_docs_bm25_trec)
trec_eval_results_language_model = evaluator.evaluate(ranked_docs_language_model_trec)

# Print the evaluation results
print("TREC Evaluation Results:")
print("+------------+---------+---------+---------+")
print("|  Model     |  P@5    |  NDCG@5 |  MAP    |")
print("+------------+---------+---------+---------+")
for query_id in trec_eval_results_vsm.keys():
    print(f"|  VSM (Q{query_id})  |  {trec_eval_results_vsm[query_id]['P_5']:0.4f}  |  {trec_eval_results_vsm[query_id]['ndcg']:0.4f}  |  {trec_eval_results_vsm[query_id]['map']:0.4f}  |")
    print(f"|  BM25 (Q{query_id}) |  {trec_eval_results_bm25[query_id]['P_5']:0.4f}  |  {trec_eval_results_bm25[query_id]['ndcg']:0.4f}  |  {trec_eval_results_bm25[query_id]['map']:0.4f}  |")
    print(f"|  Language (Q{query_id}) |  {trec_eval_results_language_model[query_id]['P_5']:0.4f}  |  {trec_eval_results_language_model[query_id]['ndcg']:0.4f}  |  {trec_eval_results_language_model[query_id]['map']:0.4f}  |")

In [None]:
ranked_docs_vsm_collection

In [None]:
for query_id, documents in ranked_docs_vsm_collection.items():
  print(f"Query ID: {query_id}")
  for doc_id, rank in documents.items():
    print(f"Document ID: {doc_id}, Rank: {rank}")

In [None]:
for query_id, documents in ground_truth.items():
    print(f"Query ID: {query_id}")
    for doc_id, relevance in documents.items():
        print(f"Document ID: {doc_id}, Relevance: {relevance}")

In [30]:
# Prepare the ground truth data for evaluation
qrels = {}
for qid, tuples in ground_truth.items():
    qrels[qid] = {}
    for doc_id, relevance in tuples.items():
        qrels[qid][doc_id] = relevance

In [33]:
# Convert the ranked documents to the required format for evaluation
ranked_docs_vsm_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_vsm_collection[qid].items(), 1)} for qid in ranked_docs_vsm_collection}
ranked_docs_bm25_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_bm25_collection[qid].items(), 1)} for qid in ranked_docs_bm25_collection}
ranked_docs_language_model_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_language_model_collection[qid].items(), 1)} for qid in ranked_docs_language_model_collection}

# Prepare the ground truth data for evaluation
qrels = {}
for qid, tuples in ground_truth.items():
    qrels[qid] = {}
    for doc_id, relevance in tuples.items():
        qrels[qid][doc_id] = relevance

# Create the evaluator instance
evaluator = RelevanceEvaluator(qrels, {'map', 'P_5', 'ndcg'})

# Evaluate the ranking models
trec_eval_results_vsm = evaluator.evaluate(ranked_docs_vsm_trec)
trec_eval_results_bm25 = evaluator.evaluate(ranked_docs_bm25_trec)
trec_eval_results_language_model = evaluator.evaluate(ranked_docs_language_model_trec)

# Print the evaluation results
print("TREC Evaluation Results:")
print("+------------+---------+---------+---------+")
print("|  Model     |  P@5    |  NDCG@5 |  MAP    |")
print("+------------+---------+---------+---------+")
for query_id in trec_eval_results_vsm.keys():
    print(f"|  VSM (Q{query_id})  |  {trec_eval_results_vsm[query_id]['P_5']:0.4f}  |  {trec_eval_results_vsm[query_id]['ndcg']:0.4f}  |  {trec_eval_results_vsm[query_id]['map']:0.4f}  |")
    print(f"|  BM25 (Q{query_id}) |  {trec_eval_results_bm25[query_id]['P_5']:0.4f}  |  {trec_eval_results_bm25[query_id]['ndcg']:0.4f}  |  {trec_eval_results_bm25[query_id]['map']:0.4f}  |")
    print(f"|  Language (Q{query_id}) |  {trec_eval_results_language_model[query_id]['P_5']:0.4f}  |  {trec_eval_results_language_model[query_id]['ndcg']:0.4f}  |  {trec_eval_results_language_model[query_id]['map']:0.4f}  |")

TREC Evaluation Results:
+------------+---------+---------+---------+
|  Model     |  P@5    |  NDCG@5 |  MAP    |
+------------+---------+---------+---------+
|  VSM (Q1)  |  0.0000  |  0.3991  |  0.0341  |
|  BM25 (Q1) |  0.0000  |  0.3653  |  0.0219  |
|  Language (Q1) |  0.0000  |  0.3733  |  0.0239  |
|  VSM (Q2)  |  0.0000  |  0.3396  |  0.0173  |
|  BM25 (Q2) |  0.0000  |  0.3544  |  0.0219  |
|  Language (Q2) |  0.0000  |  0.3353  |  0.0171  |
|  VSM (Q4)  |  0.0000  |  0.1251  |  0.0017  |
|  BM25 (Q4) |  0.0000  |  0.1366  |  0.0029  |
|  Language (Q4) |  0.0000  |  0.1497  |  0.0049  |
|  VSM (Q8)  |  0.0000  |  0.2398  |  0.0071  |
|  BM25 (Q8) |  0.0000  |  0.2456  |  0.0077  |
|  Language (Q8) |  0.0000  |  0.2708  |  0.0132  |
|  VSM (Q9)  |  0.0000  |  0.1941  |  0.0114  |
|  BM25 (Q9) |  0.0000  |  0.1418  |  0.0020  |
|  Language (Q9) |  0.0000  |  0.1702  |  0.0057  |
|  VSM (Q10)  |  0.2000  |  0.3579  |  0.0621  |
|  BM25 (Q10) |  0.0000  |  0.2010  |  0.0038  |
| 

In [None]:


# Convert the ranked documents to the required format for evaluation
ranked_docs_vsm_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_vsm_collection[qid].items(), 1)} for qid in ranked_docs_vsm_collection}
ranked_docs_bm25_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_bm25_collection[qid].items(), 1)} for qid in ranked_docs_bm25_collection}
ranked_docs_language_model_trec = {str(qid): {str(doc_id): rank for rank, (doc_id, _) in enumerate(ranked_docs_language_model_collection[qid].items(), 1)} for qid in ranked_docs_language_model_collection}

In [None]:
# Convert the ranked documents to the required format for evaluation
ranked_docs_vsm_trec = {str(qid): {str(doc_id): {'rank': rank, 'relevance': 1 if doc_id in ground_truth.get(qid, {}) else 0} for rank, (doc_id, _) in enumerate(ranked_docs_vsm_collection[qid], 1)} for qid in ranked_docs_vsm_collection}
ranked_docs_bm25_trec = {str(qid): {str(doc_id): {'rank': rank, 'relevance': 1 if doc_id in ground_truth.get(qid, {}) else 0} for rank, (doc_id, _) in enumerate(ranked_docs_bm25_collection[qid], 1)} for qid in ranked_docs_bm25_collection}
ranked_docs_language_model_trec = {str(qid): {str(doc_id): {'rank': rank, 'relevance': 1 if doc_id in ground_truth.get(qid, {}) else 0} for rank, (doc_id, _) in enumerate(ranked_docs_language_model_collection[qid], 1)} for qid in ranked_docs_language_model_collection}

# Evaluate the ranking models using pytrec_eval
evaluator = RelevanceEvaluator(qrels, {'map', 'P_5', 'ndcg'})
trec_eval_results_vsm = evaluator.evaluate(ranked_docs_vsm_trec)
trec_eval_results_bm25 = evaluator.evaluate(ranked_docs_bm25_trec)
trec_eval_results_language_model = evaluator.evaluate(ranked_docs_language_model_trec)

# Print the evaluation results
print("TREC Evaluation Results:")
print("+------------+---------+---------+---------+")
print("|  Model     |  P@5    |  NDCG@5 |  MAP    |")
print("+------------+---------+---------+---------+")
for query_id in trec_eval_results_vsm.keys():
    print(f"|  VSM (Q{query_id})  |  {trec_eval_results_vsm[query_id]['P_5']:0.4f}  |  {trec_eval_results_vsm[query_id]['ndcg']:0.4f}  |  {trec_eval_results_vsm[query_id]['map']:0.4f}  |")
    print(f"|  BM25 (Q{query_id}) |  {trec_eval_results_bm25[query_id]['P_5']:0.4f}  |  {trec_eval_results_bm25[query_id]['ndcg']:0.4f}  |  {trec_eval_results_bm25[query_id]['map']:0.4f}  |")
    print(f"|  Language (Q{query_id}) |  {trec_eval_results_language_model[query_id]['P_5']:0.4f}  |  {trec_eval_results_language_model[query_id]['ndcg']:0.4f}  |  {trec_eval_results_language_model[query_id]['map']:0.4f}  |")

ValueError: ignored

In [None]:
ranked_dict_vsm = {}
for ranked_docs_vsm in ranked_docs_vsm_collection.items():
  for query_id, (doc_id, _) in enumerate(ranked_docs_vsm, 1):
    ranked_dict_vsm.setdefault(str(query_id), {})[doc_id] = query_id

ValueError: ignored

In [None]:
def evaluate(ground_truth, ranked_docs_vsm_collection, ranked_docs_bm25_collection, ranked_docs_language_model_collection):
  ranked_dict_vsm = {}
  ranked_dict_bm25 = {}
  ranked_dict_language_model = {}
  for ranked_docs_vsm, ranked_docs_bm25, ranked_docs_language_model in zip(ranked_docs_vsm_collection, ranked_docs_bm25_collection, ranked_docs_language_model_collection):
    for query_id, (doc_id, _) in enumerate(ranked_docs_vsm.items(), 1):
      ranked_dict_vsm.setdefault(str(query_id), {})[doc_id] = query_id



In [None]:
# Step 2: Convert ranked_docs to dictionaries for each model
ranked_dict_vsm = {}
ranked_dict_bm25 = {}
ranked_dict_language_model = {}

for query_id, (doc_id, _) in enumerate(ranked_docs_vsm.items(), 1):
    ranked_dict_vsm.setdefault(str(query_id), {})[doc_id] = query_id

for query_id, (doc_id, _) in enumerate(ranked_docs_bm25.items(), 1):
    ranked_dict_bm25.setdefault(str(query_id), {})[doc_id] = query_id

for query_id, (doc_id, _) in enumerate(ranked_docs_language_model.items(), 1):
    ranked_dict_language_model.setdefault(str(query_id), {})[doc_id] = query_id
print(ranked_dict_language_model)

# Step 4: Create TrecEval objects for each model
trec_eval_vsm = RelevanceEvaluator(ground_truth, {'map', 'P_5', 'ndcg'})
trec_eval_bm25 = RelevanceEvaluator(ground_truth, {'map', 'P_5', 'ndcg'})
trec_eval_language_model = RelevanceEvaluator(ground_truth, {'map', 'P_5', 'ndcg'})

trec_eval_results_vsm = trec_eval_vsm.evaluate(ranked_dict_vsm)
trec_eval_results_bm25 = trec_eval_bm25.evaluate(ranked_dict_bm25)
trec_eval_results_language_model = trec_eval_language_model.evaluate(ranked_dict_language_model)

print("\nTREC Evaluation Results:")
print("+------------+---------+---------+---------+")
print("|  Model     |  P@5    |  NDCG@5 |  MAP    |")
print("+------------+---------+---------+---------+")
for query_id in trec_eval_results_vsm.keys():
    print(f"|  VSM (Q{query_id})  |  {trec_eval_results_vsm[query_id]['P_5']:0.4f}  |  {trec_eval_results_vsm[query_id]['ndcg']:0.4f}  |  {trec_eval_results_vsm[query_id]['map']:0.4f}  |")
    print(f"|  BM25 (Q{query_id}) |  {trec_eval_results_bm25[query_id]['P_5']:0.4f}  |  {trec_eval_results_bm25[query_id]['ndcg']:0.4f}  |  {trec_eval_results_bm25[query_id]['map']:0.4f}  |")
    print(f"|  Language (Q{query_id}) |  {trec_eval_results_language_model[query_id]['P_5']:0.4f}  |  {trec_eval_results_language_model[query_id]['ndcg']:0.4f}  |  {trec_eval_results_language_model[query_id]['map']:0.4f}  |")

In [None]:
# Create DataFrames for each model
df_vsm = pd.DataFrame(trec_eval_results_vsm).T
df_bm25 = pd.DataFrame(trec_eval_results_bm25).T
df_language_model = pd.DataFrame(trec_eval_results_language_model).T

# Concatenate DataFrames for all models
df_all_models = pd.concat([df_vsm, df_bm25, df_language_model], keys=['VSM', 'BM25', 'Language Model'])

# Print the formatted table
print("\nTREC Evaluation Results:")
print("+------------+---------+---------+---------+")
print("|  Model     |  P@5    |  NDCG@5 |  MAP    |")
print("+------------+---------+---------+---------+")
for model, row in df_all_models.iterrows():
    print(f"|  {model}  |  {row['P_5']:0.4f}  |  {row['ndcg']:0.4f}  |  {row['map']:0.4f}  |")

**Step 2: Normalize to lowercase**

**Step 3: Tokanize**

# **Get term freequency**

# **Read query file**

# New section