In [1]:
import requests
import re
import math
import collections
import xml.etree.ElementTree as ET
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("wordnet")

# Initialize stopword list and lemmatizer
STOPWORDS = set(stopwords.words("english"))
LEMMATIZER = WordNetLemmatizer()
STEMMER = PorterStemmer()

# Dataset URLs from GitHub
BASE_URL = "https://raw.githubusercontent.com/oussbenk/cranfield-trec-dataset/main/"
FILES = {
    "documents": "cran.all.1400.xml",
    "queries": "cran.qry.xml",
    "treceval": "cranqrel.trec.txt"
}

# Download dataset files from GitHub
def download_file(filename):
    url = BASE_URL + filename
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download: {filename}")

for key, filename in FILES.items():
    download_file(filename)

# File paths
DOCUMENTS_FILE = FILES["documents"]
QUERIES_FILE = FILES["queries"]
RELEVANCE_FILE = FILES["treceval"]
OUTPUT_FILE_VSM = "output_trec_vsm.txt"
OUTPUT_FILE_BM = "output_trec_bm.txt"
OUTPUT_FILE_LM = "output_trec_lm.txt"

# Data structures
documents = {}
queries = {}
inverted_index = collections.defaultdict(dict)
doc_lengths = {}

# Text Preprocessing Function
def preprocess_text(text):
    text = text.lower()  # Lower Case
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"<[^>]+>", "", text) # Remove HTML/XML Tags
    text = re.sub(r"\d+", "", text) # Remove Numbers
    words = word_tokenize(text) # Tokenization (Split text into words)
    words = [LEMMATIZER.lemmatize(word) for word in words if word not in STOPWORDS] # Remove Stopwords & Lemmatization
    words = [STEMMER.stem(word) for word in words]  # Stemming
    return " ".join(words)


# Parse Documents
def parse_documents():
    with open(DOCUMENTS_FILE, "r", encoding="utf-8") as f:
        raw_text = f.read()

    docs = re.findall(r"<doc>(.*?)</doc>", raw_text, re.DOTALL)

    for doc in docs:
        doc_id = re.search(r"<docno>\s*(\d+)\s*</docno>", doc)
        text = re.search(r"<text>(.*?)</text>", doc, re.DOTALL)

        if doc_id and text:
            doc_id = int(doc_id.group(1))
            text = preprocess_text(text.group(1))  # Apply preprocessing
            documents[doc_id] = text

    print(f"Parsed {len(documents)} documents.")

# Build Inverted Index
def build_inverted_index():
    print("Building inverted index...")
    global doc_lengths
    for doc_id, text in documents.items():
        words = text.split()
        doc_lengths[doc_id] = len(words)
        for word in words:
            inverted_index[word][doc_id] = inverted_index[word].get(doc_id, 0) + 1
    """print("\nInverted Index:")
    for word, doc_dict in inverted_index.items():
        print(f"'{word}': {dict(doc_dict)}")  """

# Parse Queries
def parse_queries():
    tree = ET.parse(QUERIES_FILE)
    root = tree.getroot()

    for query in root.findall("top"):
        query_id = int(query.find("num").text.strip())
        text = query.find("title").text.strip()
        queries[query_id] = preprocess_text(text)  # Apply preprocessing

    print(f"Parsed {len(queries)} queries.")

# Run Pipeline
parse_documents()
build_inverted_index()
parse_queries()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloaded: cran.all.1400.xml
Downloaded: cran.qry.xml
Downloaded: cranqrel.trec.txt
Parsed 1400 documents.
Building inverted index...
Parsed 225 queries.


In [2]:
# Download and compile trec_eval
!wget -q https://github.com/usnistgov/trec_eval/archive/refs/tags/v9.0.7.tar.gz
!tar -xzf v9.0.7.tar.gz
!cd trec_eval-9.0.7 && make

gcc -g -I.  -Wall -DVERSIONID=\"9.0.7\"  -o trec_eval trec_eval.c formats.c meas_init.c meas_acc.c meas_avg.c meas_print_single.c meas_print_final.c get_qrels.c get_trec_results.c get_prefs.c get_qrels_prefs.c get_qrels_jg.c form_res_rels.c form_res_rels_jg.c form_prefs_counts.c utility_pool.c get_zscores.c convert_zscores.c measures.c  m_map.c m_P.c m_num_q.c m_num_ret.c m_num_rel.c m_num_rel_ret.c m_gm_map.c m_Rprec.c m_recip_rank.c m_bpref.c m_iprec_at_recall.c m_recall.c m_Rprec_mult.c m_utility.c m_11pt_avg.c m_ndcg.c m_ndcg_cut.c m_Rndcg.c m_ndcg_rel.c m_binG.c m_G.c m_rel_P.c m_success.c m_infap.c m_map_cut.c m_gm_bpref.c m_runid.c m_relstring.c m_set_P.c m_set_recall.c m_set_rel_P.c m_set_map.c m_set_F.c m_num_nonrel_judged_ret.c m_prefs_num_prefs_poss.c m_prefs_num_prefs_ful.c m_prefs_num_prefs_ful_ret.c m_prefs_simp.c m_prefs_pair.c m_prefs_avgjg.c m_prefs_avgjg_Rnonrel.c m_prefs_simp_ret.c m_prefs_pair_ret.c m_prefs_avgjg_ret.c m_prefs_avgjg_Rnonrel_ret.c m_prefs_simp_imp.c 

In [6]:
import collections
import math

# Compute TF-IDF (Vector Space Model)
def compute_vsm(query):
    scores = collections.defaultdict(float)
    words = query.split()
    for word in words:
        if word in inverted_index:
            df = len(inverted_index[word])
            idf = math.log(len(documents) / df)
            for doc_id, tf in inverted_index[word].items():
                scores[doc_id] += tf * idf

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

# Run retrieval and write results
def run_retrieval():
    with open(OUTPUT_FILE_VSM, "w") as f:
        new_query_id = 1  # Start numbering from 1
        for original_query_id in sorted(queries.keys()):
            query_text = queries[original_query_id]
            vsm_scores = compute_vsm(query_text)[:10]
            write_results(f, new_query_id, "VSM", vsm_scores)
            new_query_id += 1

# Write results in TREC format
def write_results(f, query_id, run_id, scores):
    rank = 1
    for doc_id, score in scores:
        score = 1 if score > 0 else 0  # Convert scores to binary (1 or 0)
        f.write(f"{query_id} 0 {doc_id} {score} 1 {run_id}\n")
        rank += 1

run_retrieval()

# Check output
!head -n 20 output_trec_vsm.txt


1 0 51 1 1 VSM
1 0 874 1 1 VSM
1 0 486 1 1 VSM
1 0 792 1 1 VSM
1 0 329 1 1 VSM
1 0 1144 1 1 VSM
1 0 56 1 1 VSM
1 0 1328 1 1 VSM
1 0 252 1 1 VSM
1 0 359 1 1 VSM
2 0 51 1 1 VSM
2 0 12 1 1 VSM
2 0 792 1 1 VSM
2 0 100 1 1 VSM
2 0 1147 1 1 VSM
2 0 1169 1 1 VSM
2 0 640 1 1 VSM
2 0 746 1 1 VSM
2 0 14 1 1 VSM
2 0 712 1 1 VSM


In [4]:
# Run trec_eval using the relevance file
!trec_eval-9.0.7/trec_eval cranqrel.trec.txt output_trec_vsm.txt

runid                 	all	VSM
num_q                 	all	225
num_ret               	all	2250
num_rel               	all	1612
num_rel_ret           	all	378
map                   	all	0.1202
gm_map                	all	0.0102
Rprec                 	all	0.1690
bpref                 	all	0.2283
recip_rank            	all	0.2938
iprec_at_recall_0.00  	all	0.3498
iprec_at_recall_0.10  	all	0.3312
iprec_at_recall_0.20  	all	0.2657
iprec_at_recall_0.30  	all	0.1948
iprec_at_recall_0.40  	all	0.1359
iprec_at_recall_0.50  	all	0.1178
iprec_at_recall_0.60  	all	0.0615
iprec_at_recall_0.70  	all	0.0460
iprec_at_recall_0.80  	all	0.0264
iprec_at_recall_0.90  	all	0.0264
iprec_at_recall_1.00  	all	0.0264
P_5                   	all	0.1822
P_10                  	all	0.1680
P_15                  	all	0.1120
P_20                  	all	0.0840
P_30                  	all	0.0560
P_100                 	all	0.0168
P_200                 	all	0.0084
P_500                 	all	0.0034
P_1000                	all	

In [7]:
import collections
import math

# Compute BM25
def compute_bm25(query, k1=1.5, b=0.75):
    scores = collections.defaultdict(float)
    words = query.split()
    avg_doc_length = sum(doc_lengths.values()) / len(doc_lengths)

    for word in words:
        if word in inverted_index:
            df = len(inverted_index[word])
            idf = math.log((len(documents) - df + 0.5) / (df + 0.5))
            for doc_id, tf in inverted_index[word].items():
                norm_tf = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_lengths[doc_id] / avg_doc_length))
                scores[doc_id] += idf * norm_tf

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

# Run retrieval and write results
def run_retrieval():
    with open(OUTPUT_FILE_BM, "w") as f:
        new_query_id = 1
        for original_query_id in sorted(queries.keys()):
            query_text = queries[original_query_id]
            bm25_scores = compute_bm25(query_text)[:20]
            write_results(f, new_query_id, "BM25", bm25_scores)
            new_query_id += 1

# Write results in TREC format
def write_results(f, query_id, run_id, scores):
    rank = 1
    for doc_id, score in scores:
        score = 1 if score > 0 else 0  # Convert scores to binary (1 or 0)
        f.write(f"{query_id} 0 {doc_id} {rank} {score} {run_id}\n")
        rank += 1

run_retrieval()

# Check output
!head -n 20 output_trec_bm.txt


1 0 51 1 1 BM25
1 0 486 2 1 BM25
1 0 878 3 1 BM25
1 0 573 4 1 BM25
1 0 12 5 1 BM25
1 0 746 6 1 BM25
1 0 944 7 1 BM25
1 0 13 8 1 BM25
1 0 879 9 1 BM25
1 0 141 10 1 BM25
1 0 747 11 1 BM25
1 0 1361 12 1 BM25
1 0 172 13 1 BM25
1 0 359 14 1 BM25
1 0 665 15 1 BM25
1 0 663 16 1 BM25
1 0 78 17 1 BM25
1 0 1003 18 1 BM25
1 0 219 19 1 BM25
1 0 184 20 1 BM25


In [8]:
# Run trec_eval using the relevance file
!trec_eval-9.0.7/trec_eval cranqrel.trec.txt output_trec_bm.txt

runid                 	all	BM25
num_q                 	all	225
num_ret               	all	4500
num_rel               	all	1612
num_rel_ret           	all	689
map                   	all	0.1615
gm_map                	all	0.0458
Rprec                 	all	0.1640
bpref                 	all	0.3062
recip_rank            	all	0.2844
iprec_at_recall_0.00  	all	0.3647
iprec_at_recall_0.10  	all	0.3479
iprec_at_recall_0.20  	all	0.3066
iprec_at_recall_0.30  	all	0.2501
iprec_at_recall_0.40  	all	0.2246
iprec_at_recall_0.50  	all	0.1946
iprec_at_recall_0.60  	all	0.1371
iprec_at_recall_0.70  	all	0.1117
iprec_at_recall_0.80  	all	0.0711
iprec_at_recall_0.90  	all	0.0453
iprec_at_recall_1.00  	all	0.0453
P_5                   	all	0.1742
P_10                  	all	0.1787
P_15                  	all	0.1689
P_20                  	all	0.1531
P_30                  	all	0.1021
P_100                 	all	0.0306
P_200                 	all	0.0153
P_500                 	all	0.0061
P_1000                	all

In [9]:
import collections
import math

# Compute Language Model (Jelinek-Mercer Smoothing)
def compute_language_model(query, lambda_=0.2):
    scores = {}
    words = query.split()
    collection_size = sum(doc_lengths.values())

    for doc_id, text in documents.items():
        doc_words = text.split()
        doc_len = len(doc_words)
        score = 1.0
        for word in words:
            p_doc = doc_words.count(word) / doc_len if doc_len > 0 else 0
            p_coll = sum(inverted_index[word].values()) / collection_size if word in inverted_index else 0
            score *= (1 - lambda_) * p_doc + lambda_ * p_coll
        scores[doc_id] = score

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)

# Run retrieval and write results
def run_retrieval():
    with open(OUTPUT_FILE_LM, "w") as f:
        new_query_id = 1
        for original_query_id in sorted(queries.keys()):
            query_text = queries[original_query_id]
            lm_scores = compute_language_model(query_text)[:10]
            write_results(f, new_query_id, "LM", lm_scores)
            new_query_id += 1

# Write results in TREC format
def write_results(f, query_id, run_id, scores):
    rank = 1
    for doc_id, score in scores:
        score = 1 if score > 0 else 0  # Convert scores to binary (1 or 0)
        f.write(f"{query_id} 0 {doc_id} {rank} {score} {run_id}\n")
        rank += 1

run_retrieval()

# Check output
!head -n 20 output_trec_lm.txt


1 0 51 1 1 LM
1 0 486 2 1 LM
1 0 573 3 1 LM
1 0 878 4 1 LM
1 0 12 5 1 LM
1 0 329 6 1 LM
1 0 141 7 1 LM
1 0 879 8 1 LM
1 0 746 9 1 LM
1 0 944 10 1 LM
2 0 12 1 1 LM
2 0 746 2 1 LM
2 0 172 3 1 LM
2 0 51 4 1 LM
2 0 1089 5 1 LM
2 0 1380 6 1 LM
2 0 14 7 1 LM
2 0 700 8 1 LM
2 0 810 9 1 LM
2 0 1169 10 1 LM


In [10]:
# Run trec_eval using the relevance file
!trec_eval-9.0.7/trec_eval cranqrel.trec.txt output_trec_lm.txt

runid                 	all	LM
num_q                 	all	225
num_ret               	all	2250
num_rel               	all	1612
num_rel_ret           	all	379
map                   	all	0.1252
gm_map                	all	0.0059
Rprec                 	all	0.1625
bpref                 	all	0.1873
recip_rank            	all	0.3057
iprec_at_recall_0.00  	all	0.3500
iprec_at_recall_0.10  	all	0.3222
iprec_at_recall_0.20  	all	0.2671
iprec_at_recall_0.30  	all	0.1884
iprec_at_recall_0.40  	all	0.1441
iprec_at_recall_0.50  	all	0.1121
iprec_at_recall_0.60  	all	0.0818
iprec_at_recall_0.70  	all	0.0558
iprec_at_recall_0.80  	all	0.0367
iprec_at_recall_0.90  	all	0.0290
iprec_at_recall_1.00  	all	0.0290
P_5                   	all	0.1831
P_10                  	all	0.1684
P_15                  	all	0.1123
P_20                  	all	0.0842
P_30                  	all	0.0561
P_100                 	all	0.0168
P_200                 	all	0.0084
P_500                 	all	0.0034
P_1000                	all	0