In [8]:
import os
import re
import math
from collections import defaultdict

def preprocess(text):
    """Lowercase ``text`` and split it into word tokens.

    A token is a maximal run of regex word characters delimited by word
    boundaries; punctuation and whitespace are discarded.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance (empty when the
        text contains no word characters).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Load documents
def load_documents(folder_path):
    """Read and tokenize every ``.txt`` file directly inside ``folder_path``.

    Files are visited in the order returned by ``os.listdir``. Each file is
    decoded as UTF-8 with undecodable bytes dropped, then tokenized with
    ``preprocess``.

    Args:
        folder_path: Directory containing the document files.

    Returns:
        A dict mapping each file name (not the full path) to its token list.
    """
    txt_names = [name for name in os.listdir(folder_path) if name.endswith('.txt')]

    tokenized = {}
    for name in txt_names:
        full_path = os.path.join(folder_path, name)
        with open(full_path, 'r', encoding='utf-8', errors='ignore') as handle:
            raw_text = handle.read()
        tokenized[name] = preprocess(raw_text)
    return tokenized

# Load queries
def load_queries(query_file_path):
    """Read one query per line from a UTF-8 text file.

    Args:
        query_file_path: Path to the query file.

    Returns:
        A list of query strings in file order, each stripped of leading and
        trailing whitespace. Blank or whitespace-only lines are skipped,
        because an empty query has no terms to score.
    """
    with open(query_file_path, 'r', encoding='utf-8') as file:
        # Iterate the file lazily instead of materialising it with readlines().
        stripped_lines = (line.strip() for line in file)
        return [query for query in stripped_lines if query]

# Compute statistics needed for BM25 and LM
def compute_statistics(docs):
    """Compute the corpus statistics used by the BM25 and LM-JM scorers.

    Args:
        docs: Mapping of document id to that document's list of tokens.

    Returns:
        A 7-tuple ``(term_freqs, doc_freqs, collection_freqs, doc_lengths,
        avgdl, total_collection_length, doc_count)``:

        * ``term_freqs``: ``term_freqs[doc_id][term]`` is the number of
          occurrences of ``term`` in that document.
        * ``doc_freqs``: number of documents containing each term.
        * ``collection_freqs``: occurrences of each term over all documents.
        * ``doc_lengths``: token count of each document.
        * ``avgdl``: mean document length, ``0.0`` for an empty corpus.
        * ``total_collection_length``: total token count of the corpus.
        * ``doc_count``: number of documents.
    """
    doc_count = len(docs)
    doc_lengths = {}
    term_freqs = defaultdict(lambda: defaultdict(int))
    doc_freqs = defaultdict(int)
    collection_freqs = defaultdict(int)

    for doc_id, words in docs.items():
        doc_lengths[doc_id] = len(words)

        # Per-document term counts.
        doc_tf = term_freqs[doc_id]
        for word in words:
            doc_tf[word] += 1

        # The keys of doc_tf are exactly the distinct terms of this document,
        # so both corpus-level tables can be updated from it in one pass.
        for word, count in doc_tf.items():
            doc_freqs[word] += 1
            collection_freqs[word] += count

    # The collection length is the sum of the document lengths; a separate
    # running counter is not needed.
    total_collection_length = sum(doc_lengths.values())

    # Guard against an empty corpus, which would otherwise divide by zero.
    avgdl = total_collection_length / doc_count if doc_count else 0.0

    return term_freqs, doc_freqs, collection_freqs, doc_lengths, avgdl, total_collection_length, doc_count

# BM25 Scoring Function
def score_bm25(query, doc_id, term_freqs, doc_freqs, doc_lengths, avgdl, doc_count, k1=1.2, b=0.75):
    """Score one document against a query with BM25.

    The IDF component is ``log((N - df + 0.5) / (df + 0.5) + 1)``, which is
    positive for every term that occurs in at least one document.

    Args:
        query: Iterable of query terms (already tokenized).
        doc_id: Id of the document to score; must be a key of ``doc_lengths``.
        term_freqs: ``term_freqs[doc_id]`` maps term to its count in the document.
        doc_freqs: Mapping of term to the number of documents containing it.
        doc_lengths: Mapping of document id to its token count.
        avgdl: Average document length over the corpus.
        doc_count: Number of documents in the corpus.
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation strength.

    Returns:
        The BM25 score as a float; ``0.0`` when no query term occurs in the
        document.
    """
    doc_len = doc_lengths[doc_id]

    # The length normalisation does not depend on the term, so compute it
    # once. An average length of 0 (every document empty) would otherwise
    # divide by zero.
    length_ratio = doc_len / avgdl if avgdl > 0 else 0.0
    length_norm = k1 * (1 - b + b * length_ratio)

    score = 0.0
    for term in query:
        if term not in doc_freqs:
            continue

        tf = term_freqs[doc_id].get(term, 0)
        if tf == 0:
            # A term absent from the document contributes exactly 0.
            continue

        df = doc_freqs[term]
        idf = math.log((doc_count - df + 0.5) / (df + 0.5) + 1)
        score += idf * (tf * (k1 + 1) / (tf + length_norm))

    return score

# LM-JM Scoring Function
def score_lm_jm(query, doc_id, term_freqs, collection_freqs, doc_lengths, total_collection_length, lam=0.5):
    """Score one document with a query-likelihood model and Jelinek-Mercer smoothing.

    Each query term contributes ``log((1 - lam) * P(t|d) + lam * P(t|C))``,
    where ``P(t|d)`` is the term's relative frequency in the document and
    ``P(t|C)`` its relative frequency in the whole collection. Terms that
    never occur in the collection are skipped.

    Args:
        query: Iterable of query terms (already tokenized).
        doc_id: Id of the document to score; must be a key of ``doc_lengths``.
        term_freqs: ``term_freqs[doc_id]`` maps term to its count in the document.
        collection_freqs: Mapping of term to its count over all documents.
        doc_lengths: Mapping of document id to its token count.
        total_collection_length: Total number of tokens in the collection.
        lam: Weight given to the collection model (``1 - lam`` goes to the
            document model).

    Returns:
        The sum of log-probabilities (a float ``<= 0``); ``0.0`` when no query
        term occurs in the collection.
    """
    doc_len = doc_lengths[doc_id]
    score = 0.0

    # With an empty collection every P(t|C) is 0, so nothing can be scored.
    if total_collection_length <= 0:
        return score

    for term in query:
        cf = collection_freqs.get(term, 0)
        if cf == 0:
            # Out-of-collection term: skip it.
            continue

        # P(t|d); an empty document has no term mass.
        tf = term_freqs[doc_id].get(term, 0)
        p_t_d = tf / doc_len if doc_len > 0 else 0

        # P(t|C)
        p_t_c = cf / total_collection_length

        prob = (1 - lam) * p_t_d + lam * p_t_c
        # prob is 0 only when lam == 0 and the term is absent from the
        # document; that term is skipped so that log(0) is never taken.
        if prob > 0:
            score += math.log(prob)

    return score

# Main retrieval function
def retrieve_documents(folder_path, query_file_path, top_k=5):
    """Rank the corpus for every query with BM25 and LM-JM and print both rankings.

    For each query the two rankings are printed side by side, best score
    first, limited to ``top_k`` rows (fewer when the corpus is smaller).

    Args:
        folder_path: Directory holding the ``.txt`` documents.
        query_file_path: Text file with one query per line.
        top_k: Maximum number of ranked documents to print per query.

    Returns:
        None. Results are written to standard output.
    """
    docs = load_documents(folder_path)
    queries = load_queries(query_file_path)

    # Nothing to rank: stop before computing statistics, which need at least
    # one document to define an average document length.
    if not docs:
        print(f"No .txt documents found in {folder_path!r}")
        return

    term_freqs, doc_freqs, collection_freqs, doc_lengths, avgdl, total_collection_length, doc_count = compute_statistics(docs)

    # A negative top_k would make the slices below drop rows from the end.
    limit = max(top_k, 0)

    print(f"Comparison of Top {top_k} Documents (BM25 vs LM-JM):")
    print("=" * 80)

    for query in queries:
        query_terms = preprocess(query)

        scores_bm25 = {}
        scores_lm = {}

        for doc_id in docs:
            scores_bm25[doc_id] = score_bm25(query_terms, doc_id, term_freqs, doc_freqs, doc_lengths, avgdl, doc_count)
            scores_lm[doc_id] = score_lm_jm(query_terms, doc_id, term_freqs, collection_freqs, doc_lengths, total_collection_length)

        ranked_bm25 = sorted(scores_bm25.items(), key=lambda item: item[1], reverse=True)
        ranked_lm = sorted(scores_lm.items(), key=lambda item: item[1], reverse=True)

        print(f"Query: {query}")
        print(f"{'Rank':<5} | {'BM25 Document':<20} {'Score':<10} | {'LM-JM Document':<20} {'Score':<10}")
        print("-" * 80)

        # Slicing (instead of indexing 0..4) keeps this safe when the corpus
        # holds fewer than top_k documents.
        top_rows = zip(ranked_bm25[:limit], ranked_lm[:limit])
        for rank, ((doc_bm25, val_bm25), (doc_lm, val_lm)) in enumerate(top_rows, start=1):
            print(f"{rank:<5} | {doc_bm25:<20} {val_bm25:<10.4f} | {doc_lm:<20} {val_lm:<10.4f}")
        print("=" * 80)
        print()

# Run the code
# Corpus directory and query file, given as relative paths.
query_file_path = "queries1.txt"
folder_path = "Trump Speechs"
retrieve_documents(folder_path=folder_path, query_file_path=query_file_path)


Comparison of Top 5 Documents (BM25 vs LM-JM):
Query: to
Rank  | BM25 Document        Score      | LM-JM Document       Score     
--------------------------------------------------------------------------------
1     | speech_30.txt        0.0192     | speech_13.txt        -3.2525   
2     | speech_3.txt         0.0192     | speech_48.txt        -3.3043   
3     | speech_19.txt        0.0192     | speech_41.txt        -3.3043   
4     | speech_41.txt        0.0192     | speech_30.txt        -3.3221   
5     | speech_48.txt        0.0192     | speech_43.txt        -3.3233   

Query: america strong
Rank  | BM25 Document        Score      | LM-JM Document       Score     
--------------------------------------------------------------------------------
1     | speech_2.txt         0.5943     | speech_13.txt        -11.9589  
2     | speech_13.txt        0.5678     | speech_2.txt         -12.3390  
3     | speech_23.txt        0.5503     | speech_46.txt        -12.5966  
4     | speech_46.