# Use the Code with Caution

**The following code implements the Binary Independence Model (BIM) to rank documents in a collection of Trump's speeches. You should modify the code to implement the BM25 model. The speeches are inside a folder named *Trump Speeches*, and the queries to run are inside `queries1.txt`.**

In [7]:
import os
import re
import numpy as np
from collections import defaultdict
from math import log

# Preprocessing function
def preprocess(text):
    """Lowercase *text* and split it into word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore); punctuation and whitespace act as separators and are dropped.

    Args:
        text: Raw text of a document or a query.

    Returns:
        list of lowercase token strings, in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Load documents
def load_documents(folder_path):
    """Read every ``.txt`` file in a folder and tokenize its contents.

    Args:
        folder_path: Directory that holds the document files.

    Returns:
        dict mapping each file name (not the full path) to the token list
        produced by ``preprocess`` for that file. Entries are inserted in
        sorted file-name order.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs = {}
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # insertion order of the result (and therefore the order of tied scores
    # in a later stable sort) reproducible across machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(folder_path, filename)
        # Explicit encoding: the default depends on the platform locale.
        with open(path, 'r', encoding='utf-8') as file:
            docs[filename] = preprocess(file.read())
    return docs

# Load queries
def load_queries(query_file_path):
    """Read the query file, one query per line.

    Leading and trailing whitespace is stripped from every line. Lines that
    are empty after stripping are skipped, so blank lines in the file do not
    turn into empty queries.

    Args:
        query_file_path: Path of the text file that holds the queries.

    Returns:
        list of non-empty query strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding: the default depends on the platform locale.
    with open(query_file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [query for query in stripped_lines if query]

# Compute term frequencies and document frequencies
def compute_statistics(docs):
    """Collect per-document term counts and corpus-wide document frequencies.

    Args:
        docs: Mapping of document id to that document's list of tokens.

    Returns:
        A 3-tuple ``(term_freq, term_doc_freq, doc_count)``:

        * ``term_freq`` -- nested defaultdict, ``term_freq[doc_id][term]`` is
          the number of occurrences of ``term`` in that document. A document
          with no tokens gets no entry.
        * ``term_doc_freq`` -- defaultdict, number of documents in which each
          term occurs at least once.
        * ``doc_count`` -- ``len(docs)``, which includes empty documents.
    """
    per_doc_counts = defaultdict(lambda: defaultdict(int))
    docs_containing = defaultdict(int)

    for name, tokens in docs.items():
        already_seen = set()
        for token in tokens:
            per_doc_counts[name][token] += 1
            # Count the document once per distinct term, on first sighting.
            if token not in already_seen:
                already_seen.add(token)
                docs_containing[token] += 1

    return per_doc_counts, docs_containing, len(docs)

# Compute relevance probabilities using BIM
def compute_relevance_prob(query, term_freq, term_doc_freq, doc_count):
    """Score every document against a query as a product of smoothed ratios.

    For each query term the running score of a document is multiplied by::

        ((tf + 1) / (doc_len + V)) / ((df + 1) / (doc_count - df + V))

    where ``tf`` is the term's count in the document, ``doc_len`` the total
    number of tokens in the document, ``df`` the number of documents that
    contain the term and ``V`` the number of distinct terms in the corpus.
    A term that occurs several times in ``query`` contributes once per
    occurrence.

    Args:
        query: Iterable of query terms (already tokenized).
        term_freq: Mapping ``doc_id -> {term: count}``.
        term_doc_freq: Mapping ``term -> number of documents containing it``.
        doc_count: Total number of documents in the corpus.

    Returns:
        dict mapping every ``doc_id`` in ``term_freq`` to its float score.
        With an empty query every document scores ``1.0``.
    """
    # Neither the vocabulary size nor a document's length depends on the
    # query term, so compute them once instead of once per term.
    vocab_size = len(term_doc_freq)
    scores = {}
    for doc_id, doc_terms in term_freq.items():
        doc_len = sum(doc_terms.values())
        score = 1.0
        for term in query:
            tf = doc_terms.get(term, 0)
            df = term_doc_freq.get(term, 0)
            p_term_given_relevant = (tf + 1) / (doc_len + vocab_size)
            p_term_given_not_relevant = (df + 1) / (doc_count - df + vocab_size)
            score *= (p_term_given_relevant / p_term_given_not_relevant)
        scores[doc_id] = score
    return scores

# Main retrieval function
def retrieve_documents(folder_path, query_file_path):
    """Rank all documents for every query and print the rankings.

    Loads the documents and the queries, builds the corpus statistics once,
    then for each query prints the query line followed by every scored
    document in descending score order and a blank separator line.

    Args:
        folder_path: Directory that holds the ``.txt`` documents.
        query_file_path: Path of the file with one query per line.

    Returns:
        None. The rankings are written to standard output.
    """
    corpus = load_documents(folder_path)
    all_queries = load_queries(query_file_path)
    stats = compute_statistics(corpus)

    for query in all_queries:
        scores = compute_relevance_prob(preprocess(query), *stats)
        print(f"Query: {query}")
        # sorted() is stable, so documents with equal scores keep the order
        # in which they appear in the scores dict.
        for doc_id in sorted(scores, key=scores.get, reverse=True):
            score = scores[doc_id]
            print(f"Document: {doc_id}, Score: {score:.4f}")
        print()

# Example usage
# Runs at module level: ranks every document in the folder against each query
# in the query file and prints the results. Both paths are relative to the
# current working directory.
# NOTE(review): the folder literal is spelled 'Trump Speechs'; it must match
# the directory name on disk exactly -- confirm before renaming either one.
folder_path = 'Trump Speechs'
query_file_path = 'queries1.txt'
retrieve_documents(folder_path, query_file_path)


Query: to
Document: speech_8.txt, Score: 1.9968
Document: speech_19.txt, Score: 1.9829
Document: speech_7.txt, Score: 1.5065
Document: speech_3.txt, Score: 1.4668
Document: speech_30.txt, Score: 1.3784
Document: speech_6.txt, Score: 1.3783
Document: speech_4.txt, Score: 1.3450
Document: speech_2.txt, Score: 1.3300
Document: speech_10.txt, Score: 1.3095
Document: speech_0.txt, Score: 1.3073
Document: speech_9.txt, Score: 1.2985
Document: speech_11.txt, Score: 1.2504
Document: speech_5.txt, Score: 1.2378
Document: speech_41.txt, Score: 1.2235
Document: speech_48.txt, Score: 1.2034
Document: speech_36.txt, Score: 1.1721
Document: speech_1.txt, Score: 1.1372
Document: speech_18.txt, Score: 1.1205
Document: speech_12.txt, Score: 1.1161
Document: speech_16.txt, Score: 1.1019
Document: speech_45.txt, Score: 1.0818
Document: speech_40.txt, Score: 1.0816
Document: speech_26.txt, Score: 1.0803
Document: speech_37.txt, Score: 1.0710
Document: speech_39.txt, Score: 1.0555
Document: speech_52.txt, 