<a href="https://colab.research.google.com/github/umesh-ops/Information-Retrieval/blob/main/Umesh_W5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
pip install PyPDF2




In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Step 2: Import necessary libraries
from collections import defaultdict, Counter  # Add this line to your imports
import numpy as np
import re
import os
import PyPDF2
from math import log

In [16]:
# Step 3: Function to preprocess text
def preprocess(text):
    """Preprocesses the text by removing special characters and converting to lowercase."""
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    return text.lower()  # Convert to lowercase

In [17]:
def load_documents(path):
    """Loads PDF documents from a specified path and preprocesses them."""
    documents = []
    for filename in os.listdir(path):
        if filename.endswith('.pdf'):
            with open(os.path.join(path, filename), 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ''
                for page in reader.pages:
                    text += page.extract_text()
                documents.append(preprocess(text))
    return documents


In [18]:
def load_queries(file_path):
    """Loads queries from a specified text file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        queries = [line.strip() for line in file.readlines()]
    return queries


In [27]:
def compute_statistics(documents):
    """Calculates term frequencies, document frequencies, and document lengths."""
    term_frequency = defaultdict(Counter)
    document_frequency = defaultdict(int)
    document_lengths = []

    for doc in documents:
        words = doc.split()
        doc_length = len(words)
        document_lengths.append(doc_length)

        for word in words:
            term_frequency[word][doc] += 1

        for word in set(words):
            document_frequency[word] += 1

    avg_doc_length = np.mean(document_lengths)
    return term_frequency, document_frequency, document_lengths, avg_doc_length


In [20]:
def compute_corpus_probabilities(documents):
    """Calculates corpus-wide word probabilities for the language model."""
    corpus_frequency = defaultdict(int)
    total_words = 0
    for doc in documents:
        words = doc.split()
        for word in words:
            corpus_frequency[word] += 1
        total_words += len(words)

    corpus_prob = {word: freq / total_words for word, freq in corpus_frequency.items()}
    return corpus_prob


In [21]:
def compute_bm25_scores(query, documents, term_frequency, document_frequency, document_lengths, avg_doc_length, k1=1.5, b=0.75):
    """Computes BM25 scores for the query across documents."""
    scores = {}
    N = len(documents)
    for i, doc in enumerate(documents):
        score = 0
        doc_length = document_lengths[i]
        for word in query.split():
            if word in document_frequency:
                idf = log((N - document_frequency[word] + 0.5) / (document_frequency[word] + 0.5) + 1)
                tf = term_frequency[word][doc]
                score += idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_length / avg_doc_length))))
        scores[i] = score
    return scores


In [22]:
def compute_jm_scores(query, documents, term_frequency, document_length, corpus_word_prob, lambda_param=0.7):
    """Computes Jelinek-Mercer smoothing scores for the query across documents."""
    scores = {}
    for i, doc in enumerate(documents):
        score = 1
        for word in query.split():
            term_prob_doc = term_frequency[word][doc] / document_length[i] if term_frequency[word][doc] else 0
            term_prob_corpus = corpus_word_prob[word] if word in corpus_word_prob else 0
            score *= (lambda_param * term_prob_doc) + ((1 - lambda_param) * term_prob_corpus)
        scores[i] = score
    return scores


In [23]:
# Step 8: Load documents and queries
path_to_documents = "/content/drive/My Drive/AI Documents"
documents = load_documents(path_to_documents)

In [24]:
queries_file_path = "/content/drive/My Drive/AI Documents/queriesnew.txt"
queries = load_queries(queries_file_path)


In [25]:
# Step 9: Compute statistics and corpus probabilities
term_frequency, document_frequency, document_lengths, avg_doc_length = compute_statistics(documents)
corpus_prob = compute_corpus_probabilities(documents)

In [26]:
# Step 10: Calculate BM25 and Jelinek-Mercer Scores
for query in queries:
    bm25_scores = compute_bm25_scores(query, documents, term_frequency, document_frequency, document_lengths, avg_doc_length) # Added avg_doc_length as an argument
    jm_scores = compute_jm_scores(query, documents, term_frequency, document_lengths, corpus_prob)
    print(f"Query: {query}\nBM25 Scores: {bm25_scores}\nJelinek-Mercer Scores: {jm_scores}\n")

Query: What is AI?
BM25 Scores: {0: 0.11221982903548296, 1: 0.11413944782461935, 2: 0.11144204344495817, 3: 0.11401468848626134, 4: 0.11326035146695106, 5: 0.10945505278781677, 6: 0.11291476102508079, 7: 0.11261532870393956, 8: 0.11204917610420269, 9: 0.11137536218403751}
Jelinek-Mercer Scores: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}

Query: How does machine learning work?
BM25 Scores: {0: 2.254091536388064, 1: 1.9055215893938104, 2: 0.0, 3: 2.2878702810816756, 4: 1.4052730522901737, 5: 0.42963049254025504, 6: 0.44007067938111066, 7: 1.2893606073378219, 8: 2.1104267951602402, 9: 2.0564618110662303}
Jelinek-Mercer Scores: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}

Query: What are the ethical implications of AI?
BM25 Scores: {0: 2.2172406309708874, 1: 1.5969338384354987, 2: 0.3418157525850679, 3: 0.3427079865587179, 4: 1.0208853227600605, 5: 0.3335993391895891, 6: 0.9898753159413337, 7: 2.417908888335355, 8: 