In [11]:
import os
import re
import numpy as np
from collections import defaultdict
from math import log

# Preprocessing function
def preprocess(text):
    return re.findall(r'\b\w+\b', text.lower())

# Load documents
def load_documents(folder_path):
    docs = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r') as file:
                docs[filename] = preprocess(file.read())
    return docs

# Load queries
def load_queries(query_file_path):
    with open(query_file_path, 'r') as file:
        return [line.strip() for line in file.readlines()]

# Compute term frequencies and document frequencies
def compute_statistics(docs):
    doc_count = len(docs)
    term_doc_freq = defaultdict(int)
    term_freq = defaultdict(lambda: defaultdict(int))

    for doc_id, words in docs.items():
        word_set = set(words)
        for word in words:
            term_freq[doc_id][word] += 1
        for word in word_set:
            term_doc_freq[word] += 1

    return term_freq, term_doc_freq, doc_count

# Compute relevance probabilities using BIM
def compute_relevance_prob(query, term_freq, term_doc_freq, doc_count):
    scores = {}
    for doc_id in term_freq:
        score = 0.5
        for term in query:
            tf = term_freq[doc_id].get(term, 0)
            df = term_doc_freq.get(term, 0)
            p_term_given_relevant = (tf + 1) / (sum(term_freq[doc_id].values()) + len(term_doc_freq))
            p_term_given_not_relevant = (df + 1) / (doc_count - df + len(term_doc_freq))
            score *= (p_term_given_relevant / p_term_given_not_relevant)
        scores[doc_id] = score
    return scores

# Main retrieval function
def retrieve_documents(folder_path, query_file_path):
    docs = load_documents(folder_path)
    queries = load_queries(query_file_path)

    term_freq, term_doc_freq, doc_count = compute_statistics(docs)

    for query in queries:
        query_terms = preprocess(query)
        scores = compute_relevance_prob(query_terms, term_freq, term_doc_freq, doc_count)
        ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        print(f"Query: {query}")
        for doc_id, score in ranked_docs:
            print(f"Document: {doc_id}, Score: {score:.4f}")
        print()

# Example usage
# folder_path = './Trump Speechs/'
# query_file_path = './queries1.txt'

folder_path = '../../Final Project/Dataset'
query_file_path = '../../Final Project/queries.txt'
retrieve_documents(folder_path, query_file_path)


Query: Dr.
Document: Middlemarch.txt, Score: 0.4705
Document: Ulysses.txt, Score: 0.3967
Document: The Iliad.txt, Score: 0.0649
Document: Frankenstein: Or, The Modern Prometheus.txt, Score: 0.0646
Document: Romeo and Juliet.txt, Score: 0.0513
Document: The Prince.txt, Score: 0.0395
Document: Second Treatise of Government.txt, Score: 0.0371
Document: The Adventures of Tom Sawyer.txt, Score: 0.0319
Document: Grimms' Fairy Tales.txt, Score: 0.0261
Document: The Count of Monte Cristo.txt, Score: 0.0232

Query: Teacher
Document: The Adventures of Tom Sawyer.txt, Score: 0.0957
Document: The Prince.txt, Score: 0.0593
Document: Frankenstein: Or, The Modern Prometheus.txt, Score: 0.0484
Document: Romeo and Juliet.txt, Score: 0.0384
Document: The Iliad.txt, Score: 0.0365
Document: Middlemarch.txt, Score: 0.0321
Document: Ulysses.txt, Score: 0.0279
Document: Second Treatise of Government.txt, Score: 0.0279
Document: The Count of Monte Cristo.txt, Score: 0.0232
Document: Grimms' Fairy Tales.txt, S

In [12]:
import random

# Function to assign random relevance scores
def assign_random_relevance(queries, documents, relevance_scale=(0, 1)):
    relevance_scores = {}

    for query in queries:
        relevance_scores[query] = {}  # Use the query string directly, no tuple or list
        for doc in documents:
            # Assign a random relevance score between relevance_scale (0 and 1 by default)
            relevance_scores[query][doc] = random.randint(relevance_scale[0], relevance_scale[1])

    return relevance_scores

# Function to save relevance scores to a file
def save_relevance_scores_to_file(relevance_scores, output_file):
    with open(output_file, 'w') as f:
        for query, doc_scores in relevance_scores.items():
            for doc, score in doc_scores.items():
                f.write(f"{query},{doc},{score}\n")  # Write in the format query,document,score


# Example usage:
folder_path = '../../Final Project/Dataset'
query_file_path = '../../Final Project/queries.txt'
# Load documents and queries
documents = load_documents(folder_path)  # This returns a dictionary of doc_id -> content
queries = load_queries(query_file_path)  # This returns a list of queries

# Randomly assign relevance scores (0 for irrelevant, 1 for relevant)
random_relevance_scores = assign_random_relevance(queries, documents.keys())

# Save the relevance scores to query_relevance_score.txt
output_file = 'query_relevance_score.txt'
save_relevance_scores_to_file(random_relevance_scores, output_file)

print(f"Relevance scores saved to {output_file}")


Relevance scores saved to query_relevance_score.txt


In [9]:
import os
import re
import numpy as np
from collections import defaultdict
from math import log
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Preprocessing and cleaning text
def text_cleaner(text, stem='Stem'):
    text = text.lower()  # Converting to lowercase
    text = re.sub(r"http\S+", '', text, flags=re.MULTILINE)  # Removing URLs
    text = re.sub(r"[^\w\s]", '', text)  # Removing non-word, non-whitespace characters
    text = re.sub(r"[\d]", '', text)  # Removing numbers
    cleaned_text = text.split()  # Tokenizing the text
    
    # Removing stop words
    useless_words = stopwords.words("english")
    final_text = [word for word in cleaned_text if word not in useless_words]

    # Applying stemming
    if stem == 'Stem':
        stemmer = PorterStemmer()
        final_text = [stemmer.stem(word) for word in final_text]

    return final_text

# Loading the documents
def load_documents(folder_path):
    docs = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r') as file:
                docs[filename] = text_cleaner(file.read())
    return docs

# Loading the queries
def load_queries(query_file_path):
    with open(query_file_path, 'r') as file:
        return [text_cleaner(line.strip()) for line in file.readlines()]

# Computing term and document frequencies
def compute_statistics(docs):
    doc_count = len(docs)
    term_doc_freq = defaultdict(int)
    term_freq = defaultdict(lambda: defaultdict(int))

    for doc_id, words in docs.items():
        word_set = set(words)
        for word in words:
            term_freq[doc_id][word] += 1
        for word in word_set:
            term_doc_freq[word] += 1

    return term_freq, term_doc_freq, doc_count

# Computing inverse document frequencies
def compute_idf(term_doc_freq, doc_count):
    idf = {}
    for term, df in term_doc_freq.items():
        idf[term] = log((doc_count - df + 0.5) / (df + 0.5) + 1)
    return idf

# Computing BM25 scores for documents given a query
def compute_bm25(query, term_freq, idf, doc_count, avgdl, doc_lengths, k1=1.5, b=0.75):
    scores = {}
    for doc_id, doc_terms in term_freq.items():
        score = 0
        doc_length = doc_lengths[doc_id]
        for term in query:
            if term in doc_terms:
                tf = doc_terms[term]
                idf_term = idf.get(term, 0)
                term_score = idf_term * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_length / avgdl))))
                score += term_score
        scores[doc_id] = score
    return scores

# Retrieving and ranking documents using BM25 for given queries
def retrieve_documents_bm25(folder_path, query_file_path):
    docs = load_documents(folder_path)
    queries = load_queries(query_file_path)

    term_freq, term_doc_freq, doc_count = compute_statistics(docs)
    doc_lengths = {doc_id: len(words) for doc_id, words in docs.items()}
    avgdl = np.mean(list(doc_lengths.values()))
    idf = compute_idf(term_doc_freq, doc_count)

    for query in queries:
        scores = compute_bm25(query, term_freq, idf, doc_count, avgdl, doc_lengths)
        ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        
        # Unique formatted output
        print(f"💡 QUERY: {' '.join(query)}")
        print("="*50)
        for doc_id, score in ranked_docs:
            print(f"📄 {doc_id:<30} | 🏆 Score: {score:.4f}")
        print("-"*50 + "\n")

# Function to assign random relevance scores
def assign_random_relevance(queries, documents, relevance_scale=(0, 1)):
    relevance_scores = {}
    for query in queries:
        relevance_scores[query] = {}
        for doc in documents:
            relevance_scores[query][doc] = np.random.randint(relevance_scale[0], relevance_scale[1])
    return relevance_scores

# Function to save relevance scores to a file
def save_relevance_scores_to_file(relevance_scores, output_file):
    with open(output_file, 'w') as f:
        for query, doc_scores in relevance_scores.items():
            for doc, score in doc_scores.items():
                f.write(f"{query},{doc},{score}\n")

# Example usage:
folder_path = '../../Final Project/Dataset'
query_file_path = '../../Final Project/queries.txt'
retrieve_documents_bm25(folder_path, query_file_path)


💡 QUERY: dr
📄 Middlemarch.txt                | 🏆 Score: 1.8931
📄 Ulysses.txt                    | 🏆 Score: 1.8588
📄 The Iliad.txt                  | 🏆 Score: 1.2455
📄 The Count of Monte Cristo.txt  | 🏆 Score: 0.7628
📄 Grimms' Fairy Tales.txt        | 🏆 Score: 0.0000
📄 Romeo and Juliet.txt           | 🏆 Score: 0.0000
📄 Second Treatise of Government.txt | 🏆 Score: 0.0000
📄 The Adventures of Tom Sawyer.txt | 🏆 Score: 0.0000
📄 The Prince.txt                 | 🏆 Score: 0.0000
--------------------------------------------------

💡 QUERY: teacher
📄 The Adventures of Tom Sawyer.txt | 🏆 Score: 0.6494
📄 Middlemarch.txt                | 🏆 Score: 0.4535
📄 The Prince.txt                 | 🏆 Score: 0.4263
📄 Second Treatise of Government.txt | 🏆 Score: 0.4188
📄 The Iliad.txt                  | 🏆 Score: 0.3777
📄 The Count of Monte Cristo.txt  | 🏆 Score: 0.3461
📄 Ulysses.txt                    | 🏆 Score: 0.3351
📄 Grimms' Fairy Tales.txt        | 🏆 Score: 0.0000
📄 Romeo and Juliet.txt           | 🏆 Score