In [25]:
import os
import math
import nltk
from collections import defaultdict
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this script relies on before any text is processed.
nltk.download('stopwords')  # stop-word list read by preprocess_document
nltk.download('punkt')      # tokenizer models behind nltk.word_tokenize
# WordNetLemmatizer looks words up in the WordNet corpus; without this download
# the first lemmatize() call raises LookupError on a machine that lacks it.
nltk.download('wordnet')
# Single shared lemmatizer instance, reused by preprocess_document.
lemmatizer = WordNetLemmatizer()
# Preprocessing function
def preprocess_document(doc):
    """Convert raw text into a list of normalized index terms.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
    are English stop words, or that are not purely alphanumeric, are dropped;
    every surviving token is lemmatized with the module-level ``lemmatizer``.

    Args:
        doc: The text to process.

    Returns:
        The lemmatized tokens, in the order they appear in ``doc``.
    """
    english_stop_words = set(nltk.corpus.stopwords.words('english'))
    lowered = doc.lower()

    kept = []
    for word in nltk.word_tokenize(lowered):
        # The filter looks at the raw token, i.e. before lemmatization.
        if word in english_stop_words or not word.isalnum():
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept

# Read Documents from Directory
def read_documents_from_directory(directory):
    """Load every regular file directly inside ``directory`` as UTF-8 text.

    Sub-directories are skipped (there is no recursion).

    Args:
        directory: Path of the folder to scan.

    Returns:
        A dict mapping each file name (not the full path) to its contents.
    """
    contents_by_name = {}
    for entry_name in os.listdir(directory):
        full_path = os.path.join(directory, entry_name)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as handle:
            contents_by_name[entry_name] = handle.read()
    return contents_by_name

# Build Term-Document Matrix (TF-IDF)
def build_term_document_matrix(documents):
    """Build a TF-IDF weighted term-document index for a collection.

    Each (term, document) weight is ``(1 + log10(tf)) * log10(N / df)``, where
    ``tf`` is the term's count in the document, ``df`` the number of documents
    containing the term, and ``N`` the number of documents.

    Args:
        documents: Mapping of document id to raw document text.

    Returns:
        A 4-tuple ``(term_document_matrix, document_lengths,
        document_frequencies, N)``:
        ``term_document_matrix[term][doc_id]`` is the TF-IDF weight,
        ``document_lengths[doc_id]`` is the Euclidean norm of that document's
        weight vector, ``document_frequencies[term]`` is ``df``, and ``N`` is
        ``len(documents)``.
    """
    N = len(documents)
    term_document_matrix = defaultdict(lambda: defaultdict(int))
    document_frequencies = defaultdict(int)
    document_lengths = defaultdict(float)

    # Pass 1: log-scaled term frequency per (term, document) and df per term.
    for doc_id, text in documents.items():
        occurrences = defaultdict(int)
        for token in preprocess_document(text):
            occurrences[token] += 1

        for term, occurrence_count in occurrences.items():
            document_frequencies[term] += 1
            term_document_matrix[term][doc_id] = 1 + math.log10(occurrence_count)

    # Pass 2: scale every stored tf by the term's idf and accumulate the
    # squared weights per document.
    for term, postings in term_document_matrix.items():
        idf = math.log10(N / document_frequencies[term])
        for doc_id in postings:
            weight = postings[doc_id] * idf
            postings[doc_id] = weight
            document_lengths[doc_id] += weight ** 2

    # Turn the accumulated sums of squares into vector lengths.
    for doc_id, squared_norm in document_lengths.items():
        document_lengths[doc_id] = math.sqrt(squared_norm)

    return term_document_matrix, document_lengths, document_frequencies, N

# Normalize query vector
def normalize_query(query_terms, term_document_matrix, document_frequencies, N):
    """Build a unit-length TF-IDF vector for a list of query terms.

    Args:
        query_terms: Query tokens; repeated tokens raise the term frequency.
        term_document_matrix: Not used by this function; kept in the signature.
        document_frequencies: Mapping of term to the number of documents
            containing it. Terms absent from it get an idf of 0.
        N: Total number of documents in the collection.

    Returns:
        A dict mapping each distinct query term to its weight. Weights are
        divided by the vector's Euclidean length unless that length is 0, in
        which case the raw (all-zero) weights are returned.
    """
    occurrences = defaultdict(int)
    for term in query_terms:
        occurrences[term] += 1

    query_vector = {}
    for term, occurrence_count in occurrences.items():
        # Membership test first so a defaultdict is never grown by the lookup.
        if term in document_frequencies:
            idf = math.log10(N / document_frequencies[term])
        else:
            idf = 0
        query_vector[term] = (1 + math.log10(occurrence_count)) * idf

    query_length = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
    if query_length > 0:
        query_vector = {
            term: weight / query_length
            for term, weight in query_vector.items()
        }

    return query_vector

# Cosine Similarity Calculation
def cosine_similarity(query_vector, term_document_matrix, document_lengths):
    """Score documents against a query by cosine similarity.

    Only documents that share at least one term with the query are scored.

    Args:
        query_vector: Mapping of query term to its (already normalized) weight.
        term_document_matrix: Mapping ``term -> {doc_id: weight}``.
        document_lengths: Mapping of doc_id to the Euclidean length of that
            document's weight vector.

    Returns:
        A ``defaultdict(float)`` mapping doc_id to its similarity score. A
        document whose length is zero or missing gets a score of 0.0.
    """
    scores = defaultdict(float)

    # Accumulate the dot product between the query and each matching document.
    for term, query_weight in query_vector.items():
        if term in term_document_matrix:
            for doc_id, doc_weight in term_document_matrix[term].items():
                scores[doc_id] += query_weight * doc_weight

    for doc_id in scores:
        # .get() keeps a defaultdict argument from being grown by the lookup.
        doc_length = document_lengths.get(doc_id, 0.0)
        if doc_length > 0:
            scores[doc_id] /= doc_length  # Cosine normalization
        else:
            # A zero-length vector (e.g. every term has idf 0) has no
            # direction; report 0.0 instead of raising ZeroDivisionError.
            scores[doc_id] = 0.0

    return scores

# Search Function to retrieve top 10 documents
def search(query, term_document_matrix, document_lengths, document_frequencies, N, top_k=10):
    """Rank documents against a free-text query and return the best matches.

    The query goes through the same preprocessing as the documents, is turned
    into a normalized TF-IDF vector, and is scored against the index with
    cosine similarity.

    Args:
        query: Raw query text.
        term_document_matrix: Mapping ``term -> {doc_id: weight}``.
        document_lengths: Mapping of doc_id to document vector length.
        document_frequencies: Mapping of term to document frequency.
        N: Total number of documents in the collection.
        top_k: Maximum number of results to return (default 10, the previous
            fixed cut-off). ``None`` returns every scored document.

    Returns:
        A list of ``(doc_id, score)`` tuples ordered by descending score, with
        ties broken by ascending doc_id.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    # A negative slice bound would silently drop results from the end.
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    query_terms = preprocess_document(query)
    query_vector = normalize_query(query_terms, term_document_matrix, document_frequencies, N)
    scores = cosine_similarity(query_vector, term_document_matrix, document_lengths)

    # Negating the score sorts best-first while doc_id still sorts ascending.
    ranked_docs = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    return ranked_docs[:top_k]

# Main Execution
if __name__ == "__main__":
    # Folder holding the corpus; each regular file in it is one document.
    directory = "C:\\Users\\ASUS\\Desktop\\Corpus"
    documents = read_documents_from_directory(directory)

    # Build the TF-IDF index once, before prompting for the query.
    index = build_term_document_matrix(documents)
    term_document_matrix, document_lengths, document_frequencies, N = index

    query = input("Enter your search query: ")
    results = search(query, term_document_matrix, document_lengths, document_frequencies, N)

    # Report the hits with 1-based ranks.
    print("Top relevant documents:")
    for rank, hit in enumerate(results, start=1):
        doc_id, score = hit
        print(f"{rank}. ({doc_id}, {score})")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter your search query:  Developing your Zomato business account and profile is a great way to boost your  restaurant’s online reputation


Top relevant documents:
1. (zomato.txt, 0.20259462898399572)
2. (swiggy.txt, 0.1137687853973905)
3. (instagram.txt, 0.0464958687255616)
4. (messenger.txt, 0.04264100643310696)
5. (HP.txt, 0.03619127222176074)
6. (bing.txt, 0.0323311931258012)
7. (youtube.txt, 0.030700663297177078)
8. (flipkart.txt, 0.026232317571210088)
9. (reddit.txt, 0.024544366934365465)
10. (Uber.txt, 0.021324856341440546)
