In [1]:
import os
import math
import re
import glob
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Uncomment the following lines if necessary
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# Load documents
collection = {}
allfiles = []
path = 'Corpus/'
for ele in glob.glob(path + '*'):
    filename = ele.split('/')[1]
    allfiles.append(filename)
    with open(ele) as f:
        collection[filename] = f.read()

allfiles = set(allfiles)

In [2]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def case_folding(text):
    return text.lower()

def tokenization(text):
    return word_tokenize(text)

def normalization(tokens):
    normalized_tokens = [re.sub(r'\W+', '', token).strip() for token in tokens if token]
    return normalized_tokens

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

def lemmatization(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess(text):
    text = case_folding(text)
    tokens = tokenization(text)
    tokens = normalization(tokens)
    tokens = remove_stopwords(tokens)
    tokens = lemmatization(tokens)
    return tokens

dictionary = defaultdict(list)
doc_lengths = defaultdict(float)
doc_term_freq = defaultdict(lambda: defaultdict(int))
total_docs = len(collection)

In [3]:
# Building the dictionary
for doc_id, content in collection.items():
    tokens = preprocess(content)
    term_freq = defaultdict(int)
    
    for token in tokens:
        term_freq[token] += 1
    
    for term, freq in term_freq.items():
        dictionary[term].append((doc_id, freq))
        doc_term_freq[doc_id][term] = 1 + math.log10(freq)
    
    doc_length = 0
    for freq in term_freq.values():
        doc_length += (1 + math.log10(freq)) ** 2
    doc_lengths[doc_id] = math.sqrt(doc_length)

# Inverse Document Frequency calculation
def calculate_idf(term):
    if term in dictionary:
        df = len(dictionary[term])
        return math.log10(total_docs / df)
    else:
        return 0

# Cosine similarity calculation
def cosine_similarity(query_vector, doc_vector, doc_id):
    dot_product = sum(query_vector[term] * doc_vector.get(term, 0) for term in query_vector)
    query_length = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
    doc_length = doc_lengths.get(doc_id, 0)
    if query_length * doc_length == 0:
        return 0
    return dot_product / (query_length * doc_length)

def process_query(query):
    tokens = preprocess(query)
    query_term_freq = defaultdict(int)
    for token in tokens:
        query_term_freq[token] += 1
    return query_term_freq

# Ranked retrieval
def ranked_retrieval(query):
    query_term_freq = process_query(query)
    query_vector = {}
    
    # tf-idf
    for term, freq in query_term_freq.items():
        idf = calculate_idf(term)
        query_vector[term] = (1 + math.log10(freq)) * idf
    
    # cosine similarity
    scores = []
    for doc_id in collection:
        doc_vector = doc_term_freq[doc_id]
        score = cosine_similarity(query_vector, doc_vector, doc_id)
        if score > 0:
            scores.append((doc_id, score))
    
    scores = sorted(scores, key=lambda x: (-x[1], x[0]))
    
    return scores[:10]

In [4]:
# Sample test cases
query1 = "Developing your Zomato business account and profile is a great way to boost your restaurant's online reputation"
query2 = "Warwickshire, came from an ancient family and was the heiress to some land"

result1 = ranked_retrieval(query1)
result2 = ranked_retrieval(query2)

def print_results(result):
    for i in result:
        print(i)

print("Query 1 output:")
print_results(result1)
print()
print("Query 2 output:")
print_results(result2)

Query 1 output:
('zomato.txt', 0.2057964082143958)
('swiggy.txt', 0.11943668052188824)
('instagram.txt', 0.0562340462953861)
('messenger.txt', 0.055177109640057447)
('youtube.txt', 0.04539769786533345)
('reddit.txt', 0.044212755005046386)
('bing.txt', 0.0417471942151515)
('flipkart.txt', 0.03938035947139302)
('HP.txt', 0.038484634396480664)
('paypal.txt', 0.03841568305484799)

Query 2 output:
('shakespeare.txt', 0.11891957091796325)
('levis.txt', 0.023773719199982342)
('nike.txt', 0.018298548721953117)
('Adobe.txt', 0.015604571767964412)
('zomato.txt', 0.014989515510215642)
('huawei.txt', 0.0134930594236145)
('skype.txt', 0.011974161946371902)
('blackberry.txt', 0.011331049715597837)
('reliance.txt', 0.010428090643860338)
('Dell.txt', 0.010339228996537778)
