# Checking the similarity between documents using cosine similarity in a vector space model

## Importing required libraries

In [1]:
import os
import re
import math
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## Cleaning and Tokenizing the text from documents

In [2]:
# Defining the text cleaning function
def text_cleaner(text, stem='Stem'):
    """Normalize raw text and split it into a list of tokens.

    The text is lowercased, stripped of URLs, of every character that is
    neither a word character nor whitespace, and of digits. It is then split
    on whitespace, filtered against NLTK's English stop-word list and,
    optionally, reduced to Porter stems.

    Args:
        text: Raw input string.
        stem: Pass ``'Stem'`` (the default) to stem every token; any other
            value leaves the tokens unstemmed.

    Returns:
        The cleaned tokens, in their original order.
    """
    # Converting text to lowercase
    text = text.lower()

    # Removing URLs (anything starting with "http" up to the next whitespace)
    text = re.sub(r"http\S+", '', text, flags=re.MULTILINE)

    # Removing non-word and non-whitespace characters
    text = re.sub(r"[^\w\s]", '', text)

    # Removing numbers
    text = re.sub(r"[\d]", '', text)

    # Tokenizing text
    tokens = text.split()

    # Removing stop words; a frozenset makes each membership test O(1)
    # instead of a linear scan over the stop-word list for every token.
    stop_words = frozenset(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming the words if requested
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return tokens

In [3]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
    """Return the fraction of tokens in ``document`` that equal ``term``.

    Args:
        term: Token to count.
        document: Sequence of tokens (anything supporting ``count`` and
            ``len``).

    Returns:
        ``document.count(term) / len(document)``, or ``0.0`` when the
        document is empty.
    """
    total_tokens = len(document)
    # An empty document (e.g. a query made only of stop words) would
    # otherwise raise ZeroDivisionError; no term occurs in it, so TF is 0.
    if total_tokens == 0:
        return 0.0
    return document.count(term) / total_tokens

In [4]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
    """Return the smoothed inverse document frequency of ``term``.

    Computed as ``log(N / (1 + df))`` where ``N`` is the number of documents
    and ``df`` the number of documents containing ``term``. Because of the
    ``+ 1`` in the denominator the value is zero when ``df == N - 1`` and
    negative when the term occurs in every document.

    Args:
        term: Token to look up.
        all_documents: Sized collection of documents, each supporting the
            ``in`` operator.

    Returns:
        The IDF value, or ``0.0`` when ``all_documents`` is empty.
    """
    num_docs = len(all_documents)
    # log(0) is undefined; with no documents there is nothing to weight.
    if num_docs == 0:
        return 0.0
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(num_docs / (1 + num_docs_containing_term))

In [5]:
# Function to compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
    """Build the TF-IDF vector of ``document`` over ``vocab``.

    Args:
        document: Token list to vectorize.
        all_documents: The corpus, as an iterable of token lists, used for
            the IDF part.
        vocab: Ordered iterable of terms; entry ``i`` of the result is the
            weight of the ``i``-th term.

    Returns:
        A NumPy array with one ``tf * idf`` value per vocabulary term.
    """
    # Build each document's set of distinct tokens once. The IDF helper only
    # needs ``len`` of the corpus and ``term in doc``, so the result is the
    # same, but every membership test becomes O(1) instead of a rescan of
    # the whole token list for each vocabulary term.
    doc_term_sets = [set(doc) for doc in all_documents]

    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, doc_term_sets)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

In [6]:
# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm.
    """
    length_a = np.linalg.norm(vec1)
    length_b = np.linalg.norm(vec2)

    # Only vectors with a non-zero norm have a direction to compare.
    both_nonzero = length_a != 0 and length_b != 0
    if both_nonzero:
        return np.dot(vec1, vec2) / (length_a * length_b)

    return 0.0

## Main function

In [7]:
def main():
    """Rank the ``.txt`` files in ``./dataset/`` against a fixed query list.

    Every document and query is tokenized with ``text_cleaner``, turned into
    a TF-IDF vector over the document vocabulary, and compared by cosine
    similarity. For each query the documents are listed from most to least
    similar (ties are ordered by filename, descending), both in
    ``results_sabinthapa.txt`` and on standard output.
    """
    directory = './dataset/'

    # Reading all files from the directory
    docs = []
    filenames = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            # Explicit encoding so the result does not depend on the
            # platform default.
            # NOTE(review): assumes the dataset files are UTF-8 - confirm.
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
                docs.append(file.read())
            filenames.append(filename)

    # NOTE(review): "teacher" is listed twice - confirm this is intentional.
    queries = ["teacher", "Dr.", "teacher", "live"]

    # Tokenizing both the documents and queries
    tokenized_docs = [text_cleaner(doc) for doc in docs]
    tokenized_queries = [text_cleaner(query) for query in queries]

    # Building the vocabulary (from the documents only)
    vocab = sorted({word for doc in tokenized_docs for word in doc})

    # Calculate TF-IDF vectors for documents and queries
    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]
    query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in tokenized_queries]

    # Rank the documents once per query; the same ranking feeds both the
    # results file and the console output.
    rankings = []
    for query_vector in query_tfidf_vectors:
        similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
        rankings.append(sorted(zip(similarities, filenames), reverse=True))

    # Writing the ranked results to a text file. UTF-8 is used because the
    # document filenames written here may contain non-ASCII characters.
    with open("results_sabinthapa.txt", "w", encoding="utf-8") as output_file:
        for query, ranked_docs in zip(queries, rankings):
            output_file.write(f"\nCosine similarities for query '{query}':\n")
            for similarity, filename in ranked_docs:
                output_file.write(f"Document {filename}: {similarity:.4f}\n")

    # Printing the ranking
    for query, ranked_docs in zip(queries, rankings):
        print(f"\nCosine similarities for query '{query}' (ranked):")
        for similarity, filename in ranked_docs:
            print(f"Document {filename}: {similarity:.4f}")

# Run the ranking only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Cosine similarities for query 'teacher' (ranked):
Document The Teacher’s Legacy.txt: 0.2547
Document The Writer’s Creative Struggle.txt: 0.0000
Document The Surgeon’s Miracle.txt: 0.0000
Document The Scientist’s Groundbreaking Discovery.txt: 0.0000
Document The Engineer's Invention.txt: 0.0000
Document The Discovery of a Hidden Cave.txt: 0.0000
Document The Astronaut’s Journey to Mars.txt: 0.0000
Document The Artist’s Masterpiece.txt: 0.0000
Document Adventure on the High Seas.txt: 0.0000
Document A Day in the Life of a Chef.txt: 0.0000

Cosine similarities for query 'Dr.' (ranked):
Document The Surgeon’s Miracle.txt: 0.2616
Document The Scientist’s Groundbreaking Discovery.txt: 0.2605
Document The Writer’s Creative Struggle.txt: 0.0000
Document The Teacher’s Legacy.txt: 0.0000
Document The Engineer's Invention.txt: 0.0000
Document The Discovery of a Hidden Cave.txt: 0.0000
Document The Astronaut’s Journey to Mars.txt: 0.0000
Document The Artist’s Masterpiece.txt: 0.0000
Document Adve