<a href="https://colab.research.google.com/github/vikasch123/Smart_Document_Retrieval/blob/main/Overview_of_Colaboratory_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data (if not already downloaded).
# NLTK >= 3.8.2 loads the 'punkt_tab' resource for word_tokenize(); the older
# 'punkt' resource is still fetched so the script keeps working on earlier
# NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """
    Normalizes text so that documents and queries can be compared.

    The text is lowercased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric (punctuation, hyphenated words, ...) or that
    are English stopwords are dropped.

    Args:
        text (str): The input text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Build the stopword set once per call. stopwords.words() returns a list,
    # so calling it for every token repeats that work and turns each
    # membership test into a linear scan.
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    kept_tokens = [
        token for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(kept_tokens)

def load_documents(folder_path):
    """
    Loads the ``.txt`` files in a folder and preprocesses their content.

    Only regular files located directly inside ``folder_path`` whose name ends
    with ``.txt`` are read (decoded as UTF-8). Sub-folders are not searched,
    and a directory that happens to be named ``*.txt`` is skipped.

    Args:
        folder_path (str): The path to the folder containing documents.

    Returns:
        dict: A dictionary mapping document names to preprocessed content,
            inserted in sorted file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be opened.
    """
    documents = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting dict (and any tie-breaking done on it later) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory named "something.txt" would make open() fail.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Preprocess after the file is closed so the handle is released early.
        documents[filename] = preprocess_text(content)
    return documents

def retrieve_relevant_documents(query, documents):
    """
    Ranks documents against a user query using TF-IDF and cosine similarity.

    The TF-IDF vocabulary is fitted on the documents only; the query is run
    through ``preprocess_text`` and projected into that same vector space.

    Args:
        query (str): The user's search query (raw, not yet preprocessed).
        documents (dict): A dictionary mapping document names to preprocessed content.

    Returns:
        list: A list of ``(document name, similarity score)`` tuples sorted by
            descending score. Every document is included, even with a score
            of zero. An empty list is returned when ``documents`` is empty.
    """
    # TfidfVectorizer cannot be fitted on an empty corpus (it raises
    # ValueError), so there is nothing to rank.
    if not documents:
        return []

    doc_names = list(documents.keys())
    corpus = list(documents.values())

    # Create TF-IDF vectors for the corpus
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Vectorize the query with the vocabulary learned from the corpus
    query_vector = vectorizer.transform([preprocess_text(query)])

    # One similarity score per document, in the same order as doc_names
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank documents from most to least similar; sorted() is stable, so
    # documents with equal scores keep their order from `documents`.
    ranked_docs = sorted(zip(doc_names, similarity_scores), key=lambda x: x[1], reverse=True)

    return ranked_docs

if __name__ == "__main__":
    # Folder holding the .txt files to search; point this at your own data.
    folder_path = 'documents'  # Replace with the actual path to your documents folder

    documents = load_documents(folder_path)
    if documents:
        # Ask for a query and rank every loaded document against it.
        query = input("Enter your search query: ")
        ranked_results = retrieve_relevant_documents(query, documents)

        # Show at most the five best-scoring documents.
        top_results = ranked_results[:5]
        print("\nTop Relevant Documents:")
        for doc, score in top_results:
            print(f"{doc} (Score: {score:.4f})")
    else:
        print("No documents found in the folder.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter your search query: machine

Top Relevant Documents:
machine_learning.txt (Score: 0.1492)
data_science.txt (Score: 0.1471)
deep_learning.txt (Score: 0.1268)
artificial_intelligence.txt (Score: 0.0000)
web_development.txt (Score: 0.0000)
