## Step 1: Import Required Libraries

In [None]:
import os  
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.metrics.pairwise import cosine_similarity  
import numpy as np  

## Step 2: Load Articles from Text Files

In [None]:
def load_text_files(folder_path):
    """Read every ``.txt`` file located directly inside ``folder_path``.

    Directory entries are visited in sorted filename order so that the
    doc_id given to each file is reproducible: ``os.listdir`` returns names
    in arbitrary order, which would otherwise make the doc_id -> file mapping
    (and everything derived from it) differ between machines or runs.
    The ordering is plain lexicographic, so ``article_10.txt`` sorts before
    ``article_2.txt``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A ``(data, doc_id_to_filename)`` tuple of dicts keyed by consecutive
        integer doc_ids starting at 0. ``data`` maps doc_id to the file's
        text content; ``doc_id_to_filename`` maps doc_id to the bare filename.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    data = {}
    doc_id_to_filename = {}
    doc_id = 0

    print(f"Scanning folder: {folder_path}")
    for filename in sorted(os.listdir(folder_path)):
        print(f"Found file: {filename}")
        file_path = os.path.join(folder_path, filename)
        # Skip other extensions, and anything that merely *looks* like a text
        # file (e.g. a directory named "x.txt") which open() could not read.
        if not filename.endswith(".txt") or not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            data[doc_id] = file.read()
        doc_id_to_filename[doc_id] = filename
        print(f"Loaded doc_id {doc_id} -> {filename}")
        doc_id += 1

    print(f"Total files loaded: {len(data)}")
    return data, doc_id_to_filename

# Folder that holds the article .txt files.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on the author's machine.
folder_path = r"C:\Users\Swornim\Documents\College\Information Retrieval\W3"
data, doc_id_to_filename = load_text_files(folder_path)

# Build two parallel lists in ascending doc_id order, so that position i in
# `documents` and in `document_names` always refers to the same file.
ordered_ids = sorted(data)
documents = [data[key] for key in ordered_ids]
document_names = [doc_id_to_filename[key] for key in ordered_ids]

print(f"\nReady for TF-IDF processing!")

✓ Loaded article_1.txt
✓ Loaded article_2.txt
✓ Loaded article_3.txt
✓ Loaded article_4.txt
✓ Loaded article_5.txt
✓ Loaded article_6.txt
✓ Loaded article_7.txt
✓ Loaded article_8.txt

Total articles loaded: 8


## Step 3: Load Queries from File

In [None]:
def load_queries_from_file(filename='queries.txt'):
    """Load search queries from a text file, one query per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.

    Args:
        filename: Path of the query file. Defaults to ``queries.txt`` in the
            current working directory.

    Returns:
        A list of query strings in file order. If the path does not exist a
        message is printed and an empty list is returned.
    """
    if not os.path.exists(filename):
        print(f"File {filename} not found")
        return []

    with open(filename, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        queries = [text for text in stripped_lines if text]

    print(f"Loaded {len(queries)} queries from {filename}")
    return queries

# Read the queries from the default file (queries.txt).
queries = load_queries_from_file()

# Echo them back as a 1-based numbered list.
print("\nQueries:")
for number, query in enumerate(queries, start=1):
    print(f"{number}. {query}")

✓ Loaded 5 queries from queries.txt

Queries:
1. fitness tracker technology
2. Apple Watch smartwatch features
3. health monitoring devices
4. sleep quality tracking
5. wearable technology trends


## Step 4: Compute TF-IDF Weights


In [None]:
# Vectorizer settings: drop scikit-learn's built-in English stop words,
# lowercase the text before tokenizing, and cap the vocabulary at 1000 terms.
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=1000)

# Fit the vocabulary/IDF weights on the articles and get one TF-IDF row per document.
document_tfidf = vectorizer.fit_transform(documents)

n_documents, n_features = document_tfidf.shape
print(f"TF-IDF matrix shape: {document_tfidf.shape}")
print(f"  - {n_documents} documents")
print(f"  - {n_features} unique words (features)")
print("\nTF-IDF computation complete!")

TF-IDF matrix shape: (8, 1000)
  - 8 documents
  - 1000 unique words (features)

TF-IDF computation complete!


## Step 5: Transform Queries to TF-IDF Vectors


In [None]:
# Reuse the vectorizer already fitted on the documents (transform only, no
# re-fit) so query vectors share the documents' feature columns.
query_tfidf = vectorizer.transform(queries)

n_queries, n_query_features = query_tfidf.shape
print(f"Query TF-IDF matrix shape: {query_tfidf.shape}")
print(f"  - {n_queries} queries")
print(f"  - {n_query_features} features (same as documents)")
print("\nQuery transformation complete!")

Query TF-IDF matrix shape: (5, 1000)
  - 5 queries
  - 1000 features (same as documents)

Query transformation complete!


## Step 6: Compute Cosine Similarity


In [None]:

# Pairwise cosine similarity: one row per query, one column per document.
similarity_matrix = cosine_similarity(query_tfidf, document_tfidf)

n_sim_rows, n_sim_cols = similarity_matrix.shape
print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"  - {n_sim_rows} queries")
print(f"  - {n_sim_cols} documents")
print("\nCosine similarity computation complete!")

# Show the first query's row as a quick sanity check.
print("\nSample similarity scores (Query 1 vs all documents):")
print(similarity_matrix[0])

Similarity matrix shape: (5, 8)
  - 5 queries
  - 8 documents

Cosine similarity computation complete!

Sample similarity scores (Query 1 vs all documents):
[0.13874896 0.02420311 0.         0.00992125 0.         0.
 0.         0.        ]


## Step 7: Rank Documents by Similarity


In [None]:
# Print, for every query, all documents ordered from most to least similar.
def display_ranked_results(queries, similarity_matrix, document_names):
    """Print a ranked list of documents for each query.

    Args:
        queries: Sequence of query strings; query ``i`` uses row ``i`` of
            ``similarity_matrix``.
        similarity_matrix: Indexable by query position, yielding a sequence of
            per-document scores.
        document_names: Sequence of names, indexed by document position.

    Returns:
        None. The report is written to stdout. Every document is listed,
        including those scoring zero; documents with equal scores keep their
        original order because the sort is stable.
    """
    divider = "=" * 80

    for query_number, query_text in enumerate(queries, start=1):
        print(divider)
        print(f"QUERY {query_number}: {query_text}")
        print(divider)

        row = similarity_matrix[query_number - 1]

        # Order document positions by descending score.
        ranking = sorted(range(len(row)), key=lambda position: row[position], reverse=True)

        print(f"\nRanked Documents (by relevance):\n")
        for rank, doc_idx in enumerate(ranking, start=1):
            score = row[doc_idx]
            # One block character per 0.02 of similarity (50 blocks at 1.0).
            bar = '█' * int(score * 50)

            print(f"  Rank {rank}: {document_names[doc_idx]}")
            print(f"           Similarity: {score:.4f} {bar}")
            print()

        print()
# Print the ranked report for every loaded query.
display_ranked_results(
    queries=queries,
    similarity_matrix=similarity_matrix,
    document_names=document_names,
)

QUERY 1: fitness tracker technology

Ranked Documents (by relevance):

  Rank 1: article_1.txt
           Similarity: 0.1387 ██████

  Rank 2: article_2.txt
           Similarity: 0.0242 █

  Rank 3: article_4.txt
           Similarity: 0.0099 

  Rank 4: article_3.txt
           Similarity: 0.0000 

  Rank 5: article_5.txt
           Similarity: 0.0000 

  Rank 6: article_6.txt
           Similarity: 0.0000 

  Rank 7: article_7.txt
           Similarity: 0.0000 

  Rank 8: article_8.txt
           Similarity: 0.0000 


QUERY 2: Apple Watch smartwatch features

Ranked Documents (by relevance):

  Rank 1: article_1.txt
           Similarity: 0.2177 ██████████

  Rank 2: article_4.txt
           Similarity: 0.1476 ███████

  Rank 3: article_3.txt
           Similarity: 0.0215 █

  Rank 4: article_7.txt
           Similarity: 0.0086 

  Rank 5: article_2.txt
           Similarity: 0.0000 

  Rank 6: article_5.txt
           Similarity: 0.0000 

  Rank 7: article_6.txt
           Similari