## Lexical Retrieval

### Demonstrate retrieving the top 5 passages for a search query using TF-IDF


In [1]:
#load 
# Load and preprocess the Harry Potter text
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import nltk

# Download the sentence-tokenizer data that sent_tokenize needs.
# Older NLTK releases load the 'punkt' resource, while recent releases look
# for 'punkt_tab' instead; fetching both keeps this cell working on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the full book text; the encoding is given explicitly so the result
# does not depend on the platform default.
with open('harry_potter.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text into passages (each sentence is treated as one passage)
passages = sent_tokenize(text)

# Preprocess the passages
def preprocess(text):
    """Normalise a piece of text for TF-IDF indexing.

    The text is lowercased, every character that is neither a word
    character nor whitespace is dropped, runs of digits are dropped, and
    whitespace runs are collapsed to a single space with the ends trimmed.
    Underscores survive because ``\\w`` matches them.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = text.lower()
    # Substitutions are applied strictly in this order.
    substitutions = (
        (r'[^\w\s]', ''),   # punctuation / special characters
        (r'\d+', ''),       # digits
        (r'\s+', ' '),      # whitespace runs -> single space
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every passage with preprocess(); positions stay aligned with `passages`
preprocessed_passages = list(map(preprocess, passages))

# Fit a TF-IDF model (English stop words removed) on the cleaned passages
# and keep the resulting passage-by-term matrix for querying.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(preprocessed_passages)

# Function to search for relevant passages
def search(query, top_k=5):
    """Return the passages that best match ``query`` under TF-IDF scoring.

    The query is cleaned with ``preprocess``, projected into the fitted
    TF-IDF space with ``vectorizer``, and scored against every row of
    ``tfidf_matrix`` by dot product.

    Args:
        query: Free-text search string.
        top_k: Maximum number of passages to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of dicts ordered from highest to lowest score. Each dict has
        the original (un-preprocessed) text under ``'passage'`` and its
        similarity under ``'score'``. Passages with a score of 0 can appear
        when fewer than ``top_k`` passages share a term with the query.
    """
    # Guard first: a slice like [-0:] would select *every* passage.
    if top_k <= 0:
        return []

    # Preprocess the query the same way the passages were preprocessed
    preprocessed_query = preprocess(query)

    # Transform the query to a TF-IDF vector (1 x vocabulary)
    query_vector = vectorizer.transform([preprocessed_query])

    # Use the sparse-aware matrix product; np.dot does not understand
    # SciPy sparse containers and only works with them incidentally.
    similarity_scores = (tfidf_matrix @ query_vector.T).toarray().flatten()

    # Indices sorted by descending score, truncated to the requested count
    top_indices = similarity_scores.argsort()[::-1][:top_k]

    return [
        {'passage': passages[idx], 'score': similarity_scores[idx]}
        for idx in top_indices
    ]

# Example: run one query through search() and print the ranked passages
query = "magic wand"
results = search(query)

print(f"Top 5 passages for query: '{query}'")
# Ranks are shown 1-based, so start the counter at 1.
for rank, hit in enumerate(results, start=1):
    print(f"\n{rank}. Score: {hit['score']:.4f}")
    print(f"Passage: {hit['passage']}")

ModuleNotFoundError: No module named 'numpy'