# Semantic Search Pipeline Demo

This notebook demonstrates the core functionality of the semantic search engine:
1. Loading sample data
2. Embedding documents using SBERT
3. Indexing embeddings with FAISS
4. Searching for semantically similar documents
5. Clustering the documents
6. Visualizing the embedding space and clusters
7. Analyzing cluster quality
8. Saving the models

In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups

# Put the parent of the current working directory on the import path so the
# `src` package imported below can be resolved
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Import our modules
from src.embedder import Embedder
from src.vector_store import VectorStore
from src.clusterer import Clusterer

## 1. Load Sample Data

For this demo, we'll use the 20 Newsgroups dataset, a collection of newsgroup posts spanning 20 topics. To keep the demo small, we load only four of those categories from the training split.

In [None]:
# Load four categories of the 20 Newsgroups training split; `remove` asks
# scikit-learn to strip headers, footers and quoted replies from each post
categories = ['comp.graphics', 'sci.med', 'rec.autos', 'talk.politics.guns']
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

# Create document dictionaries
documents = []
for i, (text, target) in enumerate(zip(newsgroups.data, newsgroups.target)):
    # Clean up the text a bit
    text = text.strip()
    if len(text) > 100:  # Filter out very short documents
        documents.append({
            "id": i,  # index in the unfiltered dataset, so ids have gaps after filtering
            "text": text,
            "category": newsgroups.target_names[target],
            # Store a plain Python int instead of a NumPy integer so the metadata
            # stays serializable with the standard library (e.g. json)
            "category_id": int(target)
        })

print(f"Loaded {len(documents)} documents from {len(categories)} categories")

# Display a sample document. A dedicated, seeded generator keeps the sample
# identical across "Restart & Run All" without touching NumPy's global RNG state.
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)
sample_idx = int(sample_rng.integers(0, len(documents)))
sample_doc = documents[sample_idx]
print(f"\nSample document (category: {sample_doc['category']}):\n")
print(sample_doc['text'][:500] + "...")

## 2. Embed Documents using SBERT

Now we'll use our Embedder class to convert the documents into dense vector representations.

In [None]:
# Build the embedder from the project's Embedder class
# NOTE(review): "all-MiniLM-L6-v2" is presumably a Sentence-BERT model name -- confirm in src.embedder
embedder = Embedder(model_name="all-MiniLM-L6-v2")

# Collect the raw text of every document, keeping the order of `documents`
texts = [entry["text"] for entry in documents]

# Encode all texts
# NOTE(review): later cells assume embeddings[i] belongs to documents[i] -- confirm embed_texts preserves order
embeddings = embedder.embed_texts(texts)

print(f"Generated {len(embeddings)} embeddings with dimension {embeddings.shape[1]}")
print(f"Sample embedding (first 5 dimensions): {embeddings[0][:5]}")

## 3. Index Embeddings with FAISS

Next, we'll use our VectorStore class to index the embeddings for fast similarity search.

In [None]:
# Create the vector store, sized to the embedder's output dimension
vector_store = VectorStore(
    dimension=embedder.get_embedding_dimension(),
    index_type="Flat",  # Use Flat for exact search
    metric="cosine",
)

# Index the embeddings together with their source documents
vector_store.add(embeddings, documents)

# Print every entry of the index summary, one "name: value" line each
index_info = vector_store.get_index_info()
for field_name, field_value in index_info.items():
    print(f"{field_name}: {field_value}")

## 4. Search for Similar Documents

Now we can search for documents that are semantically similar to a query.

In [None]:
# Define some test queries
# NOTE(review): the topics appear chosen to line up with the four loaded
# categories (sci.med, rec.autos, comp.graphics, talk.politics.guns)
test_queries = [
    "How to treat a high fever",
    "The best sports cars on the market",
    "Rendering 3D graphics with ray tracing",
    "Second amendment rights and gun control laws"
]

# Function to perform a search and display results
def search_and_display(query, k=5):
    """Print the search hits for ``query``.

    The query is embedded with the notebook-level ``embedder`` and looked up in
    the notebook-level ``vector_store``. Each hit is printed as a numbered entry
    showing its category, its similarity (4 decimals) and the first 200
    characters of its text with newlines replaced by spaces, followed by "...".

    Args:
        query: Query string to search for.
        k: Number of results requested from the vector store.

    Returns:
        None. All output is written to stdout.
    """
    print(f"Query: {query}\n")

    # Embed the query and hand it straight to the index
    hits = vector_store.search(embedder.embed_query(query), k=k)

    # Documents and similarities are parallel sequences in the result dict
    ranked = zip(hits["documents"], hits["similarities"])
    for rank, (hit, score) in enumerate(ranked, start=1):
        # chr(10) is "\n"; flatten the snippet onto a single line
        snippet = hit['text'][:200].replace(chr(10), ' ')
        print(f"{rank}. [{hit['category']}] (Similarity: {score:.4f})")
        print(f"   {snippet}...\n")

# Run every test query, separating the result lists with an 80-character rule
separator = "-" * 80
for query in test_queries:
    search_and_display(query)
    print(separator)

## 5. Cluster Documents

Let's use our Clusterer class to discover semantic groupings in the documents.

In [None]:
# Set up a k-means clusterer from the project's Clusterer class
clusterer = Clusterer(algorithm="kmeans")

# Fit with as many clusters as there are loaded categories; the returned
# labels are reused by the visualization cells
labels = clusterer.fit(embeddings, documents, n_clusters=len(categories))

# Report the clustering score
score = clusterer.evaluate()
print(f"Silhouette score: {score:.4f}")

# Report how many documents landed in each cluster
cluster_info = clusterer.get_cluster_info()
print("\nCluster counts:")
for cluster_id, n_docs in cluster_info["cluster_counts"].items():
    print(f"Cluster {cluster_id}: {n_docs} documents")

## 6. Visualize Embeddings and Clusters

Let's visualize the document embeddings in 2D space using UMAP and color them by category and cluster.

In [None]:
# Reduce dimensions for visualization
reduced_embeddings = clusterer.reduce_dimensions(n_components=2)

# Create a DataFrame for plotting: one row per document, in `documents` order
plot_df = pd.DataFrame({
    'x': reduced_embeddings[:, 0],
    'y': reduced_embeddings[:, 1],
    'category': [doc['category'] for doc in documents],
    'cluster': labels
})


def plot_embedding_scatter(data, hue, palette, title):
    """Draw one scatter plot of the 2D-reduced embeddings, coloured by ``hue``.

    Args:
        data: DataFrame with ``x`` and ``y`` columns plus the ``hue`` column.
        hue: Name of the column that determines point colour.
        palette: Seaborn/matplotlib palette name used for the colours.
        title: Title shown above the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Explicit figure/axes instead of the pyplot state machine, so the helper
    # never draws onto a figure left over from another cell
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.scatterplot(data=data, x='x', y='y', hue=hue, palette=palette, alpha=0.7, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    # Anchor the legend outside the axes so it does not cover any points
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()


# Plot by true category
plot_embedding_scatter(plot_df, hue='category', palette='viridis', title='Document Embeddings by Category')

# Plot by predicted cluster
plot_embedding_scatter(plot_df, hue='cluster', palette='tab10', title='Document Embeddings by Cluster')

## 7. Analyze Cluster Quality

Let's analyze how well our clusters align with the true categories.

In [None]:
# Cross-tabulate true category (rows) against assigned cluster (columns)
contingency = pd.crosstab(plot_df['category'], plot_df['cluster'])

# Show the raw counts
print("Category vs. Cluster Contingency Table:")
print(contingency)

# Draw the same counts as a heatmap, annotating each cell with its integer value
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(contingency, annot=True, cmap='YlGnBu', fmt='d', ax=heatmap_ax)
heatmap_ax.set_title('Category vs. Cluster Distribution')
heatmap_fig.tight_layout()
plt.show()

# Calculate cluster purity
def calculate_purity(contingency):
    """Return the purity of a clustering from its contingency table.

    The largest count in each column is summed over all columns and divided by
    the total of the table. With true labels as rows and clusters as columns,
    this is the fraction of items that carry the majority label of their cluster.

    Args:
        contingency: Table of counts, either a pandas DataFrame or anything
            ``np.asarray`` can convert (e.g. a nested list or ndarray).

    Returns:
        The purity as a float. An empty table, or one whose counts sum to
        zero, yields 0.0 instead of raising or producing NaN.
    """
    counts = np.asarray(contingency)
    # Check size first: reducing a zero-size array with max() raises ValueError
    if counts.size == 0:
        return 0.0
    total = counts.sum()
    # An all-zero table would otherwise divide 0 by 0
    if total == 0:
        return 0.0
    return float(counts.max(axis=0).sum() / total)

# Score the category-vs-cluster table and report the result to 4 decimals
purity = calculate_purity(contingency)
print(f"Cluster purity: {purity:.4f}")

## 8. Save the Models

Finally, let's save our models for future use.

In [None]:
# Create output directories
# The path is relative, so it resolves against the notebook's working directory
output_dir = "../outputs/models"
os.makedirs(output_dir, exist_ok=True)  # exist_ok=True: re-running this cell is not an error

# Save the vector store
# NOTE(review): the return value is presumably the written path -- confirm in
# src.vector_store; it is not used anywhere below
vector_store_path = vector_store.save(output_dir, filename="newsgroups_vector_store")

# Save the clusterer
# NOTE(review): same assumption about the return value -- confirm in src.clusterer
clusterer_path = clusterer.save(output_dir, filename="newsgroups_clusterer")

# Only the target directory is reported, not the individual file paths
print(f"Models saved to {output_dir}")

## Conclusion

In this notebook, we've demonstrated the core functionality of our semantic search engine:

1. We loaded sample documents from the 20 Newsgroups dataset
2. We embedded the documents using SBERT
3. We indexed the embeddings using FAISS
4. We performed semantic searches with natural language queries
5. We clustered the documents to discover semantic groupings
6. We visualized the embedding space and analyzed cluster quality
7. We saved the vector store and the clusterer for future use

This demonstrates the power of embedding-based semantic search and clustering for understanding and organizing text data.