In [None]:
# Embedding Analysis & Similarity

This notebook explores:
- Text embeddings for notes
- Similarity calculations between notes
- Clustering and visualization
- Semantic search capabilities

## Use Cases
- Find similar notes
- Identify note clusters/topics
- Improve search relevance
- Discover hidden connections


In [None]:
import sys
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
# NOTE: '../src' is a relative entry, so imports resolve it against the
# kernel's current working directory (presumably the notebook's own folder —
# TODO confirm when running from another location).
sys.path.append('../src')

# Load variables from a .env file into the process environment.
# Presumably this supplies API keys for the embedding providers imported
# below — TODO confirm which variables are expected.
from dotenv import load_dotenv
load_dotenv()

# LangChain imports for embeddings
# All providers share a single try-block: the first import that fails skips
# the remaining ones, and the warning below is printed even if an earlier
# import line already succeeded and bound its names.
# On failure the notebook keeps running, but the skipped class names stay
# undefined, so later cells that reference them will raise NameError.
# NOTE(review): recent LangChain releases appear to have moved these classes
# out of `langchain.embeddings` — confirm against the installed version.
try:
    from langchain.embeddings import OpenAIEmbeddings, OllamaEmbeddings
    from langchain_google_genai import GoogleGenerativeAIEmbeddings
    print("✅ LangChain embeddings available")
except ImportError:
    print("⚠️  LangChain embeddings not available - install langchain packages")

# Printed unconditionally, i.e. also when the optional imports above failed.
print("Embedding analysis setup complete!")


In [None]:
# Sample notes for embedding analysis
# The list alternates between two themes: even indices (0, 2, 4, 6, 8) are
# AI/ML-related notes, odd indices (1, 3, 5, 7, 9) are everyday/work notes.
# Presumably this is intentional, so that similarity search and clustering
# have two clearly separated groups to recover.
sample_notes = [
    "Machine learning algorithms can be supervised, unsupervised, or reinforcement learning.",
    "The weather today is sunny and warm, perfect for a walk in the park.",
    "Deep learning uses neural networks with multiple layers to learn complex patterns.",
    "I need to buy groceries: milk, bread, eggs, and vegetables for dinner.",
    "Natural language processing helps computers understand and generate human language.",
    "The meeting is scheduled for 3 PM in the conference room on the second floor.",
    "Artificial intelligence is transforming industries from healthcare to finance.",
    "Don't forget to call mom this weekend and wish her a happy birthday.",
    "Computer vision enables machines to interpret and understand visual information.",
    "The project deadline has been extended by two weeks due to resource constraints."
]

def get_embeddings(texts, embedding_model):
    """Embed a batch of texts and return the vectors as a NumPy array.

    Args:
        texts: The texts to embed; passed unchanged to
            ``embedding_model.embed_documents``.
        embedding_model: Any object exposing an ``embed_documents(texts)``
            method.

    Returns:
        ``np.ndarray`` built from whatever ``embed_documents`` returns, or
        ``None`` if embedding (or the array conversion) raised an exception.
        In the failure case the error is printed rather than re-raised, so
        callers must check for ``None``.
    """
    try:
        # Keep the array conversion inside the try so that malformed model
        # output is reported the same way as a failed model call.
        vectors = np.array(embedding_model.embed_documents(texts))
    except Exception as err:
        print(f"Error generating embeddings: {err}")
        vectors = None
    return vectors

def calculate_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix for ``embeddings``.

    Args:
        embeddings: Passed straight to scikit-learn's ``cosine_similarity``
            as its only argument, so every row is compared with every other
            row. Presumably the 2-D array produced by ``get_embeddings`` —
            note that a ``None`` from a failed embedding call is not handled
            here.

    Returns:
        The value returned by ``cosine_similarity(embeddings)``.
    """
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix

def find_similar_notes(query_idx, similarity_matrix, texts, top_k=3):
    """Find the notes most similar to a given note.

    Args:
        query_idx: Row index of the query note in ``similarity_matrix``.
            Negative indices count from the end, as with normal indexing.
        similarity_matrix: 2-D array-like; row ``query_idx`` holds the
            similarity of the query note to every note.
        texts: Sequence of note texts, indexed like the columns of
            ``similarity_matrix``.
        top_k: Maximum number of neighbours to return (default 3). Values
            ``<= 0`` yield an empty list.

    Returns:
        A list of dicts with keys ``'index'`` (int), ``'similarity'`` (float)
        and ``'text'``, ordered by descending similarity. The query note
        itself is never included. Fewer than ``top_k`` entries are returned
        when there are not enough other notes.
    """
    similarities = np.asarray(similarity_matrix[query_idx], dtype=float)
    if top_k <= 0:
        return []

    # Normalise a negative query index so the self-exclusion mask below
    # compares against the same position that was used for row lookup.
    self_idx = query_idx + len(similarities) if query_idx < 0 else query_idx

    # Stable descending sort: ties keep their original (ascending index)
    # order, which makes the ranking deterministic.
    ranked = np.argsort(-similarities, kind="stable")

    # Exclude the query note by index instead of assuming it is ranked first.
    # With duplicate notes or tied scores the query is not guaranteed to sort
    # to position 0, so dropping "the first hit" could return the query itself
    # and discard a genuine neighbour.
    neighbours = ranked[ranked != self_idx][:top_k]

    results = []
    for idx in neighbours:
        position = int(idx)
        results.append({
            'index': position,
            'similarity': float(similarities[position]),
            'text': texts[position]
        })

    return results

# Status message emitted once the cell has run to its end, i.e. after the
# sample data and helper functions above have been defined.
print("Embedding analysis functions ready!")
