# Embeddings MVP Example

This notebook demonstrates a simple embedding workflow:
1. Convert text to vector embeddings
2. Compare similarity between texts
3. Find the most similar text from a collection

In [2]:
# MVP Embedding Example - Setup
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")   

# Load environment variables
load_dotenv()

# Initialize embeddings model
print("🔄 Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
print("✅ Embedding model loaded!")

🔄 Loading embedding model...
✅ Embedding model loaded!
✅ Embedding model loaded!


In [3]:
# Step 1: Create a knowledge base of texts
knowledge_base = [
    "Python is a programming language used for web development and data science.",
    "Machine learning is a subset of artificial intelligence that learns from data.",
    "Neural networks are inspired by the human brain and used in deep learning.",
    "JavaScript is primarily used for web development and frontend applications.",
    "Data visualization helps people understand complex datasets through charts and graphs.",
    "Natural language processing enables computers to understand human language.",
    "Databases store and organize data for applications and websites."
]

print("📚 Knowledge Base:")
for i, text in enumerate(knowledge_base, 1):
    print(f"{i}. {text}")
    
print(f"\n📊 Total documents: {len(knowledge_base)}")

📚 Knowledge Base:
1. Python is a programming language used for web development and data science.
2. Machine learning is a subset of artificial intelligence that learns from data.
3. Neural networks are inspired by the human brain and used in deep learning.
4. JavaScript is primarily used for web development and frontend applications.
5. Data visualization helps people understand complex datasets through charts and graphs.
6. Natural language processing enables computers to understand human language.
7. Databases store and organize data for applications and websites.

📊 Total documents: 7


In [4]:
# Step 2: Convert all texts to embeddings
print("🔄 Converting texts to embeddings...")

# Convert each text to embeddings
knowledge_embeddings = []
for i, text in enumerate(knowledge_base):
    embedding = embeddings.embed_query(text)
    knowledge_embeddings.append(embedding)
    print(f"✅ Processed document {i+1}/{len(knowledge_base)}")

# Convert to numpy array for easier manipulation
knowledge_embeddings = np.array(knowledge_embeddings)
print(f"\n📐 Embeddings shape: {knowledge_embeddings.shape}")
print(f"📏 Each embedding has {knowledge_embeddings.shape[1]} dimensions")

🔄 Converting texts to embeddings...
✅ Processed document 1/7
✅ Processed document 2/7
✅ Processed document 3/7
✅ Processed document 4/7
✅ Processed document 5/7
✅ Processed document 6/7
✅ Processed document 7/7

📐 Embeddings shape: (7, 384)
📏 Each embedding has 384 dimensions
✅ Processed document 1/7
✅ Processed document 2/7
✅ Processed document 3/7
✅ Processed document 4/7
✅ Processed document 5/7
✅ Processed document 6/7
✅ Processed document 7/7

📐 Embeddings shape: (7, 384)
📏 Each embedding has 384 dimensions


In [5]:
# Step 3: Search function - Find most similar text
def find_most_similar(query_text, top_k=3):
    """Find the most similar texts to a query"""
    print(f"🔍 Searching for: '{query_text}'")
    
    # Convert query to embedding
    query_embedding = embeddings.embed_query(query_text)
    query_embedding = np.array([query_embedding])  # Reshape for cosine_similarity
    
    # Calculate similarities
    similarities = cosine_similarity(query_embedding, knowledge_embeddings)[0]

    
    # # Get top-k most similar
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    # print(f"\n📋 Top {top_k} most similar texts:")
    results = []
    for i, idx in enumerate(top_indices, 1):
        similarity_score = similarities[idx]
        similar_text = knowledge_base[idx]
        results.append((similarity_score, similar_text))
        print(f"{i}. (Similarity: {similarity_score:.3f}) {similar_text}")
    
    return results

# Test the search function
test_queries = [
    "What is AI and machine learning?",
    "How to build websites?",
    "Understanding data better"
]

for query in test_queries:
    find_most_similar(query, top_k=2)
    print("-" * 80)

🔍 Searching for: 'What is AI and machine learning?'
1. (Similarity: 0.809) Machine learning is a subset of artificial intelligence that learns from data.
2. (Similarity: 0.490) Neural networks are inspired by the human brain and used in deep learning.
--------------------------------------------------------------------------------
🔍 Searching for: 'How to build websites?'
1. (Similarity: 0.355) Databases store and organize data for applications and websites.
2. (Similarity: 0.295) JavaScript is primarily used for web development and frontend applications.
--------------------------------------------------------------------------------
🔍 Searching for: 'Understanding data better'
1. (Similarity: 0.512) Data visualization helps people understand complex datasets through charts and graphs.
2. (Similarity: 0.403) Databases store and organize data for applications and websites.
--------------------------------------------------------------------------------


In [6]:
# Step 4: Interactive search - Try your own queries!
def interactive_search():
    """Interactive search function"""
    print("🎯 Interactive Search Mode")
    print("Type your question and see the most relevant results!")
    print("Type 'quit' to exit\n")
    
    while True:
        user_query = input("💭 Your question: ").strip()
        
        if user_query.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
            
        if not user_query:
            print("Please enter a question!")
            continue
            
        try:
            results = find_most_similar(user_query, top_k=3)
            print()
        except Exception as e:
            print(f"❌ Error: {e}")
            print()

# Uncomment the line below to start interactive mode
# interactive_search()

print("💡 Uncomment the last line above to try interactive search!")
print("💡 Or run: interactive_search()")

💡 Uncomment the last line above to try interactive search!
💡 Or run: interactive_search()


In [None]:
# Advanced: Working with PDF documents and LangChain Vector Store
from langchain_community.vectorstores import DocArrayInMemorySearch  
from langchain_community.document_loaders import PDFMinerLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Define PDF file to process
pdf_file = 'Mehara_Rothila_AI__ML_Engineer.pdf'
file_path = os.path.join('Learning', pdf_file) if os.path.exists(os.path.join('Learning', pdf_file)) else pdf_file


if os.path.exists(file_path):
    
    # Try PDFMinerLoader first (better for complex PDFs)
    loader = PDFMinerLoader(file_path=file_path)
    documents = loader.load()
        
    
    if documents:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )
        
        split_docs = text_splitter.split_documents(documents)
        
        try:
            db = DocArrayInMemorySearch.from_documents(split_docs, embeddings)
            
            def search_pdf_content(query, k=3):
                results = db.similarity_search(query, k=k)
        
                return results
            
            # Example searches relevant to an AI/ML engineer resume
            test_searches = [
                "machine learning experience",
                "Python programming skills", 
                "AI projects worked on",
                "technical skills and expertise"
            ]
            
            for search_query in test_searches:
                search_pdf_content(search_query, k=2)
                print("\n" + "-"*70 + "\n")
        
            
        except Exception as e:
            print(f"Failed to create vector store: {e}")
            
else:
    print(f"Error: PDF")

[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-09-24T13:24:37+00:00', 'author': '', 'keywords': '', 'moddate': '2025-09-24T13:24:37+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1', 'subject': '', 'title': '', 'trapped': 'False', 'total_pages': 2, 'source': 'Mehara_Rothila_AI__ML_Engineer.pdf'}, page_content='MEHARA ROTHILA RANAWAKA\n\nAI/ML Engineer\n\n+94 78 710 2992 | rothilamehara22@gmail.com | mehara.io\n(cid:239) ramr-ranawaka | § mehara-rothila\n\nColombo, Western Province, Sri Lanka\n\nOBJECTIVE\n\nB.Sc.(Hons.) IT & Management student at University of Moratuwa. Experienced in machine learning, fraud detection,\nand data analytics with DAX and KQL. Microsoft Certified Fabric Analytics Engineer with a focus on explainable AI\nand interpretable solutions. Founded and led Team "Xforce" - a competitive programming and development team\nachieving consistent su