In [6]:
import numpy as np 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class SemanticSearchEngine:
    """In-memory semantic search over a growing list of text documents.

    Documents are embedded with a ``SentenceTransformer`` model and ranked
    against a query by cosine similarity.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the embedding model and start with an empty index.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # Indexed documents, in insertion order.
        self.documents = []
        # Embedding rows aligned with ``self.documents``; None until the
        # first non-empty batch has been added.
        self.document_embeddings = None

    def add_documents(self, documents):
        """Embed ``documents`` and append them to the index.

        Args:
            documents: An iterable of strings. A single string is treated as
                one document rather than being split into characters.
                An empty iterable is a no-op.
        """
        if isinstance(documents, str):
            documents = [documents]
        else:
            # Materialise once so one-shot iterables (generators) are seen by
            # both the encoder and the document list.
            documents = list(documents)

        if not documents:
            # Nothing to encode; skipping also avoids storing an empty
            # embedding array that later batches may not stack with.
            return

        new_embeddings = self.model.encode(documents)

        if self.document_embeddings is None:
            stacked = new_embeddings
        else:
            stacked = np.vstack([self.document_embeddings, new_embeddings])

        # Commit both pieces of state only after encoding and stacking have
        # succeeded, so documents and embeddings can never get out of step.
        self.document_embeddings = stacked
        self.documents.extend(documents)

    def search(self, query, top_k=3):
        """Return the ``top_k`` indexed documents most similar to ``query``.

        Args:
            query: Query text to embed and compare against the index.
            top_k: Maximum number of results. Zero or a negative value yields
                no results; ``None`` returns every document.

        Returns:
            A list of dicts ordered by descending similarity, each with keys
            ``'document'`` (the stored text), ``'similarity'`` (cosine
            similarity to the query) and ``'index'`` (position in
            ``self.documents``). Empty if nothing has been indexed.
        """
        if not self.documents:
            return []
        if top_k is not None and top_k <= 0:
            # A negative slice bound would otherwise return "all but the
            # last k" documents instead of none.
            return []

        query_embedding = self.model.encode([query])
        # cosine_similarity returns a (1, n_documents) matrix; take its row.
        similarities = cosine_similarity(query_embedding, self.document_embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [
            {
                'document': self.documents[idx],
                'similarity': similarities[idx],
                'index': idx,
            }
            for idx in top_indices
        ]

# Build an engine with the default embedding model and index a small demo corpus.
search_engine = SemanticSearchEngine()
documents = [
    "Python is a versatile programming language used for web development, data science, and automation.",
    "Machine learning algorithms can automatically learn patterns from data without explicit programming.",
    "Deep learning is a subset of machine learning that uses neural networks with multiple layers.",
    "Natural language processing helps computers understand and generate human language.",
    "Data visualization is crucial for understanding complex datasets and communicating insights.",
    "Cloud computing provides on-demand access to computing resources over the internet.",
    "Cybersecurity protects digital systems from threats and unauthorized access.",
    "Blockchain technology creates immutable records through distributed consensus.",
]
search_engine.add_documents(documents)

# Example queries; each one is answered with its two best-matching documents.
queries = [
    "artificial intelligence and learning",
    "programming languages for web development",
    "protecting computer systems from hackers",
]

for query in queries:
    print(f"\nQuery: '{query}'")
    results = search_engine.search(query, top_k=2)
    # Ranks are 1-based for display.
    for rank, hit in enumerate(results, start=1):
        score = hit['similarity']
        text = hit['document']
        print(f"{rank}. (Score: {score:.3f}) {text}")


Query: 'artificial intelligence and learning'
1. (Score: 0.459) Machine learning algorithms can automatically learn patterns from data without explicit programming.
2. (Score: 0.390) Deep learning is a subset of machine learning that uses neural networks with multiple layers.

Query: 'programming languages for web development'
1. (Score: 0.554) Python is a versatile programming language used for web development, data science, and automation.
2. (Score: 0.292) Natural language processing helps computers understand and generate human language.

Query: 'protecting computer systems from hackers'
1. (Score: 0.581) Cybersecurity protects digital systems from threats and unauthorized access.
2. (Score: 0.169) Cloud computing provides on-demand access to computing resources over the internet.
