In [None]:
!pip install faiss-cpu numpy sentence-transformers

# Document Corpus

In [None]:
# Corpus of 100 sentences
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Python is a versatile programming language used for web development, data analysis, and more.",
    "Artificial intelligence and machine learning are rapidly advancing fields.",
    "She enjoys painting landscapes in her free time.",
    "The capital city of France is Paris, known for its art and culture.",
    "Global warming is a pressing issue affecting the entire planet.",
    "Mount Everest is the highest mountain above sea level.",
    "The human brain is an incredibly complex organ.",
    "Renewable energy sources are essential for a sustainable future.",
    "He plays the guitar in a local band.",
    "Space exploration has led to many technological advancements.",
    "Cooking can be both an art and a science.",
    "The Great Wall of China is a remarkable feat of engineering.",
    "Time management is crucial for productivity.",
    "She read a fascinating book about quantum physics.",
    "Exercise contributes to overall health and well-being.",
    "Languages evolve over time through usage and cultural influences.",
    "The internet has transformed how we communicate and access information.",
    "Biodiversity is vital for ecological balance.",
    "He is studying economics at the university.",
    "The Mona Lisa is one of the most famous paintings in the world.",
    "Advancements in medicine have increased human lifespan.",
    "Traveling exposes individuals to new cultures and ideas.",
    "She adopted a rescue dog from the animal shelter.",
    "Technology is changing the landscape of education.",
    "Climate change impacts weather patterns globally.",
    "Music can evoke powerful emotions.",
    "They are launching a startup in the tech industry.",
    "Recycling helps reduce environmental waste.",
    "Artificial satellites orbit the Earth for communication and observation.",
    "History provides insights into past civilizations.",
    "Mathematics is fundamental to various scientific disciplines.",
    "He enjoys playing chess with his grandfather.",
    "Nutrition plays a key role in maintaining health.",
    "The solar system consists of eight planets orbiting the sun.",
    "Artistic expression varies greatly across different cultures.",
    "She is learning to code in JavaScript.",
    "Economic policies affect national and global markets.",
    "Photography captures moments in time.",
    "Renewable resources are replenished naturally.",
    "The study of genetics has led to breakthroughs in medicine.",
    "He ran a marathon in under four hours.",
    "Ocean currents influence climate and weather patterns.",
    "They visited the museum to see the new exhibit.",
    "Ethical considerations are important in scientific research.",
    "Globalization has connected economies worldwide.",
    "He writes poetry inspired by nature.",
    "The circulatory system transports blood throughout the body.",
    "She practices yoga to improve flexibility and reduce stress.",
    "Astronomy is the study of celestial objects.",
    "They are advocating for social justice reforms.",
    "Computer algorithms can solve complex problems efficiently.",
    "She volunteers at a community garden on weekends.",
    "Physics explains the fundamental laws of the universe.",
    "He enjoys hiking in national parks.",
    "Cultural diversity enriches societies.",
    "Sustainable agriculture supports environmental health.",
    "They attended a workshop on renewable energy.",
    "He is learning to play the piano.",
    "Digital marketing is essential for modern businesses.",
    "They are researching the effects of pollution on marine life.",
    "She is an avid reader of mystery novels.",
    "Urban planning involves designing city layouts.",
    "Cybersecurity is crucial in protecting data.",
    "He collects vintage vinyl records.",
    "They are building a robot for the competition.",
    "Psychology studies human behavior and mind.",
    "She enjoys baking pastries and cakes.",
    "Environmental conservation protects natural habitats.",
    "He is training for a triathlon.",
    "They are developing a new mobile application.",
    "Architecture combines art and engineering.",
    "She is passionate about animal welfare.",
    "Data science involves extracting insights from data.",
    "He plays soccer on the weekends.",
    "They are organizing a charity fundraiser.",
    "Literature reflects the human experience.",
    "She is learning about ancient civilizations.",
    "Marketing strategies vary across industries.",
    "He enjoys photographing wildlife.",
    "They are studying the effects of climate change on agriculture.",
    "She practices meditation daily.",
    "Blockchain technology has potential in various sectors.",
    "He is a fan of classical music.",
    "They are exploring virtual reality applications.",
    "Culinary arts involve creativity and skill.",
    "She is writing a novel set in the future.",
    "Renewable energy includes solar and wind power.",
    "He participates in community theater.",
    "They are advocates for renewable resources.",
    "She studies the impact of social media on society.",
    "Genetics plays a role in inherited traits.",
    "He enjoys woodworking as a hobby.",
    "They are planning a trip around the world.",
    "She is interested in marine biology.",
    "Artificial intelligence can automate tasks.",
    "He collects rare coins from different eras.",
    "They are designing a sustainable home.",
    "She is learning about the human immune system.",
    "Cloud computing provides scalable resources.",
    "He is restoring a classic car.",
    "They are attending a seminar on financial planning.",
    "She enjoys painting abstract art.",
    "Quantum computing could revolutionize technology.",
    "He practices calligraphy in his free time.",
    "They are studying renewable energy policies.",
    "She is an advocate for education reform.",
    "He enjoys exploring ancient ruins.",
    "They are developing software for healthcare.",
    "She is researching the history of art movements.",
    "He plays in a jazz band.",
    "They are investing in green technologies.",
    "She teaches a course on modern literature.",
    "He is learning about space exploration.",
    "They are involved in community outreach programs.",
]


# Indexing

In [15]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Step 2: Embedding Generation
print("Generating embeddings for the documents...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
doc_embeddings = embedding_model.encode(documents, show_progress_bar=True)
doc_embeddings = np.array(doc_embeddings).astype('float32')

# Step 3: Indexing with FAISS
print("Indexing the documents...")
dimension = doc_embeddings.shape[1]
embedding_index = faiss.IndexFlatL2(dimension)
faiss.normalize_L2(doc_embeddings)
embedding_index.add(doc_embeddings)
print("Complete...")


Generating embeddings for the documents...


Batches: 100%|██████████| 4/4 [00:00<00:00,  5.79it/s]

Indexing the documents...
Complete...





# Searching

In [16]:
# Step 4: Query Processing
print("Enter a search query to find the most similar documents.")
user_query = "Artificial intelligence"
query_embedding = embedding_model.encode([user_query])[0]
query_embedding = np.array([query_embedding]).astype('float32')
faiss.normalize_L2(query_embedding)

# Step 5: Similarity Computation
print(f"Finding the most similar documents...")
k = 5  # Number of top results to retrieve
distances, indices = embedding_index.search(query_embedding, k)

# Step 6: Applying Filters (optional)
print("Filtering results...")
results = []
for i, idx in enumerate(indices[0]):
    results.append({
        'id': i,
        'text': documents[idx],
        'distance': distances[0][i]
    })

# Display results
for result in results:
    print(f"Document ID: {result['id']}, Similarity Score: {result['distance']}")
    print(f"Content: {result['text']}\n")


Enter a search query to find the most similar documents.
Finding the most similar documents...
Filtering results...
Document ID: 0, Similarity Score: 0.5827232599258423
Content: Artificial intelligence can automate tasks.

Document ID: 1, Similarity Score: 0.8918582201004028
Content: Artificial intelligence and machine learning are rapidly advancing fields.

Document ID: 2, Similarity Score: 1.123034954071045
Content: Computer algorithms can solve complex problems efficiently.

Document ID: 3, Similarity Score: 1.317882776260376
Content: They are building a robot for the competition.

Document ID: 4, Similarity Score: 1.3612381219863892
Content: Artificial satellites orbit the Earth for communication and observation.

