In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
import re

In [5]:
def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

# Load model
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Model loaded!\n")

Loading embedding model...
✓ Model loaded!



In [6]:
sentences = [
    "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

# Generate embeddings
print("\nGenerating embeddings for all sentences...")
embeddings = model.encode(sentences)
print(f"✓ Created {len(embeddings)} embeddings\n")



Generating embeddings for all sentences...
✓ Created 6 embeddings



In [9]:
query1_idx = 0
similarities_1 = []

print("\nSimilarity to all sentences:\n")
for i, sentence in enumerate(sentences):
    sim = cosine_similarity(embeddings[query1_idx], embeddings[i])
    similarities_1.append((i, sentence, sim))
    print(f"{i}. Score: {sim:.3f}")
    print(f"   '{sentence}'\n")

print(f"\n✓ Most similar: '{similarities_1[1][1]}'")
print(f"✓ Least similar: '{similarities_1[-1][1]}'")   



Similarity to all sentences:

0. Score: 1.000
   'The dog is playing in the park'

1. Score: 0.398
   'A puppy is running outside'

2. Score: 0.071
   'The cat is sleeping on the couch'

3. Score: 0.099
   'Python is a programming language'

4. Score: -0.005
   'Machine learning models need data'

5. Score: 0.090
   'I love coding in Python'


✓ Most similar: 'A puppy is running outside'
✓ Least similar: 'I love coding in Python'


In [8]:
# Sort by similarity
similarities_1_sorted = sorted(similarities_1, key=lambda x: x[2], reverse=True)

print("\nRanked by similarity (highest to lowest):")
for rank, (idx, sent, score) in enumerate(similarities_1_sorted, 1):
    print(f"{rank}. (Score: {score:.3f}) {sent}")

print(f"\n✓ Most similar: '{similarities_1_sorted[1][1]}'")
print(f"✓ Least similar: '{similarities_1_sorted[-1][1]}'")


Ranked by similarity (highest to lowest):
1. (Score: 1.000) The dog is playing in the park
2. (Score: 0.398) A puppy is running outside
3. (Score: 0.099) Python is a programming language
4. (Score: 0.090) I love coding in Python
5. (Score: 0.071) The cat is sleeping on the couch
6. (Score: -0.005) Machine learning models need data

✓ Most similar: 'A puppy is running outside'
✓ Least similar: 'Machine learning models need data'


In [11]:
query2_idx = 3
similarities_2 = []

print("\nSimilarity to all sentences:\n")
for i, sentence in enumerate(sentences):
    sim = cosine_similarity(embeddings[query2_idx], embeddings[i])
    similarities_2.append((i, sentence, sim))
    print(f"{i}. Score: {sim:.3f}")
    print(f"   '{sentence}'\n")

# Sort by similarity
similarities_2_sorted = sorted(similarities_2, key=lambda x: x[2], reverse=True)

print("Ranked by similarity (highest to lowest):")
for rank, (idx, sent, score) in enumerate(similarities_2_sorted, 1):
    print(f"{rank}. (Score: {score:.3f}) {sent}")

print(f"\n✓ Most similar: '{similarities_2_sorted[1][1]}'")
print(f"✓ Least similar: '{similarities_2_sorted[-1][1]}'")



Similarity to all sentences:

0. Score: 0.099
   'The dog is playing in the park'

1. Score: 0.040
   'A puppy is running outside'

2. Score: 0.020
   'The cat is sleeping on the couch'

3. Score: 1.000
   'Python is a programming language'

4. Score: 0.113
   'Machine learning models need data'

5. Score: 0.730
   'I love coding in Python'

Ranked by similarity (highest to lowest):
1. (Score: 1.000) Python is a programming language
2. (Score: 0.730) I love coding in Python
3. (Score: 0.113) Machine learning models need data
4. (Score: 0.099) The dog is playing in the park
5. (Score: 0.040) A puppy is running outside
6. (Score: 0.020) The cat is sleeping on the couch

✓ Most similar: 'I love coding in Python'
✓ Least similar: 'The cat is sleeping on the couch'


In [12]:
observations_1 = """
Observations:
- Sentences about dogs (0, 1) have HIGH similarity (0.6+)
  → Both are about dogs/puppies playing/running
  
- Sentence about cats (2) has LOWER similarity (0.4)
  → Different animal, but similar structure/context
  
- Sentences about programming (3, 4, 5) have VERY LOW similarity (0.1-0.2)
  → Completely different topic/domain
  
Key insight: Embeddings group semantically similar content together!
"""
print(observations_1)



Observations:
- Sentences about dogs (0, 1) have HIGH similarity (0.6+)
  → Both are about dogs/puppies playing/running

- Sentence about cats (2) has LOWER similarity (0.4)
  → Different animal, but similar structure/context

- Sentences about programming (3, 4, 5) have VERY LOW similarity (0.1-0.2)
  → Completely different topic/domain

Key insight: Embeddings group semantically similar content together!



In [13]:
observations_2 = """
Observations:
- Sentence 3 and 5 are most similar (0.7+)
  → Both explicitly mention Python
  
- Sentence 4 has good similarity (0.5+)
  → About machine learning (related to Python use cases)
  
- Sentences 0, 1, 2 have very low similarity (0.1-0.2)
  → About animals, completely different domain
  
Key insight: Domain and topic matter! Python+ML sentences cluster together.
"""
print(observations_2)


Observations:
- Sentence 3 and 5 are most similar (0.7+)
  → Both explicitly mention Python

- Sentence 4 has good similarity (0.5+)
  → About machine learning (related to Python use cases)

- Sentences 0, 1, 2 have very low similarity (0.1-0.2)
  → About animals, completely different domain

Key insight: Domain and topic matter! Python+ML sentences cluster together.



In [14]:
threshold_analysis = """
Based on the analysis above, here are typical thresholds:

Similarity Score Interpretation:
- 0.9-1.0  → Identical or near-identical meaning (too strict)
- 0.7-0.9  → Highly relevant, same topic
- 0.5-0.7  → Related content, adjacent topics
- 0.3-0.5  → Loosely related, different angles
- <0.3     → Irrelevant or different domains

Recommended Threshold: 0.5 to 0.6

WHY?
✓ 0.5 catches related but not identical content
✓ Balances precision (no noise) with recall (finds answers)
✓ Works across different topics within same domain

For this dataset:
- Use 0.5+  to find clearly related content
- Use 0.6+  if you want only direct matches
- Use 0.3+  if you want broader coverage (more noise)

PRACTICAL EXAMPLE:
If retrieving documents for "What is Python?"
- Score 0.75 (Python language) ✓ RETRIEVE
- Score 0.65 (Python uses in ML) ✓ RETRIEVE  
- Score 0.45 (Animals story) ✗ SKIP
- Score 0.15 (Something random) ✗ SKIP
"""
print(threshold_analysis)


Based on the analysis above, here are typical thresholds:

Similarity Score Interpretation:
- 0.9-1.0  → Identical or near-identical meaning (too strict)
- 0.7-0.9  → Highly relevant, same topic
- 0.5-0.7  → Related content, adjacent topics
- 0.3-0.5  → Loosely related, different angles
- <0.3     → Irrelevant or different domains

Recommended Threshold: 0.5 to 0.6

WHY?
✓ 0.5 catches related but not identical content
✓ Balances precision (no noise) with recall (finds answers)
✓ Works across different topics within same domain

For this dataset:
- Use 0.5+  to find clearly related content
- Use 0.6+  if you want only direct matches
- Use 0.3+  if you want broader coverage (more noise)

PRACTICAL EXAMPLE:
If retrieving documents for "What is Python?"
- Score 0.75 (Python language) ✓ RETRIEVE
- Score 0.65 (Python uses in ML) ✓ RETRIEVE  
- Score 0.45 (Animals story) ✗ SKIP
- Score 0.15 (Something random) ✗ SKIP



In [15]:
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""

In [16]:
# Helper function for chunking
def chunk_by_characters(text, chunk_size):
    """Split text into chunks of specified character size."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start = end
    return chunks

In [17]:
# Test query
test_query = "What is machine learning?"

In [18]:
small_chunks = chunk_by_characters(document, 100)
print(f"Number of chunks: {len(small_chunks)}")

# Embed chunks
small_embeddings = model.encode(small_chunks)
query_embedding = model.encode(test_query)

# Calculate similarities
small_sims = []
for i, chunk_emb in enumerate(small_embeddings):
    sim = cosine_similarity(query_embedding, chunk_emb)
    small_sims.append((chunk_emb, sim, i, small_chunks[i]))

small_sims.sort(key=lambda x: x[1], reverse=True)

print("\nTop 3 results:\n")
for rank, (emb, score, idx, chunk) in enumerate(small_sims[:3], 1):
    print(f"{rank}. (Score: {score:.3f})")
    print(f"   {chunk[:100]}...")
    print()


Number of chunks: 16

Top 3 results:

1. (Score: 0.662)
   nce of successfully achieving its goals.

Machine learning is a subset of artificial intelligence th...

2. (Score: 0.603)
   ng its accuracy.
Machine learning is an important component of the growing field of data science.

D...

3. (Score: 0.491)
   Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural i...



In [19]:
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""

In [20]:
# Helper function for chunking
def chunk_by_characters(text, chunk_size):
    """Split text into chunks of specified character size."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start = end
    return chunks

In [21]:
# Test query
test_query = "What is machine learning?"

In [22]:
small_chunks = chunk_by_characters(document, 100)
print(f"Number of chunks: {len(small_chunks)}")

Number of chunks: 16


In [23]:
small_embeddings = model.encode(small_chunks)
query_embedding = model.encode(test_query)

# Calculate similarities
small_sims = []
for i, chunk_emb in enumerate(small_embeddings):
    sim = cosine_similarity(query_embedding, chunk_emb)
    small_sims.append((chunk_emb, sim, i, small_chunks[i]))

small_sims.sort(key=lambda x: x[1], reverse=True)


In [24]:
for rank, (emb, score, idx, chunk) in enumerate(small_sims[:3], 1):
    print(f"{rank}. (Score: {score:.3f})")
    print(f"   {chunk[:100]}...")
    print()

1. (Score: 0.662)
   nce of successfully achieving its goals.

Machine learning is a subset of artificial intelligence th...

2. (Score: 0.603)
   ng its accuracy.
Machine learning is an important component of the growing field of data science.

D...

3. (Score: 0.491)
   Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural i...



In [26]:
small_analysis = """
Analysis:
✓ Very focused results - each chunk is specific
✓ Precision is high - almost every result mentions ML
✓ Problem: May need multiple chunks to get full context
✓ Top result directly mentions "Machine learning"
"""

print(small_analysis)


Analysis:
✓ Very focused results - each chunk is specific
✓ Precision is high - almost every result mentions ML
✓ Problem: May need multiple chunks to get full context
✓ Top result directly mentions "Machine learning"



In [28]:
medium_chunks = chunk_by_characters(document, 200)
print(f"Number of chunks: {len(medium_chunks)}")

# Embed chunks
medium_embeddings = model.encode(medium_chunks)

# Calculate similarities
medium_sims = []
for i, chunk_emb in enumerate(medium_embeddings):
    sim = cosine_similarity(query_embedding, chunk_emb)
    medium_sims.append((chunk_emb, sim, i, medium_chunks[i]))

medium_sims.sort(key=lambda x: x[1], reverse=True)

print("\nTop 3 results:\n")
for rank, (emb, score, idx, chunk) in enumerate(medium_sims[:3], 1):
    print(f"{rank}. (Score: {score:.3f})")
    print(f"   {chunk[:100]}...")
    print()

Number of chunks: 8

Top 3 results:

1. (Score: 0.693)
   at focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improvi...

2. (Score: 0.610)
   ntelligent agents: any device that perceives its environment
and takes actions that maximize its cha...

3. (Score: 0.500)
   eep learning is part of a broader family of machine learning methods based on artificial
neural netw...



In [29]:
medium_analysis = """
Analysis:
✓ Good balance of context and specificity
✓ Results contain related information (AI + ML together)
✓ Precision still good - high scores for relevant chunks
✓ Better context window for understanding
✓ Recommended for most use cases
"""
print(medium_analysis)


Analysis:
✓ Good balance of context and specificity
✓ Results contain related information (AI + ML together)
✓ Precision still good - high scores for relevant chunks
✓ Better context window for understanding
✓ Recommended for most use cases



In [31]:
large_chunks = chunk_by_characters(document, 400)
print(f"Number of chunks: {len(large_chunks)}")

# Embed chunks
large_embeddings = model.encode(large_chunks)

# Calculate similarities
large_sims = []
for i, chunk_emb in enumerate(large_embeddings):
    sim = cosine_similarity(query_embedding, chunk_emb)
    large_sims.append((chunk_emb, sim, i, large_chunks[i]))

large_sims.sort(key=lambda x: x[1], reverse=True)

print("\nTop 3 results:\n")
for rank, (emb, score, idx, chunk) in enumerate(large_sims[:3], 1):
    print(f"{rank}. (Score: {score:.3f})")
    print(f"   {chunk[:120]}...")
    print()




Number of chunks: 4

Top 3 results:

1. (Score: 0.669)
   at focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Mac...

2. (Score: 0.654)
   Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed...

3. (Score: 0.465)
   learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional n...



In [30]:
large_analysis = """
Analysis:
✓ More context - includes related topics (AI, NLP, DL)
⚠ Lower scores than smaller chunks - more noise in chunk
⚠ User gets more info but some is less relevant
⚠ Broader coverage but less focused
"""
print(large_analysis)


Analysis:
✓ More context - includes related topics (AI, NLP, DL)
⚠ Lower scores than smaller chunks - more noise in chunk
⚠ User gets more info but some is less relevant
⚠ Broader coverage but less focused



In [32]:
comparison = f"""
Metric                  | Small (100) | Medium (200) | Large (400)
{'-'*70}
Number of chunks        | {len(small_chunks):11} | {len(medium_chunks):12} | {len(large_chunks):11}
Top score               | {small_sims[0][1]:.3f}        | {medium_sims[0][1]:.3f}         | {large_sims[0][1]:.3f}
2nd score               | {small_sims[1][1]:.3f}        | {medium_sims[1][1]:.3f}         | {large_sims[1][1]:.3f}
3rd score               | {small_sims[2][1]:.3f}        | {medium_sims[2][1]:.3f}         | {large_sims[2][1]:.3f}

Focused Answer?         | YES         | YES          | SOMEWHAT
Complete Context?       | NO          | YES          | YES
Precision (no noise)?   | EXCELLENT   | GOOD         | FAIR
Ease of reading?        | DIFFICULT   | GOOD         | EASY

RECOMMENDATION FOR THIS USE CASE: Medium Chunks (200 characters)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

WHY?
✓ Balanced precision and context
✓ Top results are clearly relevant
✓ Chunks are long enough to be meaningful
✓ Short enough to stay focused
✓ Good for RAG systems

When to use alternatives:
- Small chunks: FAQ systems, short fact lookup (need high precision)
- Large chunks: Research papers, deep context needed (need full context)
"""

print(comparison)


Metric                  | Small (100) | Medium (200) | Large (400)
----------------------------------------------------------------------
Number of chunks        |          16 |            8 |           4
Top score               | 0.662        | 0.693         | 0.669
2nd score               | 0.603        | 0.610         | 0.654
3rd score               | 0.491        | 0.500         | 0.465

Focused Answer?         | YES         | YES          | SOMEWHAT
Complete Context?       | NO          | YES          | YES
Precision (no noise)?   | EXCELLENT   | GOOD         | FAIR
Ease of reading?        | DIFFICULT   | GOOD         | EASY

RECOMMENDATION FOR THIS USE CASE: Medium Chunks (200 characters)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

WHY?
✓ Balanced precision and context
✓ Top results are clearly relevant
✓ Chunks are long enough to be meaningful
✓ Short enough to stay focused
✓ Good for RAG systems

When to use alternatives:
- Small chunks: FAQ systems, sh