## 1. Setup

In [1]:
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import pickle

from sentence_transformers import SentenceTransformer

import chromadb
from chromadb.config import Settings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 2. Load Preprocessed Data

In [2]:
DATA_DIR = Path('processed_data')
VECTOR_STORE_DIR = Path('vector_stores')
VECTOR_STORE_DIR.mkdir(exist_ok=True)

chunking_methods = ['page', 'fixed_size', 'sentence']
chunks_data = {}

for method in chunking_methods:
    file_path = DATA_DIR / f'chunks_{method}.json'
    with open(file_path, 'r', encoding='utf-8') as f:
        chunks_data[method] = json.load(f)
    print(f"Loaded {len(chunks_data[method]):4d} chunks from {method}")

print(f"\nTotal chunking strategies: {len(chunking_methods)}")

Loaded  881 chunks from page
Loaded 1540 chunks from fixed_size
Loaded 1577 chunks from sentence

Total chunking strategies: 3


## 3. Initialize Embedding Models

We'll compare two embedding models:
1. **nomic-ai/nomic-embed-text-v1.5**: Modern model with 768 dims, 8192 token context
2. **all-MiniLM-L6-v2**: Classic baseline with 384 dims, 512 token context

In [3]:
embedding_models = {
    'nomic': SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True, device='cuda'),
    'minilm': SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
}

for name, model in embedding_models.items():
    test_embedding = model.encode("This is a test sentence.")
    print(f"\n{name} model loaded successfully on GPU")
    print(f"Embedding dimension: {len(test_embedding)}")
    print(f"Sample embedding (first 10 dims): {test_embedding[:10]}")

<All keys matched successfully>



nomic model loaded successfully on GPU
Embedding dimension: 768
Sample embedding (first 10 dims): [ 1.2799681   0.4015842  -3.5162659  -0.39813256  1.5919122   0.36983135
  0.6751001  -0.6361028   0.6404002  -0.42047468]

minilm model loaded successfully on GPU
Embedding dimension: 384
Sample embedding (first 10 dims): [ 0.08429644  0.05795367  0.00449339  0.10582107  0.00708342 -0.0178447
 -0.01688805 -0.01522829  0.04047313  0.03342254]


## 4. Create Embeddings for Each Chunking Strategy

In [4]:
def create_embeddings(chunks, method_name, embedding_model, model_name, batch_size=32):
    print(f"\nCreating embeddings for {method_name} using {model_name}")
    
    texts = [chunk['text'] for chunk in chunks]
    
    embeddings = embedding_model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    print(f"Created {len(embeddings)} embeddings of dimension {embeddings.shape[1]}")
    
    return embeddings

embeddings_data = {}

for model_name, model in embedding_models.items():
    embeddings_data[model_name] = {}
    for method in chunking_methods:
        embeddings = create_embeddings(chunks_data[method], method, model, model_name)
        embeddings_data[model_name][method] = embeddings
        
        embeddings_file = VECTOR_STORE_DIR / f'embeddings_{model_name}_{method}.npy'
        np.save(embeddings_file, embeddings)
        print(f"Saved embeddings to {embeddings_file}")

print("\n")
for model_name in embedding_models.keys():
    print(f"\n{model_name.upper()} embeddings:")
    for method, emb in embeddings_data[model_name].items():
        print(f"  {method:15s}: {emb.shape[0]:4d} embeddings x {emb.shape[1]} dims")


Creating embeddings for page using nomic


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Created 881 embeddings of dimension 768
Saved embeddings to vector_stores\embeddings_nomic_page.npy

Creating embeddings for fixed_size using nomic


Batches:   0%|          | 0/49 [00:00<?, ?it/s]

Created 1540 embeddings of dimension 768
Saved embeddings to vector_stores\embeddings_nomic_fixed_size.npy

Creating embeddings for sentence using nomic


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Created 1577 embeddings of dimension 768
Saved embeddings to vector_stores\embeddings_nomic_sentence.npy

Creating embeddings for page using minilm


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Created 881 embeddings of dimension 384
Saved embeddings to vector_stores\embeddings_minilm_page.npy

Creating embeddings for fixed_size using minilm


Batches:   0%|          | 0/49 [00:00<?, ?it/s]

Created 1540 embeddings of dimension 384
Saved embeddings to vector_stores\embeddings_minilm_fixed_size.npy

Creating embeddings for sentence using minilm


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Created 1577 embeddings of dimension 384
Saved embeddings to vector_stores\embeddings_minilm_sentence.npy



NOMIC embeddings:
  page           :  881 embeddings x 768 dims
  fixed_size     : 1540 embeddings x 768 dims
  sentence       : 1577 embeddings x 768 dims

MINILM embeddings:
  page           :  881 embeddings x 384 dims
  fixed_size     : 1540 embeddings x 384 dims
  sentence       : 1577 embeddings x 384 dims


## 5. Build Vector Store (ChromaDB)

In [5]:
def create_chroma_collection(method_name, model_name, chunks, embeddings):
    print(f"\nCreating ChromaDB collection for {method_name} with {model_name}")
    
    chroma_client = chromadb.PersistentClient(
        path=str(VECTOR_STORE_DIR / 'chroma_db')
    )

    collection_name = f"survival_{model_name}_{method_name}"
    try:
        chroma_client.delete_collection(name=collection_name)
    except:
        pass
    
    collection = chroma_client.create_collection(
        name=collection_name,
        metadata={"chunking_method": method_name, "model": model_name}
    )
    
    ids = [f"{model_name}_{chunk['chunk_id']}" for chunk in chunks]
    documents = [chunk['text'] for chunk in chunks]
    metadatas = [{
        'source': chunk['source'],
        'chunk_method': chunk['chunk_method'],
        'model': model_name
    } for chunk in chunks]
    
    batch_size = 100
    for i in tqdm(range(0, len(ids), batch_size), desc="Adding to ChromaDB"):
        batch_end = min(i + batch_size, len(ids))
        collection.add(
            ids=ids[i:batch_end],
            embeddings=embeddings[i:batch_end].tolist(),
            documents=documents[i:batch_end],
            metadatas=metadatas[i:batch_end]
        )
    
    print(f"Added {len(ids)} documents to collection '{collection.name}'")
    return collection

chroma_collections = {}

for model_name in embedding_models.keys():
    chroma_collections[model_name] = {}
    for method in chunking_methods:
        collection = create_chroma_collection(
            method,
            model_name,
            chunks_data[method],
            embeddings_data[model_name][method]
        )
        chroma_collections[model_name][method] = collection


Creating ChromaDB collection for page with nomic


Adding to ChromaDB:   0%|          | 0/9 [00:00<?, ?it/s]

Added 881 documents to collection 'survival_nomic_page'

Creating ChromaDB collection for fixed_size with nomic


Adding to ChromaDB:   0%|          | 0/16 [00:00<?, ?it/s]

Added 1540 documents to collection 'survival_nomic_fixed_size'

Creating ChromaDB collection for sentence with nomic


Adding to ChromaDB:   0%|          | 0/16 [00:00<?, ?it/s]

Added 1577 documents to collection 'survival_nomic_sentence'

Creating ChromaDB collection for page with minilm


Adding to ChromaDB:   0%|          | 0/9 [00:00<?, ?it/s]

Added 881 documents to collection 'survival_minilm_page'

Creating ChromaDB collection for fixed_size with minilm


Adding to ChromaDB:   0%|          | 0/16 [00:00<?, ?it/s]

Added 1540 documents to collection 'survival_minilm_fixed_size'

Creating ChromaDB collection for sentence with minilm


Adding to ChromaDB:   0%|          | 0/16 [00:00<?, ?it/s]

Added 1577 documents to collection 'survival_minilm_sentence'


## 6. Implement TF-IDF Indices

Create TF-IDF indices for the classic sparse baseline comparison.

In [6]:
def create_tfidf_index(chunks, method_name):
    print(f"\nCreating TF-IDF index for {method_name}")
    
    documents = [chunk['text'] for chunk in chunks]

    tfidf_vectorizer = TfidfVectorizer(
        lowercase=True,
        max_features=10000,  
        ngram_range=(1, 2), 
        min_df=2,
        max_df=0.8
    )
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    
    tfidf_file = VECTOR_STORE_DIR / f'tfidf_{method_name}.pkl'
    with open(tfidf_file, 'wb') as f:
        pickle.dump({
            'vectorizer': tfidf_vectorizer,
            'matrix': tfidf_matrix
        }, f)
    
    print(f"TF-IDF index created with {len(documents)} documents")
    print(f"Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
    print(f"Saved to {tfidf_file}")
    
    return tfidf_vectorizer, tfidf_matrix

tfidf_indices = {}

for method in chunking_methods:
    tfidf_vectorizer, tfidf_matrix = create_tfidf_index(chunks_data[method], method)
    tfidf_indices[method] = {
        'vectorizer': tfidf_vectorizer,
        'matrix': tfidf_matrix
    }


Creating TF-IDF index for page
TF-IDF index created with 881 documents
Vocabulary size: 10000
Saved to vector_stores\tfidf_page.pkl

Creating TF-IDF index for fixed_size
TF-IDF index created with 881 documents
Vocabulary size: 10000
Saved to vector_stores\tfidf_page.pkl

Creating TF-IDF index for fixed_size
TF-IDF index created with 1540 documents
Vocabulary size: 10000
Saved to vector_stores\tfidf_fixed_size.pkl

Creating TF-IDF index for sentence
TF-IDF index created with 1540 documents
Vocabulary size: 10000
Saved to vector_stores\tfidf_fixed_size.pkl

Creating TF-IDF index for sentence
TF-IDF index created with 1577 documents
Vocabulary size: 10000
Saved to vector_stores\tfidf_sentence.pkl
TF-IDF index created with 1577 documents
Vocabulary size: 10000
Saved to vector_stores\tfidf_sentence.pkl


## 7. Implement Retrieval Methods

Compare three retrieval approaches:
1. **Dense (nomic)**: Modern semantic embeddings (nomic-embed-text-v1.5)
2. **Dense (minilm)**: Classic semantic baseline (all-MiniLM-L6-v2)
3. **TF-IDF (sparse)**: Traditional term frequency baseline

In [7]:
class RetrievalSystem:
    
    def __init__(self, method_name, chunks, chroma_collections, tfidf_index, embedding_models):
        self.method_name = method_name
        self.chunks = chunks
        self.chroma_collections = chroma_collections 
        self.tfidf_vectorizer = tfidf_index['vectorizer']
        self.tfidf_matrix = tfidf_index['matrix']
        self.embedding_models = embedding_models 
    
    def dense_retrieval(self, query, model_name='nomic', top_k=5):
        query_embedding = self.embedding_models[model_name].encode([query])[0].tolist()
        
        results = self.chroma_collections[model_name].query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )
        
        retrieved_docs = []
        for i, (doc_id, doc, distance) in enumerate(zip(
            results['ids'][0],
            results['documents'][0],
            results['distances'][0]
        )):
            similarity = 1 / (1 + distance)
            retrieved_docs.append({
                'rank': i + 1,
                'chunk_id': doc_id,
                'text': doc,
                'score': similarity,
                'method': f'dense_{model_name}'
            })
        
        return retrieved_docs
    
    def tfidf_retrieval(self, query, top_k=5):
        query_vector = self.tfidf_vectorizer.transform([query])
        
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        retrieved_docs = []
        for i, idx in enumerate(top_indices):
            chunk = self.chunks[idx]
            retrieved_docs.append({
                'rank': i + 1,
                'chunk_id': chunk['chunk_id'],
                'text': chunk['text'],
                'score': float(similarities[idx]),
                'method': 'tfidf'
            })
        
        return retrieved_docs
    
    def retrieve(self, query, method='dense_nomic', top_k=5):
        if method == 'dense_nomic':
            return self.dense_retrieval(query, model_name='nomic', top_k=top_k)
        elif method == 'dense_minilm':
            return self.dense_retrieval(query, model_name='minilm', top_k=top_k)
        elif method == 'tfidf':
            return self.tfidf_retrieval(query, top_k=top_k)
        else:
            raise ValueError(f"Unknown method: {method}. Use 'dense_nomic', 'dense_minilm', or 'tfidf'")

retrieval_systems = {}

for method in chunking_methods:
    retrieval_systems[method] = RetrievalSystem(
        method_name=method,
        chunks=chunks_data[method],
        chroma_collections={
            'nomic': chroma_collections['nomic'][method],
            'minilm': chroma_collections['minilm'][method]
        },
        tfidf_index=tfidf_indices[method],
        embedding_models=embedding_models
    )

## 8. Test Retrieval Systems

In [8]:
test_queries = [
    "How do I find water in the desert?",
    "What should I do if I encounter a snake?",
    "How to build a shelter in cold weather?"
]

def display_results(query, results, show_text=True):
    print(f"\nQuery: '{query}'")
    
    for result in results:
        print(f"\nRank {result['rank']} | Score: {result['score']:.4f} | ID: {result['chunk_id']}")
        if show_text:
            text_preview = result['text'][:200] + "..." if len(result['text']) > 200 else result['text']
            print(f"Text: {text_preview}")

test_query = test_queries[0]
test_chunking = 'sentence'

print(f"\nTesting retrieval methods with chunking strategy: {test_chunking}")
print(f"Query: '{test_query}'")

retriever = retrieval_systems[test_chunking]

for retrieval_method in ['dense_nomic', 'dense_minilm', 'tfidf']:
    print(f"\nRetrieval method: {retrieval_method.upper()}")
    
    results = retriever.retrieve(test_query, method=retrieval_method, top_k=3)
    display_results(test_query, results, show_text=True)


Testing retrieval methods with chunking strategy: sentence
Query: 'How do I find water in the desert?'

Retrieval method: DENSE_NOMIC

Query: 'How do I find water in the desert?'

Rank 1 | Score: 0.0063 | ID: nomic_FM3-05-70_sent_156
Text: from cloth. Following are signs to watch for in the desert to help you find water: • All trails lead to water. You should follow in the direction in which the trails converge. Signs of camps, campfire...

Rank 2 | Score: 0.0047 | ID: nomic_FM3-05-70_sent_387
Text: LOW RAINFALL 13-11. Low rainfall is the most obvious environmental factor in an arid area. Some desert areas receive less than 10 centimeters (4 inches) of rain annually, and this rain comes in brief ...

Rank 3 | Score: 0.0047 | ID: nomic_FM21-76_sent_288
Text: • Intense sunlight and heat. • Wide temperature range. • Sparse vegetation. • High mineral content near ground surface. • Sandstorms. • Mirages. Low Rainfall Low rainfall is the most obvious environme...

Retrieval method: DENSE_MI

## 9. Compare All Combinations

In [9]:
test_query = "How to purify water for drinking?"

print(f"Query: '{test_query}'")
print(f"{'Chunking':<15} | {'Retrieval':<15} | {'Top 3 Results (Score | Preview)':<85}")

for chunking in chunking_methods:
    for retrieval_method in ['dense_nomic', 'dense_minilm', 'tfidf']:
        retriever = retrieval_systems[chunking]
        results = retriever.retrieve(test_query, method=retrieval_method, top_k=3)
        
        result_str = ""
        for r in results:
            preview = r['text'][:80].replace('\n', ' ')
            result_str += f"{r['score']:.3f}: {preview}... | "
        
        print(f"{chunking:<15} | {retrieval_method:<15} | {result_str}")

print("=" * 120)

Query: 'How to purify water for drinking?'
Chunking        | Retrieval       | Top 3 Results (Score | Preview)                                                      
page            | dense_nomic     | 0.005: You will need at least three stills to meet your individual daily water intake n... | 0.005: Figure 6-9. Belowground Still to Get Potable Water From Polluted Water WATER PUR... | 0.005: Note: These procedures only clear the water and make it more palatable. You will... | 
page            | dense_minilm    | 0.662: Figure 6-9. Belowground Still to Get Potable Water From Polluted Water WATER PUR... | 0.628: You will need at least three stills to meet your individual daily water intake n... | 0.549: Note: If you do not have a canteen, a cup, a can, or other type of container, im... | 
page            | tfidf           | 0.226: You will need at least three stills to meet your individual daily water intake n... | 0.218: Figure 6-9. Belowground Still to Get Potable Water From Polluted Wa

## 10. Save Retrieval Systems Configuration

In [10]:
config = {
    'embedding_models': {
        'nomic': {
            'name': 'nomic-ai/nomic-embed-text-v1.5',
            'dims': 768,
            'context_length': 8192
        },
        'minilm': {
            'name': 'all-MiniLM-L6-v2',
            'dims': 384,
            'context_length': 512
        }
    },
    'chunking_methods': chunking_methods,
    'retrieval_methods': ['dense_nomic', 'dense_minilm', 'tfidf'],
    'collections': {
        method: {
            'num_chunks': len(chunks_data[method]),
            'nomic_embedding_file': f'embeddings_nomic_{method}.npy',
            'minilm_embedding_file': f'embeddings_minilm_{method}.npy',
            'tfidf_file': f'tfidf_{method}.pkl',
            'chroma_nomic_collection': f'survival_nomic_{method}',
            'chroma_minilm_collection': f'survival_minilm_{method}'
        }
        for method in chunking_methods
    }
}

config_file = VECTOR_STORE_DIR / 'retrieval_config.json'
with open(config_file, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

print(f"Configuration saved to {config_file}")
print("\nConfiguration:")
print(json.dumps(config, indent=2))

Configuration saved to vector_stores\retrieval_config.json

Configuration:
{
  "embedding_models": {
    "nomic": {
      "name": "nomic-ai/nomic-embed-text-v1.5",
      "dims": 768,
      "context_length": 8192
    },
    "minilm": {
      "name": "all-MiniLM-L6-v2",
      "dims": 384,
      "context_length": 512
    }
  },
  "chunking_methods": [
    "page",
    "fixed_size",
    "sentence"
  ],
  "retrieval_methods": [
    "dense_nomic",
    "dense_minilm",
    "tfidf"
  ],
  "collections": {
    "page": {
      "num_chunks": 881,
      "nomic_embedding_file": "embeddings_nomic_page.npy",
      "minilm_embedding_file": "embeddings_minilm_page.npy",
      "tfidf_file": "tfidf_page.pkl",
      "chroma_nomic_collection": "survival_nomic_page",
      "chroma_minilm_collection": "survival_minilm_page"
    },
    "fixed_size": {
      "num_chunks": 1540,
      "nomic_embedding_file": "embeddings_nomic_fixed_size.npy",
      "minilm_embedding_file": "embeddings_minilm_fixed_size.npy",
    

## 11. Files 

In [11]:
print("Files Created:")
for file in sorted(VECTOR_STORE_DIR.glob('*')):
    if file.is_file():
        size = file.stat().st_size / 1024
        print(f"{file.name:40s} ({size:8.1f} KB)")
    else:
        num_files = len(list(file.rglob('*')))
        print(f"{file.name + '/':<40s} ({num_files} files)")

Files Created:
chroma_db/                               (35 files)
embeddings_minilm_fixed_size.npy         (  2310.1 KB)
embeddings_minilm_page.npy               (  1321.6 KB)
embeddings_minilm_sentence.npy           (  2365.6 KB)
embeddings_nomic_fixed_size.npy          (  4620.1 KB)
embeddings_nomic_page.npy                (  2643.1 KB)
embeddings_nomic_sentence.npy            (  4731.1 KB)
retrieval_config.json                    (     1.5 KB)
tfidf_fixed_size.pkl                     (  3029.8 KB)
tfidf_page.pkl                           (  2337.3 KB)
tfidf_sentence.pkl                       (  2600.6 KB)
