In [3]:
import pandas as pd
import chromadb
from typing import List, Dict
import re
from tqdm import tqdm
import random


def parse_qa_pairs(file_path: str, sample_percentage: float = 0.05, seed: "int | None" = None) -> List[Dict[str, str]]:
    """
    Parse Q&A pairs from a blank-line-separated text file and sub-sample them.

    Each chunk of the file is scanned for a line starting with ``Question:``
    and a line starting with ``Answer:``; chunks missing either are skipped.
    Every pair whose text mentions a pregnancy keyword is kept, while the
    remaining pairs are randomly sampled down to ``sample_percentage``
    (at least one pair when any exist).

    Args:
        file_path: Path to the UTF-8 text file holding the Q&A pairs.
        sample_percentage: Fraction (0-1) of the non-pregnancy pairs to keep.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` the global ``random`` state is used,
            exactly as before.

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``id``
        (``qa_<n>``) and ``is_pregnancy``. Pregnancy pairs come first, in
        file order, followed by the sampled remainder.
    """
    question_prefix = 'Question:'
    answer_prefix = 'Answer:'
    pregnancy_keywords = ('pregnant', 'pregnancy', 'prenatal', 'maternal', 'gestation')

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Pairs are separated by blank lines; drop whitespace-only chunks
    valid_pairs = [p for p in content.split('\n\n') if p.strip()]

    # Separate pregnancy-related pairs from others
    pregnancy_pairs = []
    other_pairs = []

    for pair in valid_pairs:
        question = ""
        answer = ""

        for line in pair.strip().split('\n'):
            # Slice off only the leading label: str.replace would also delete
            # the label text if it reappeared later in the same line.
            if line.startswith(question_prefix):
                question = line[len(question_prefix):].strip()
            elif line.startswith(answer_prefix):
                answer = line[len(answer_prefix):].strip()

        if not (question and answer):
            continue

        searchable_text = (question + answer).lower()
        is_pregnancy = any(keyword in searchable_text for keyword in pregnancy_keywords)

        qa_data = {
            'question': question,
            'answer': answer,
            'is_pregnancy': is_pregnancy
        }
        (pregnancy_pairs if is_pregnancy else other_pairs).append(qa_data)

    # Keep at least one non-pregnancy pair when any exist, but never report
    # (or request) more than are actually available.
    total_other_pairs = len(other_pairs)
    target_sample_size = min(max(1, int(total_other_pairs * sample_percentage)), total_other_pairs)

    print(f"Total Q&A pairs found: {len(valid_pairs)}")
    print(f"Pregnancy-related pairs: {len(pregnancy_pairs)} (keeping all)")
    print(f"Other pairs: {total_other_pairs}")
    print(f"Sampling {target_sample_size} other pairs ({sample_percentage*100:.1f}%)")
    print(f"Total final pairs: {len(pregnancy_pairs) + target_sample_size}")

    # A dedicated generator keeps seeded runs independent of global random state
    sampler = random if seed is None else random.Random(seed)
    sampled_other_pairs = sampler.sample(other_pairs, target_sample_size)

    # Combine pregnancy pairs with sampled other pairs
    all_selected_pairs = pregnancy_pairs + sampled_other_pairs

    # Assign sequential ids over the final selection
    qa_pairs = []
    for i, qa in enumerate(tqdm(all_selected_pairs, desc="Parsing Q&A pairs")):
        qa_pairs.append({
            'question': qa['question'],
            'answer': qa['answer'],
            'id': f'qa_{i}',
            'is_pregnancy': qa['is_pregnancy']
        })

    return qa_pairs


def prepare_nutrition_qa_documents(file_path: str, sample_percentage: float = 0.05) -> Dict:
    """
    Convert Q&A pairs into ChromaDB-ready documents.

    Each pair returned by ``parse_qa_pairs`` becomes one document whose text
    embeds the question and answer, plus a metadata record.

    Args:
        file_path: Path to the Q&A text file, forwarded to ``parse_qa_pairs``.
        sample_percentage: Sampling fraction, forwarded to ``parse_qa_pairs``.

    Returns:
        A dict with three parallel lists: ``documents`` (text to embed),
        ``metadatas`` (``question``, ``answer``, ``is_pregnancy``) and ``ids``.
    """
    qa_pairs = parse_qa_pairs(file_path, sample_percentage)

    documents = []
    metadatas = []
    ids = []

    # Process Q&A pairs with progress bar
    for qa in tqdm(qa_pairs, desc="Preparing documents"):
        # Create rich document text for semantic search.
        # NOTE: .strip() trims only the ends, so the interior lines keep the
        # indentation written here; left as-is to keep stored text stable.
        document_text = f"""
        Question: {qa['question']}
        Answer: {qa['answer']}
        
        This Q&A pair provides information about nutrition and health topics.
        """.strip()

        # The query cells read the question and answer back from the result
        # metadata, so both must be stored next to the pregnancy flag.
        metadata = {
            "question": qa['question'],
            "answer": qa['answer'],
            "is_pregnancy": qa['is_pregnancy']
        }

        documents.append(document_text)
        metadatas.append(metadata)
        ids.append(qa['id'])

    return {"documents": documents, "metadatas": metadatas, "ids": ids}


def setup_nutrition_qa_chromadb(file_path: str, collection_name: str = "nutrition_qna", sample_percentage: float = 0.05, chunk_size: int = 50):
    """
    Create and populate a ChromaDB collection with nutrition Q&A data.

    Any existing collection with the same name is dropped and recreated, then
    the prepared documents are inserted in batches of ``chunk_size``.

    Args:
        file_path: Path to the Q&A text file.
        collection_name: Name of the collection to (re)create.
        sample_percentage: Sampling fraction forwarded to document preparation.
        chunk_size: Number of documents per ``collection.add`` call.

    Returns:
        The populated ChromaDB collection.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    import time

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Open (or create) the persistent store next to the notebook directory
    client = chromadb.PersistentClient("../chroma")

    # Create collection (delete if exists)
    try:
        client.delete_collection(collection_name)
    except Exception:
        # Best effort: typically the collection simply does not exist yet.
        # If it does still exist, create_collection below fails loudly.
        # Unlike a bare except, Ctrl-C / SystemExit are not swallowed.
        pass

    collection = client.create_collection(
        name=collection_name,
        metadata={
            "description": "Nutrition Q&A database with questions and answers about nutrition and health"
        },
    )

    # Prepare documents
    print(f"Preparing documents with {sample_percentage*100:.1f}% sample...")
    start_time = time.time()
    data = prepare_nutrition_qa_documents(file_path, sample_percentage)
    prep_time = time.time() - start_time
    print(f"Document preparation took {prep_time:.2f} seconds")

    # Add to ChromaDB in batches with progress bar
    total_docs = len(data['documents'])
    print(f"Adding {total_docs} documents to ChromaDB in chunks of {chunk_size}...")
    start_time = time.time()

    for i in tqdm(range(0, total_docs, chunk_size), desc="Adding documents to ChromaDB"):
        # Slicing clamps at the end of the list, so the last batch may be short
        end_idx = i + chunk_size
        collection.add(
            documents=data['documents'][i:end_idx],
            metadatas=data['metadatas'][i:end_idx],
            ids=data['ids'][i:end_idx]
        )

    add_time = time.time() - start_time
    print(f"ChromaDB insertion took {add_time:.2f} seconds")
    if total_docs:
        # Guarded: an empty dataset would otherwise divide by zero
        print(f"Average time per document: {add_time/total_docs:.3f} seconds")

    print(f"Added {total_docs} Q&A pairs to ChromaDB collection '{collection_name}'")
    return collection


In [4]:
# Build the "nutrition_qna" collection: every pregnancy-related Q&A plus a 5% sample of the rest
collection = setup_nutrition_qa_chromadb(
    file_path="../data/questions_output.txt",
    collection_name="nutrition_qna",
    sample_percentage=0.05,
)


Preparing documents with 5.0% sample...
Total Q&A pairs found: 45798
Pregnancy-related pairs: 3061 (keeping all)
Other pairs: 42737
Sampling 2136 other pairs (5.0%)
Total final pairs: 5197


Parsing Q&A pairs: 100%|██████████| 5197/5197 [00:00<00:00, 1814819.57it/s]
Preparing documents: 100%|██████████| 5197/5197 [00:00<00:00, 112608.80it/s]


Document preparation took 0.21 seconds
Adding 5197 documents to ChromaDB in chunks of 50...


Adding documents to ChromaDB: 100%|██████████| 104/104 [12:16<00:00,  7.08s/it]

ChromaDB insertion took 736.11 seconds
Average time per document: 0.142 seconds
Added 5197 Q&A pairs to ChromaDB collection 'nutrition_qna'





In [None]:
# Smoke-test the populated collection with a few sample queries
chroma_client = chromadb.PersistentClient("../chroma")
nutrition_qna = chroma_client.get_collection(name="nutrition_qna")

# Each entry is (query text, number of results to fetch)
test_queries = [
    ("malnutrition symptoms", 3),
    ("pregnancy nutrition", 2),
    ("biochemical assessment", 2)
]

for query_idx, (query_text, n_results) in enumerate(tqdm(test_queries, desc="Running test queries")):
    print(f"\n=== Query: {query_text} ===")
    results = nutrition_qna.query(query_texts=[query_text], n_results=n_results)
    hits = zip(results["documents"][0], results["metadatas"][0])
    for rank, (doc, meta) in enumerate(hits, start=1):
        meta = meta or {}
        print(f"Result {rank}:")
        if "question" in meta and "answer" in meta:
            print(f"Question: {meta['question']}")
            print(f"Answer: {meta['answer']}")
        else:
            # The metadata schema differs between the builder variants in this
            # notebook; the document text always embeds the question/answer.
            print(doc)
        print(f"Pregnancy-related: {meta.get('is_pregnancy', 'n/a')}")
        print("\n")

    # Separator between queries, skipped after the last one (by position,
    # so repeated query texts cannot confuse the check)
    if query_idx < len(test_queries) - 1:
        print("="*50)


In [None]:
# Additional test: Search for specific nutrition topics
# Each entry is (query text, number of results to fetch)
additional_queries = [("vitamin deficiency", 2)]

for query_text, n_results in tqdm(additional_queries, desc="Running additional tests"):
    print(f"\n=== Query: {query_text} ===")
    results = nutrition_qna.query(query_texts=[query_text], n_results=n_results)
    hits = zip(results["documents"][0], results["metadatas"][0])
    for rank, (doc, meta) in enumerate(hits, start=1):
        meta = meta or {}
        print(f"Result {rank}:")
        if "question" in meta and "answer" in meta:
            print(f"Question: {meta['question']}")
            print(f"Answer: {meta['answer']}")
        else:
            # The metadata schema differs between the builder variants in this
            # notebook; the document text always embeds the question/answer.
            print(doc)
        print(f"Pregnancy-related: {meta.get('is_pregnancy', 'n/a')}")
        print("\n")

print("\n" + "="*50 + "\n")

# Show collection statistics
print("Collection Statistics:")
total_count = nutrition_qna.count()
print(f"Collection contains {total_count} Q&A pairs")

# Count pregnancy-related documents; only the metadata is needed for this
all_docs = nutrition_qna.get(include=["metadatas"])
pregnancy_count = sum(1 for meta in all_docs['metadatas'] if (meta or {}).get('is_pregnancy'))
print(f"Pregnancy-related Q&As: {pregnancy_count}")
print(f"Regular Q&As: {total_count - pregnancy_count}")
print(f"Collection metadata: {nutrition_qna.metadata}")


In [None]:
import re
from typing import Dict, List

import chromadb
import pandas as pd
from tqdm import tqdm


def parse_qa_pairs(file_path: str, sample_percentage: float = 0.05, seed: "int | None" = None) -> List[Dict[str, str]]:
    """
    Parse a random sample of Q&A pairs from a blank-line-separated text file.

    A fraction of the non-empty chunks is sampled first; each sampled chunk is
    then scanned for a line starting with ``Question:`` and one starting with
    ``Answer:``. Chunks missing either are dropped, so the result can be
    shorter than the sample and ids may have gaps.

    Args:
        file_path: Path to the UTF-8 text file holding the Q&A pairs.
        sample_percentage: Fraction (0-1) of the chunks to sample (minimum one).
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` the global ``random`` state is used,
            exactly as before.

    Returns:
        A list of dicts with keys ``question``, ``answer`` and ``id``
        (``qa_<position in the sample>``).
    """
    import random

    question_prefix = 'Question:'
    answer_prefix = 'Answer:'

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Pairs are separated by blank lines; drop whitespace-only chunks (once)
    valid_pairs = [p for p in content.split('\n\n') if p.strip()]

    # Calculate how many pairs to sample
    total_pairs = len(valid_pairs)
    sample_size = max(1, int(total_pairs * sample_percentage))

    print(f"Total Q&A pairs found: {total_pairs}")
    print(f"Sampling {sample_size} pairs ({sample_percentage*100:.1f}%)")

    # A dedicated generator keeps seeded runs independent of global random state
    sampler = random if seed is None else random.Random(seed)
    sampled_pairs = sampler.sample(valid_pairs, min(sample_size, total_pairs))

    qa_pairs = []

    # Process pairs with progress bar
    for i, pair in enumerate(tqdm(sampled_pairs, desc="Parsing Q&A pairs")):
        question = ""
        answer = ""

        for line in pair.strip().split('\n'):
            # Slice off only the leading label: str.replace would also delete
            # the label text if it reappeared later in the same line.
            if line.startswith(question_prefix):
                question = line[len(question_prefix):].strip()
            elif line.startswith(answer_prefix):
                answer = line[len(answer_prefix):].strip()

        if question and answer:
            qa_pairs.append({
                'question': question,
                'answer': answer,
                'id': f'qa_{i}'
            })

    return qa_pairs


def prepare_nutrition_qa_documents(file_path: str, sample_percentage: float = 0.05) -> Dict:
    """
    Convert Q&A pairs into ChromaDB-ready documents.

    Each pair returned by ``parse_qa_pairs`` becomes one document whose text
    embeds the question and answer, plus a metadata record for filtering.

    Args:
        file_path: Path to the Q&A text file, forwarded to ``parse_qa_pairs``.
        sample_percentage: Sampling fraction, forwarded to ``parse_qa_pairs``.

    Returns:
        A dict with three parallel lists: ``documents`` (text to embed),
        ``metadatas`` and ``ids``.
    """
    qa_pairs = parse_qa_pairs(file_path, sample_percentage)

    word_pattern = re.compile(r'\b\w+\b')

    documents = []
    metadatas = []
    ids = []

    # Process Q&A pairs with progress bar
    for qa in tqdm(qa_pairs, desc="Preparing documents"):
        # Create rich document text for semantic search.
        # NOTE: .strip() trims only the ends, so the interior lines keep the
        # indentation written here; left as-is to keep stored text stable.
        document_text = f"""
        Question: {qa['question']}
        Answer: {qa['answer']}
        
        This Q&A pair provides information about nutrition and health topics.
        """.strip()

        # Unique lower-cased words from question and answer. Sorted so the
        # stored keyword string is identical from run to run (plain set
        # iteration order for strings varies between interpreter sessions).
        unique_words = set(word_pattern.findall(qa['question'].lower()))
        unique_words.update(word_pattern.findall(qa['answer'].lower()))

        # Create metadata for filtering and exact lookups
        metadata = {
            "question": qa['question'],
            "answer": qa['answer'],
            "question_length": len(qa['question']),
            "answer_length": len(qa['answer']),
            "keywords": " ".join(sorted(unique_words)),
            "has_question_mark": "?" in qa['question'],
            "topic": "nutrition_qa"
        }

        documents.append(document_text)
        metadatas.append(metadata)
        ids.append(qa['id'])

    return {"documents": documents, "metadatas": metadatas, "ids": ids}


def setup_nutrition_qa_chromadb(file_path: str, collection_name: str = "nutrition_qna", sample_percentage: float = 0.05, chunk_size: int = 50):
    """
    Create and populate a ChromaDB collection with nutrition Q&A data.

    Any existing collection with the same name is dropped and recreated, then
    the prepared documents are inserted in batches of ``chunk_size`` so that
    large samples are never sent in one oversized ``add`` call.

    Args:
        file_path: Path to the Q&A text file.
        collection_name: Name of the collection to (re)create.
        sample_percentage: Sampling fraction forwarded to document preparation.
        chunk_size: Number of documents per ``collection.add`` call.

    Returns:
        The populated ChromaDB collection.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Initialize ChromaDB
    client = chromadb.PersistentClient("../chroma")

    # Create collection (delete if exists)
    try:
        client.delete_collection(collection_name)
    except Exception:
        # Best effort: typically the collection simply does not exist yet.
        # If it does still exist, create_collection below fails loudly.
        # Unlike a bare except, Ctrl-C / SystemExit are not swallowed.
        pass

    collection = client.create_collection(
        name=collection_name,
        metadata={
            "description": "Nutrition Q&A database with questions and answers about nutrition and health"
        },
    )

    # Prepare documents
    data = prepare_nutrition_qa_documents(file_path, sample_percentage)
    total_docs = len(data["documents"])

    # Add to ChromaDB in batches with progress bar
    print("Adding documents to ChromaDB...")
    for start in tqdm(range(0, total_docs, chunk_size), desc="Adding documents to ChromaDB"):
        # Slicing clamps at the end of the list, so the last batch may be short
        stop = start + chunk_size
        collection.add(
            documents=data["documents"][start:stop],
            metadatas=data["metadatas"][start:stop],
            ids=data["ids"][start:stop]
        )

    print(f"Added {total_docs} Q&A pairs to ChromaDB collection '{collection_name}'")
    return collection

In [None]:
# Build the "nutrition_qna" collection from a 5% random sample to keep the run short
collection = setup_nutrition_qa_chromadb(
    file_path="../data/questions_output.txt",
    collection_name="nutrition_qna",
    sample_percentage=0.05,
)

Total Q&A pairs found: 45798
Sampling 2289 pairs (5.0%)


Parsing Q&A pairs: 100%|██████████| 2289/2289 [00:00<00:00, 374678.50it/s]
Preparing documents: 100%|██████████| 2289/2289 [00:00<00:00, 62248.43it/s]

Adding documents to ChromaDB...





In [13]:
# Re-open the persisted store and display how many entries the collection holds
chroma_client = chromadb.PersistentClient(path="../chroma")
nutrition_qna = chroma_client.get_collection("nutrition_qna")
nutrition_qna.count()

0

In [None]:
# Test the setup with a sample query
chroma_client = chromadb.PersistentClient("../chroma")
nutrition_qna = chroma_client.get_collection(name="nutrition_qna")

# Test query 1: Search for pregnancy-related Q&As
print("=== Query: pregnancy ===")
results = nutrition_qna.query(query_texts=["pregnancy"], n_results=3)
hits = zip(results["documents"][0], results["metadatas"][0])
for rank, (doc, meta) in enumerate(hits, start=1):
    meta = meta or {}
    print(f"Result {rank}:")
    if "question" in meta and "answer" in meta:
        print(f"Question: {meta['question']}")
        print(f"Answer: {meta['answer']}")
    else:
        # The metadata schema differs between the builder variants in this
        # notebook; the document text always embeds the question/answer.
        print(doc)
    print("\n")

print("\n" + "="*50 + "\n")


=== Query: pregnancy ===


=== Query: biochemical assessment ===


In [None]:
# Additional test: Search for specific nutrition topics
print("=== Query: vitamin deficiency ===")
results = nutrition_qna.query(query_texts=["vitamin deficiency"], n_results=2)
hits = zip(results["documents"][0], results["metadatas"][0])
for rank, (doc, meta) in enumerate(hits, start=1):
    meta = meta or {}
    print(f"Result {rank}:")
    if "question" in meta and "answer" in meta:
        print(f"Question: {meta['question']}")
        print(f"Answer: {meta['answer']}")
    else:
        # The metadata schema differs between the builder variants in this
        # notebook; the document text always embeds the question/answer.
        print(doc)
    print("\n")

print("\n" + "="*50 + "\n")

# Show collection statistics
print("Collection Statistics:")
print(f"Collection contains {nutrition_qna.count()} Q&A pairs")
print(f"Collection metadata: {nutrition_qna.metadata}")