In [1]:
import os

os.environ['OPENAI_API_KEY'] = "your-key-here"

In [2]:
import os
import warnings
warnings.filterwarnings('ignore')

import chromadb
from chromadb.config import Settings
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision
)
from typing import List, Dict
import openai
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm

In [3]:
# Configuration settings
COLLECTION_NAME = "demo_collection"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
CHROMA_PERSIST_DIR = "./chroma_db"
OPENAI_API_KEY = None  # Set your OpenAI API key here if available
N_RESULTS = 3

# Set OpenAI API key if provided
if os.environ['OPENAI_API_KEY']:
    openai.api_key = os.environ['OPENAI_API_KEY']
    print("üîë OpenAI API key configured")
else:
    print("‚ö†Ô∏è  No OpenAI API key - some evaluation features will be limited")

print("‚öôÔ∏è Configuration set!")


üîë OpenAI API key configured
‚öôÔ∏è Configuration set!


In [4]:
# Initialize ChromaDB client
client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)

# Create or get collection
try:
    collection = client.get_collection(COLLECTION_NAME)
    print(f"‚úÖ Loaded existing collection: {COLLECTION_NAME}")
except:
    collection = client.create_collection(
        name=COLLECTION_NAME,
        metadata={"description": "Demo collection for RAG evaluation"}
    )
    print(f"‚úÖ Created new collection: {COLLECTION_NAME}")

print(f"üìä Current collection size: {collection.count()} documents")

‚úÖ Loaded existing collection: demo_collection
üìä Current collection size: 10 documents


In [5]:
print("üîÑ Loading embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("‚úÖ Embedding model loaded!")

üîÑ Loading embedding model...
‚úÖ Embedding model loaded!


In [6]:
# Sample documents about AI/ML
sample_documents = [
    "Artificial Intelligence (AI) is a branch of computer science that aims to create intelligent machines that work and react like humans.",
    "Machine Learning is a subset of AI that provides systems the ability to automatically learn and improve from experience without being explicitly programmed.",
    "Deep Learning is a subset of machine learning that uses neural networks with multiple layers to model and understand complex patterns.",
    "Natural Language Processing (NLP) is a branch of AI that helps computers understand, interpret and manipulate human language.",
    "Computer Vision is a field of AI that trains computers to interpret and understand the visual world from digital images or videos.",
    "Reinforcement Learning is a type of machine learning where an agent learns to make decisions by performing actions in an environment to maximize reward.",
    "Neural Networks are computing systems inspired by biological neural networks that consist of interconnected nodes processing information.",
    "Supervised Learning is a machine learning approach where algorithms learn from labeled training data to make predictions on new data.",
    "Unsupervised Learning looks for patterns in datasets with no pre-existing labels and minimal human supervision.",
    "Feature Engineering is the process of selecting, modifying, or creating features from raw data to improve machine learning model performance."
]

print(f"üìö Prepared {len(sample_documents)} sample documents")

üìö Prepared 10 sample documents


In [None]:
print("üîÑ Adding documents to ChromaDB...")

# Generate embeddings for documents
print("Creating embeddings...")
embeddings = []
for doc in tqdm(sample_documents, desc="Generating embeddings"):
    embedding = embedding_model.encode([doc])[0].tolist()
    embeddings.append(embedding)

# Create metadata
metadatas = [{"source": f"doc_{i}", "type": "ai_ml_info"} for i in range(len(sample_documents))]

# Generate IDs
existing_count = collection.count()
ids = [f"doc_{existing_count + i}" for i in range(len(sample_documents))]

# Add to collection
collection.add(
    documents=sample_documents,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
)

print(f"‚úÖ Added {len(sample_documents)} documents to collection")
print(f"üìä Total documents in collection: {collection.count()}")

In [7]:
def retrieve_documents(query: str, n_results: int = N_RESULTS):
    """Retrieve relevant documents for a query"""
    # Generate query embedding
    query_embedding = embedding_model.encode([query]).tolist()
    
    # Search collection
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
        include=['documents', 'metadatas', 'distances']
    )
    
    return {
        'documents': results['documents'][0],
        'metadatas': results['metadatas'][0],
        'distances': results['distances'][0]
    }

# Test retrieval with sample queries
test_queries = [
    "What is machine learning?",
    "neural networks",
    "computer vision applications"
]

print("üîç Testing document retrieval...")
for query in test_queries:
    print(f"\nüìù Query: '{query}'")
    results = retrieve_documents(query, n_results=2)
    
    for i, (doc, metadata, distance) in enumerate(zip(
        results['documents'], results['metadatas'], results['distances']
    )):
        similarity = 1 - distance
        print(f"  {i+1}. [Similarity: {similarity:.3f}] {doc[:100]}...")

üîç Testing document retrieval...

üìù Query: 'What is machine learning?'
  1. [Similarity: 0.612] Machine Learning is a subset of AI that provides systems the ability to automatically learn and impr...
  2. [Similarity: 0.401] Supervised Learning is a machine learning approach where algorithms learn from labeled training data...

üìù Query: 'neural networks'
  1. [Similarity: 0.184] Neural Networks are computing systems inspired by biological neural networks that consist of interco...
  2. [Similarity: 0.060] Deep Learning is a subset of machine learning that uses neural networks with multiple layers to mode...

üìù Query: 'computer vision applications'
  1. [Similarity: 0.222] Computer Vision is a field of AI that trains computers to interpret and understand the visual world ...
  2. [Similarity: -0.446] Deep Learning is a subset of machine learning that uses neural networks with multiple layers to mode...


In [8]:
def generate_answer(query: str, context_docs: List[str]) -> str:
    """Generate answer using retrieved context"""
    context = "\n\n".join(context_docs)
    
    prompt = f"""Based on the following context, answer the question clearly and concisely.

Context:
{context}

Question: {query}

Answer:"""
    
    try:
        if OPENAI_API_KEY:
            # Using OpenAI
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=200,
                temperature=0.1
            )
            return response.choices[0].message.content.strip()
        else:
            # Fallback answer
            return f"Based on the retrieved context, here's information about {query}: " \
                   f"[Using simplified answer generation - add your OpenAI API key for better responses]"
    except Exception as e:
        return f"Error generating answer: {str(e)}"

print("‚úÖ Answer generation function ready!")

‚úÖ Answer generation function ready!


In [9]:
# Sample questions for testing
sample_questions = [
    "What is Artificial Intelligence?",
    "How does Machine Learning work?",
    "What is the difference between supervised and unsupervised learning?"
]

print("ü§ñ Running interactive Q&A session...")

qa_results = []
for question in sample_questions:
    print(f"\n{'='*60}")
    print(f"‚ùì Question: {question}")
    
    # Retrieve relevant documents
    retrieval_results = retrieve_documents(question)
    
    print(f"\nüîç Retrieved {len(retrieval_results['documents'])} relevant documents:")
    for i, doc in enumerate(retrieval_results['documents']):
        print(f"  {i+1}. {doc[:80]}...")
    
    # Generate answer
    answer = generate_answer(question, retrieval_results['documents'])
    
    print(f"\nüéØ Generated Answer:")
    print(f"   {answer}")
    
    # Store for evaluation
    qa_results.append({
        'question': question,
        'answer': answer,
        'contexts': retrieval_results['documents']
    })

print(f"\n‚úÖ Completed {len(qa_results)} Q&A interactions")


ü§ñ Running interactive Q&A session...

‚ùì Question: What is Artificial Intelligence?

üîç Retrieved 3 relevant documents:
  1. Artificial Intelligence (AI) is a branch of computer science that aims to create...
  2. Machine Learning is a subset of AI that provides systems the ability to automati...
  3. Computer Vision is a field of AI that trains computers to interpret and understa...

üéØ Generated Answer:
   Based on the retrieved context, here's information about What is Artificial Intelligence?: [Using simplified answer generation - add your OpenAI API key for better responses]

‚ùì Question: How does Machine Learning work?

üîç Retrieved 3 relevant documents:
  1. Machine Learning is a subset of AI that provides systems the ability to automati...
  2. Supervised Learning is a machine learning approach where algorithms learn from l...
  3. Deep Learning is a subset of machine learning that uses neural networks with mul...

üéØ Generated Answer:
   Based on the retrieved con

In [10]:
# Define ground truth answers for evaluation
ground_truth_data = [
    {
        "question": "What is Artificial Intelligence?",
        "ground_truth": "Artificial Intelligence is a branch of computer science that aims to create intelligent machines that work and react like humans."
    },
    {
        "question": "How does Machine Learning work?", 
        "ground_truth": "Machine Learning provides systems the ability to automatically learn and improve from experience without being explicitly programmed."
    },
    {
        "question": "What is the difference between supervised and unsupervised learning?",
        "ground_truth": "Supervised learning uses labeled training data to make predictions, while unsupervised learning looks for patterns in data with no pre-existing labels."
    }
]

print("üìä Creating evaluation dataset...")

# Create evaluation dataset
eval_data = []
for gt in ground_truth_data:
    question = gt["question"]
    ground_truth = gt["ground_truth"]
    
    # Get retrieval results
    retrieval_results = retrieve_documents(question)
    
    # Generate answer
    answer = generate_answer(question, retrieval_results['documents'])
    
    eval_data.append({
        'question': question,
        'answer': answer,
        'contexts': retrieval_results['documents'],
        'ground_truth': ground_truth
    })

# Convert to RAGAS dataset format
eval_df = pd.DataFrame(eval_data)
eval_dataset = Dataset.from_pandas(eval_df)

print(f"‚úÖ Created evaluation dataset with {len(eval_dataset)} examples")
print("\nDataset preview:")
for i, example in enumerate(eval_dataset):
    print(f"  {i+1}. Q: {example['question'][:50]}...")


üìä Creating evaluation dataset...
‚úÖ Created evaluation dataset with 3 examples

Dataset preview:
  1. Q: What is Artificial Intelligence?...
  2. Q: How does Machine Learning work?...
  3. Q: What is the difference between supervised and unsu...


In [None]:
evaluation_results = evaluate(
        dataset=eval_dataset,
        metrics=metrics,
    )

In [11]:
print("üìà Running RAGAS evaluation...")

try:
    # Define metrics to evaluate
    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision
    ]
    
    print("‚è≥ This may take a few minutes...")
    
    # Run evaluation
    evaluation_results = evaluate(
        dataset=eval_dataset,
        metrics=metrics,
        raise_exceptions=False  # Continue evaluation even if some records fail
    )
    
    print("\nüéâ Evaluation Results:")
    print("=" * 40)
    
    # Display aggregate metrics (overall performance)
    print("üìä AGGREGATE METRICS:")
    print("-" * 25)
    
    metric_names = ['faithfulness', 'answer_relevancy', 'context_recall', 'context_precision']
    aggregate_scores = {}
    
    for metric_name in metric_names:
        scores = evaluation_results[metric_name]
        # Calculate mean score across all records
        valid_scores = [s for s in scores if s is not None and not np.isnan(s)]
        if valid_scores:
            avg_score = np.mean(valid_scores)
            aggregate_scores[metric_name] = avg_score
            
            # Color coding for terminal output
            if avg_score > 0.7:
                status = "üü¢ Excellent"
            elif avg_score > 0.5:
                status = "üü° Good"
            else:
                status = "üî¥ Needs Improvement"
            
            print(f"{metric_name:20s}: {avg_score:.3f} {status}")
    
    # Create detailed per-record results
    print(f"\nüìã PER-RECORD RESULTS SUMMARY:")
    print("-" * 35)
    
    detailed_results = []
    
    for i in range(len(eval_dataset)):
        record_result = {
            'record_id': i,
            'question': eval_dataset['question'][i][:100] + "..." if len(eval_dataset['question'][i]) > 100 else eval_dataset['question'][i]
        }
        
        # Add per-record scores
        for metric_name in metric_names:
            if i < len(evaluation_results[metric_name]):
                score = evaluation_results[metric_name][i]
                record_result[metric_name] = score if score is not None else 'N/A'
            else:
                record_result[metric_name] = 'N/A'
        
        # Calculate average score per record
        scores = [record_result[metric] for metric in metric_names if isinstance(record_result[metric], (int, float))]
        record_result['avg_score'] = np.mean(scores) if scores else 0
        
        detailed_results.append(record_result)
    
    # Overall summary
    print(f"üìä Overall Performance Summary:")
    if aggregate_scores:
        overall_avg = np.mean(list(aggregate_scores.values()))
        print(f"Average Score Across All Metrics: {overall_avg:.3f}")
        
        # Count records by performance level
        excellent_count = sum(1 for r in detailed_results if r['avg_score'] > 0.7)
        good_count = sum(1 for r in detailed_results if 0.5 < r['avg_score'] <= 0.7)
        poor_count = sum(1 for r in detailed_results if r['avg_score'] <= 0.5)
        
        print(f"üü¢ Excellent records (>0.7): {excellent_count}/{len(detailed_results)}")
        print(f"üü° Good records (0.5-0.7): {good_count}/{len(detailed_results)}")
        print(f"üî¥ Poor records (‚â§0.5): {poor_count}/{len(detailed_results)}")
    
    # Save detailed results to variable for further analysis
    per_record_results = detailed_results
    print(f"\nüíæ Per-record results saved to 'per_record_results' variable")
    print(f"   Use per_record_results to analyze individual record performance")
    
except Exception as e:
    print(f"‚ö†Ô∏è Evaluation Error: {e}")
    print("üí° Note: Full RAGAS evaluation requires OpenAI API access for some metrics")
    print("   Set OPENAI_API_KEY in Cell 2 to enable complete evaluation")

üìà Running RAGAS evaluation...
‚è≥ This may take a few minutes...


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:05<00:00,  2.25it/s]



üéâ Evaluation Results:
üìä AGGREGATE METRICS:
-------------------------
faithfulness        : 0.333 üî¥ Needs Improvement
answer_relevancy    : 0.000 üî¥ Needs Improvement
context_recall      : 1.000 üü¢ Excellent
context_precision   : 1.000 üü¢ Excellent

üìã PER-RECORD RESULTS SUMMARY:
-----------------------------------
üìä Overall Performance Summary:
Average Score Across All Metrics: 0.583
üü¢ Excellent records (>0.7): 0/3
üü° Good records (0.5-0.7): 3/3
üî¥ Poor records (‚â§0.5): 0/3

üíæ Per-record results saved to 'per_record_results' variable
   Use per_record_results to analyze individual record performance
