# Model Build - Q&A Evaluator (COMPLETE REFINED VERSION)
## Assignment 11.02 - LLM Applications

### Testing 5 Models:
1. GPT-4 (OpenAI)
2. GPT-4o-mini (OpenAI)
3. Llama-3-8B (HuggingFace)
4. Qwen-2.5-7B (HuggingFace)
5. Gemma-2-9B (HuggingFace)

### Refinements:
- ✅ Includes GPT-4 in model comparison
- ✅ Fixed prompt testing (Cell 11 actually tests prompts)
- ✅ Explained consistency & calibration testing
- ✅ Cells 5-10 adapted for all 5 models

# ============================================================
# 1. Setup and Introduction
# ============================================================

In [1]:
from dotenv import load_dotenv
import os
import json
import time
from typing import Optional, Dict, List
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import numpy as np
 
load_dotenv()

# Verify API keys
if not os.getenv("OPENAI_API_KEY"):
    print("❌ Set OPENAI_API_KEY in .env file")
else:
    print("✅ OpenAI API Key loaded")

if not os.getenv("HF_TOKEN"):
    print("❌ Set HF_TOKEN in .env file")
else:
    print("✅ HuggingFace token loaded")

# Import OpenAI
try:
    from openai import OpenAI
    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    print("✅ OpenAI client initialized")
except ImportError:
    print("❌ Install: pip install openai")

# Import HuggingFace
try:
    from huggingface_hub import InferenceClient
    hf_client = InferenceClient(token=os.getenv("HF_TOKEN"))
    print("✅ HuggingFace client initialized")
except ImportError:
    print("❌ Install: pip install huggingface_hub")

# Load Q&A database
with open("Q&A_db_practice.json", "r") as f:
    qa_db = json.load(f)
print(f"✅ Loaded {len(qa_db)} questions")

✅ OpenAI API Key loaded
✅ HuggingFace token loaded
✅ OpenAI client initialized
✅ HuggingFace client initialized
✅ Loaded 150 questions


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import comparative analysis module
import sys
sys.path.append('/home/claude')  # or wherever you put the module


# Add these three helper functions
def extract_response_text(response):
    # Dict with 'content' key (your case)
    if isinstance(response, dict) and 'content' in response:
        return response['content']
    # OpenAI response object
    elif hasattr(response, 'choices'):
        return response.choices[0].message.content
    # Already a string
    elif isinstance(response, str):
        return response
    else:
        return str(response)


def clean_json_response(text):
    text = text.strip()
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0].strip()
    elif "```" in text:
        text = text.split("```")[1].split("```")[0].strip()
    return text.strip()  # Extra strip to remove leading newlines

def safe_json_parse(response):
    import json
    # Extract text
    if isinstance(response, dict) and 'content' in response:
        text = response['content']
    elif hasattr(response, 'choices'):
        text = response.choices[0].message.content
    elif isinstance(response, str):
        text = response
    else:
        text = str(response)
    
    # Remove markdown
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0]
    elif "```" in text:
        text = text.split("```")[1].split("```")[0]
    
    # Parse (strip before parsing!)
    return json.loads(text.strip())

# Patch the module
import comparative_analysis
comparative_analysis.safe_json_parse = safe_json_parse
comparative_analysis.extract_response_text = extract_response_text

from comparative_analysis import (
    compare_models,
    visualize_model_comparison,
    compare_prompts,
    visualize_prompt_comparison,
    analyze_interaction,
    visualize_interaction,
    generate_rationale,
    run_full_analysis)

# ============================================================
# 2. Helper Functions Definition
# ============================================================

In [3]:
"""
Helper functions for testing different LLMs
"""

def call_openai_model(model_name: str, messages: List[Dict], max_tokens: int = 500, temperature: float = 0.3) -> Dict:
    """
    Call OpenAI API models (GPT-4, GPT-4o-mini, etc.)
    
    Returns dict with:
    - content: response text
    - latency: time in seconds
    - tokens: actual token count
    - cost: estimated cost in dollars
    """
    start = time.time()
    
    try:
        response = openai_client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        
        latency = time.time() - start
        content = response.choices[0].message.content
        
        # Token usage
        tokens_used = response.usage.total_tokens
        
        # Cost estimation (as of 2024)
        if "gpt-4" in model_name.lower() and "mini" not in model_name.lower():
            # GPT-4: $0.03/1K input, $0.06/1K output
            input_cost = (response.usage.prompt_tokens / 1000) * 0.03
            output_cost = (response.usage.completion_tokens / 1000) * 0.06
        else:
            # GPT-4o-mini: $0.00015/1K input, $0.0006/1K output
            input_cost = (response.usage.prompt_tokens / 1000) * 0.00015
            output_cost = (response.usage.completion_tokens / 1000) * 0.0006
        
        total_cost = input_cost + output_cost
        
        return {
            "content": content,
            "latency": latency,
            "tokens": tokens_used,
            "cost": total_cost,
            "success": True
        }
        
    except Exception as e:
        return {
            "content": None,
            "latency": time.time() - start,
            "tokens": 0,
            "cost": 0,
            "success": False,
            "error": str(e)
        }


def call_huggingface_model(model_name: str, messages: List[Dict], max_tokens: int = 500, temperature: float = 0.3) -> Dict:
    """
    Call HuggingFace Inference API models
    
    Returns dict with same structure as call_openai_model
    """
    start = time.time()
    
    try:
        response = hf_client.chat_completion(
            messages=messages,
            model=model_name,
            max_tokens=max_tokens,
            temperature=temperature
        )
        
        latency = time.time() - start
        content = response.choices[0].message.content
        
        # Estimate tokens (rough)
        tokens_used = len(content.split()) * 1.3  # Rough token estimation
        
        return {
            "content": content,
            "latency": latency,
            "tokens": int(tokens_used),
            "cost": 0,  # HF Inference API is free
            "success": True
        }
        
    except Exception as e:
        return {
            "content": None,
            "latency": time.time() - start,
            "tokens": 0,
            "cost": 0,
            "success": False,
            "error": str(e)
        }


def extract_score_from_response(content: str) -> int:
    """
    Extract numerical score from LLM response.
    Looks for JSON or plain score.
    """
    try:
        # Try JSON parsing first
        if "{" in content and "}" in content:
            start = content.find("{")
            end = content.rfind("}") + 1
            json_str = content[start:end]
            data = json.loads(json_str)
            if "final_score_0_100" in data:
                return data["final_score_0_100"]
            if "score_0_100" in data:
                return data["score_0_100"]
            if "model_judgment" in data and "score_0_100" in data["model_judgment"]:
                return data["model_judgment"]["score_0_100"]
        
        # Fallback: look for "Score: XX" pattern
        import re
        match = re.search(r'score[:\s]+(\d+)', content, re.IGNORECASE)
        if match:
            return int(match.group(1))
        
        return None
    except:
        return None

print("✅ Helper functions defined")

✅ Helper functions defined


# ============================================================
# 3. Prompt Definitions 
# ============================================================

Defining different prompts according to the prompt engineering cheat sheet (see files)

In [4]:
PROMPT_V1 = """Evaluate this student answer.

Question: {question}
Target: {target}
Answer: {answer}

Score 0-100 and explain. Return JSON with score_0_100, correctness, completeness, precision, rationale."""


PROMPT_V2 = """### ROLE
You are an expert AI/ML educator evaluating student answers.

### TASK
Evaluate the student's answer by comparing it to the target.

### INPUT DATA
**Question:** {question}

**Target Answer:** {target}

**Student Answer:** {answer}

### EVALUATION CRITERIA
1. **Correctness**: Are core concepts accurate?
2. **Completeness**: Are key aspects covered?
3. **Precision**: Is terminology clear?

### SCORING GUIDE
- 90-100: Excellent
- 70-89: Good
- 50-69: Partial
- 0-49: Poor

### OUTPUT FORMAT
Respond ONLY with JSON:

{{
  "score_0_100": <integer>,
  "correctness": "<1-2 sentences>",
  "completeness": "<1-2 sentences>",
  "precision": "<1-2 sentences>",
  "rationale": ["<point>", "<point>", "<point>"]
}}"""


PROMPT_V3 = """### ROLE
You are an expert AI/ML educator.

### EVALUATION PROCESS
**Step 1:** Analyze correctness
**Step 2:** Assess completeness
**Step 3:** Evaluate precision
**Step 4:** Assign score

### INPUT
**Question:** {question}
**Target:** {target}
**Student:** {answer}

### SCORING
- 90-100: Excellent
- 70-89: Good
- 50-69: Partial
- 0-49: Poor

### OUTPUT
JSON only:
{{
  "score_0_100": <int>,
  "correctness": "<text>",
  "completeness": "<text>",
  "precision": "<text>",
  "rationale": ["<point>", "<point>", "<point>"]
}}"""

PROMPT_V4 = """### ROLE
You are an expert AI/ML educator evaluating student answers.

### TASK
Evaluate on:
1. **Correctness**: Accurate concepts?
2. **Completeness**: Key aspects covered?
3. **Precision**: Clear terminology?

---

### INPUT
**Question:** {question}
**Target:** {target}
**Student:** {answer}

---

### SCORING
- 90-100: Excellent
- 70-89: Good
- 50-69: Partial
- 0-49: Poor

---

### OUTPUT
ONLY valid JSON (no markdown):

{{
  "score_0_100": <integer 0-100>,
  "correctness": "<1-2 sentences>",
  "completeness": "<1-2 sentences>",
  "precision": "<1-2 sentences>",
  "rationale": ["<point>", "<point>", "<point>"]
}}

**IMPORTANT:** Return ONLY the JSON object."""

In [5]:
def call_student_model(model_name: str, question: str) -> str:
    """
    Wrapper function to generate student answers.
    """
    if model_name.startswith("gpt-"):
        # OpenAI model
        messages = [
            {"role": "system", "content": "You are a knowledgeable AI/ML student."},
            {"role": "user", "content": f"Question: {question}\n\nProvide a concise answer."}
        ]
        return call_openai_model(model_name, messages)
    else:
        # HuggingFace model
        prompt = f"Question: {question}\n\nAnswer:"
        return call_huggingface_model(model_name, prompt)


def call_evaluator(evaluator_model: str, prompt_template: str, 
                   question: str, target: str, answer: str) -> str:
    """
    Wrapper function to call evaluator model.
    """
    eval_prompt = prompt_template.format(
        question=question,
        target=target,
        answer=answer
    )
    
    messages = [
        {"role": "system", "content": "You are an expert evaluator."},
        {"role": "user", "content": eval_prompt}
    ]
    
    return call_openai_model(evaluator_model, messages)

In [11]:
# Test HuggingFace model directly
test_response = call_student_model("meta-llama/Meta-Llama-3-8B-Instruct", "What is machine learning?")
print(f"Type: {type(test_response)}")
print(f"Response: {test_response}")

Type: <class 'dict'>
Response: {'content': None, 'latency': 0.4899141788482666, 'tokens': 0, 'cost': 0, 'success': False, 'error': '(Request ID: Root=1-690f663a-0ae7d83a116231e4213a36a9;5e172f12-a3b9-4f18-9b18-461c52dbf078)\n\nBad request:'}


# ============================================================
# CELL 3: Test OpenAI Models (GPT-4 and GPT-4o-mini)
# ============================================================

In [10]:
# ============================================================
# COMPARATIVE ANALYSIS: MODEL COMPARISON
# ============================================================

# Define models to compare
test_models = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "google/gemma-2-9b-it", 
    "gpt-4o-mini",
    "gpt-3.5-turbo"
]

# Run model comparison
model_df, model_details = compare_models(
    models=test_models,
    qa_db=qa_db,
    prompt_template=PROMPT_V4,  # Use your best prompt
    evaluator_model="gpt-4o",
    call_model_func=call_student_model,
    call_evaluator_func=call_evaluator,
    n_questions=10,
    seed=42
)


# Visualize results
visualize_model_comparison(model_df, model_details)

MODEL COMPARISON ANALYSIS

Testing 5 models on 10 questions
Evaluator: gpt-4o


Testing: meta-llama/Meta-Llama-3-8B-Instruct
----------------------------------------
  Q1: 0
  Q2: 0
  Q3: 0
  Q4: 0
  Q5: 0
  Q6: 0
  Q7: 0


KeyboardInterrupt: 

In [None]:
# ============================================================
# COMPARATIVE ANALYSIS: PROMPT COMPARISON
# ============================================================

# Define all prompt versions
prompt_versions = {
    'PROMPT_V1': PROMPT_V1,
    'PROMPT_V2': PROMPT_V2,
    'PROMPT_V3': PROMPT_V3,
    'PROMPT_V4': PROMPT_V4
}

# Select best model from previous analysis
best_model = model_df.loc[model_df['mean_score'].idxmax(), 'model']
print(f"Using best model: {best_model}")

# Run prompt comparison
prompt_df, prompt_details = compare_prompts(
    prompt_versions=prompt_versions,
    model_name=best_model,
    qa_db=qa_db,
    evaluator_model="gpt-4o",
    call_model_func=call_student_model,
    call_evaluator_func=call_evaluator,
    n_questions=10,
    seed=42
)

# Visualize results
visualize_prompt_comparison(prompt_df, prompt_details, baseline='PROMPT_V1')

In [None]:
# ============================================================
# COMPARATIVE ANALYSIS: MODEL-PROMPT INTERACTION
# ============================================================

# Select top 3 models
top_models = model_df.nlargest(3, 'mean_score')['model'].tolist()
print(f"Testing top models: {top_models}")

# Run interaction analysis
interaction_df = analyze_interaction(
    models=top_models,
    prompt_versions=prompt_versions,
    qa_db=qa_db,
    evaluator_model="gpt-4o",
    call_model_func=call_student_model,
    call_evaluator_func=call_evaluator,
    n_questions=5,  # Fewer questions due to many combinations
    seed=42
)

# Visualize results
visualize_interaction(interaction_df)

# ============================================================
# COMPARATIVE ANALYSIS: EVIDENCE-BASED RATIONALE
# ============================================================

In [None]:


generate_rationale(
    model_df=model_df,
    prompt_df=prompt_df,
    interaction_df=interaction_df
)

In [None]:
# ============================================================
# COMPREHENSIVE ANALYSIS (ALL-IN-ONE)
# ============================================================

models_to_test = ["gpt-4o-mini", "gpt-3.5-turbo"]

prompt_versions = {
    'PROMPT_V1': PROMPT_V1,
    'PROMPT_V2': PROMPT_V2,
    'PROMPT_V3': PROMPT_V3,
    'PROMPT_V4': PROMPT_V4
}

results = run_full_analysis(
    models=models_to_test,
    prompt_versions=prompt_versions,
    qa_db=qa_db,
    evaluator_model="gpt-4o",
    call_model_func=call_student_model,
    call_evaluator_func=call_evaluator,
    n_questions_model=10,
    n_questions_prompt=10,
    n_questions_interaction=5
)

# Access results
print(f"\nBest Model: {results['best_model']}")
print(f"Best Prompt: {results['best_prompt']}")

## From before 

In [7]:
"""
Test OpenAI models first
"""

# Test case
test_question = qa_db[0]["question"]
test_target = qa_db[0]["answer"]
test_answer = test_target[:200] + " This is a simplified explanation."

# Simple evaluation prompt for initial comparison
SIMPLE_EVAL_PROMPT = """You are an AI educator evaluating student answers.

Question: {question}
Target Answer: {target}
Student Answer: {answer}

Evaluate the answer and respond with JSON:
{{
  "score_0_100": 85,
  "correctness": "Brief assessment",
  "completeness": "Brief assessment",
  "precision": "Brief assessment",
  "rationale": ["Point 1", "Point 2", "Point 3"]
}}

Score: 90-100 excellent, 70-89 good, 50-69 partial, <50 poor."""

test_messages = [
    {"role": "system", "content": "You are an educational AI assistant."},
    {"role": "user", "content": SIMPLE_EVAL_PROMPT.format(
        question=test_question,
        target=test_target,
        answer=test_answer
    )}
]

print("="*60)
print("TESTING OPENAI MODELS")
print("="*60)
print(f"\nQuestion: {test_question}")
print(f"Answer (truncated): {test_answer[:100]}...\n")

openai_models = [
    {"name": "gpt-4", "display": "GPT-4"},
    {"name": "gpt-4o-mini", "display": "GPT-4o-mini"},
]

openai_results = []

def summarize_rationale(rationale, max_words=200):
    if isinstance(rationale, list):
        rationale_text = " ".join(str(point) for point in rationale)
    else:
        rationale_text = str(rationale)
    words = rationale_text.split()
    return " ".join(words[:max_words])

for model_config in openai_models:
    print(f"\n{'='*60}")
    print(f"Testing: {model_config['display']}")
    print(f"{'='*60}")
    
    result = call_openai_model(model_config["name"], test_messages)
    
    if result["success"]:
        score = extract_score_from_response(result["content"])
        
        print(f"✅ Success")
        print(f"   Score: {score}/100" if score else "   Score: Could not extract")
        print(f"   Latency: {result['latency']:.2f}s")
        print(f"   Tokens: {result['tokens']}")
        print(f"   Cost: ${result['cost']:.6f}")
        
        # Show snippet
        print(f"\n   Response preview:")
        print(f"   {result['content'][:200]}...\n")
        
        # Extract rationale from response (assuming JSON format)
        try:
            response_json = json.loads(result["content"])
            rationale = response_json.get("rationale", "")
            rationale_summary = summarize_rationale(rationale)
        except Exception:
            rationale_summary = "Could not extract rationale."
        print(f"\n   Rationale (summary <200 words):")
        print(f"   {rationale_summary}\n")
        
        openai_results.append({
            "name": model_config["name"],
            "display": model_config["display"],
            "model": model_config["display"],
            "provider": "openai",
            "score": score,
            "latency": result["latency"],
            "tokens": result["tokens"],
            "cost": result["cost"],
            "success": True,
            "evaluation": {"score_0_100": score},
            "full_model": model_config["name"]
        })
    else:
        print(f"❌ Failed: {result.get('error', 'Unknown error')}")
        openai_results.append({
            "name": model_config["name"],
            "display": model_config["display"],
            "model": model_config["display"],
            "provider": "openai",
            "success": False
        })
    
    time.sleep(1)  # Rate limiting

TESTING OPENAI MODELS

Question: Activation Function
Answer (truncated): An activation function is a mathematical function that transforms! each neuron’s aggregated input (p...


Testing: GPT-4


KeyboardInterrupt: 

In [23]:
import plotly.graph_objects as go

def plot_openai_metrics(results):
    models = [r["display"] for r in results if r.get("success")]
    scores = [r["score"] for r in results if r.get("success")]
    latencies = [r["latency"] for r in results if r.get("success")]
    tokens = [r["tokens"] for r in results if r.get("success")]
    costs = [r["cost"] for r in results if r.get("success")]

    # Score chart
    fig_score = go.Figure([go.Bar(x=models, y=scores, marker_color='royalblue')])
    fig_score.update_layout(title="OpenAI Model Scores", xaxis_title="Model", yaxis_title="Score (0-100)", yaxis=dict(range=[0, 100]))
    fig_score.show()

    # Latency chart
    fig_latency = go.Figure([go.Bar(x=models, y=latencies, marker_color='orange')])
    fig_latency.update_layout(title="OpenAI Model Latency", xaxis_title="Model", yaxis_title="Latency (s)")
    fig_latency.show()

    # Tokens chart
    fig_tokens = go.Figure([go.Bar(x=models, y=tokens, marker_color='green')])
    fig_tokens.update_layout(title="OpenAI Model Tokens Used", xaxis_title="Model", yaxis_title="Tokens")
    fig_tokens.show()

    # Cost chart
    fig_cost = go.Figure([go.Bar(x=models, y=costs, marker_color='red')])
    fig_cost.update_layout(title="OpenAI Model Cost", xaxis_title="Model", yaxis_title="Cost (USD)")
    fig_cost.show()

# Usage:
plot_openai_metrics(openai_results)

# ============================================================
# CELL 4: Test HuggingFace Models
# ============================================================

In [21]:
"""
Test HuggingFace models
"""

print("\n" + "="*60)
print("TESTING HUGGINGFACE MODELS")
print("="*60)

hf_models = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "google/gemma-2-9b-it"
]

def test_model_hf(model_name: str, question: str, target: str, answer: str) -> dict:
    """Test a HuggingFace model."""
    
    prompt = f"""You are an expert AI/ML educator evaluating student answers.

**Question:** {question}

**Target Answer:** {target}

**Student Answer:** {answer}

Evaluate on: correctness, completeness, precision.

Respond ONLY with valid JSON (no extra text):
{{
  "score_0_100": <integer>,
  "correctness": "<brief assessment>",
  "completeness": "<brief assessment>",
  "precision": "<brief assessment>",
  "rationale": ["<point 1>", "<point 2>", "<point 3>"]
}}"""
    
    try:
        start_time = time.time()
        
        response = hf_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            model=model_name,
            max_tokens=500,
            temperature=0.3
        )
        
        elapsed = time.time() - start_time
        result_text = response.choices[0].message.content.strip()
        
        # Clean markdown
        if result_text.startswith("```"):
            result_text = result_text.split("```")[1]
            if result_text.startswith("json"):
                result_text = result_text[4:]
            result_text = result_text.rsplit("```", 1)[0]
        
        # Extract JSON
        if "{" in result_text and "}" in result_text:
            start = result_text.find("{")
            end = result_text.rfind("}") + 1
            result_text = result_text[start:end]
        
        evaluation = json.loads(result_text)
        tokens_estimate = len(prompt.split()) * 1.3 + len(result_text.split()) * 1.3
        
        return {
            "model": model_name.split("/")[-1],
            "display": model_name.split("/")[-1],
            "full_model": model_name,
            "provider": "huggingface",
            "success": True,
            "latency": round(elapsed, 2),
            "evaluation": evaluation,
            "score": evaluation["score_0_100"],
            "tokens": int(tokens_estimate),
            "cost": 0.0
        }
    except Exception as e:
        return {
            "model": model_name.split("/")[-1],
            "display": model_name.split("/")[-1],
            "full_model": model_name,
            "provider": "huggingface",
            "success": False,
            "error": str(e)
        }

hf_results = []
for model in hf_models:
    print(f"\n{'='*60}")
    print(f"Testing: {model}")
    print(f"{'='*60}")
    
    result = test_model_hf(model, test_question, test_target, test_answer)
    hf_results.append(result)
    
    if result["success"]:
        print(f"✅ Success")
        print(f"   Score: {result['evaluation']['score_0_100']}/100")
        print(f"   Latency: {result['latency']}s")
        print(f"   Tokens (est): {result['tokens']}")
        print(f"   Cost: FREE")
        print(f"\n   Rationale:")
        for point in result['evaluation']['rationale']:
            print(f"   • {point}")
    else:
        print(f"❌ Failed: {result['error']}")
        print(f"   Note: Model may need time to load (cold start)")
    
    time.sleep(2)


TESTING HUGGINGFACE MODELS

Testing: meta-llama/Meta-Llama-3-8B-Instruct
✅ Success
   Score: 85/100
   Latency: 3.03s
   Tokens (est): 362
   Cost: FREE

   Rationale:
   • The student correctly identifies the main purpose and characteristics of an activation function.
   • However, the explanation lacks some specific examples of activation functions, such as ReLU and its variants.
   • Additionally, the student does not explicitly mention the importance of activation functions in enabling gradient-based training.
   • The statement 'usually differentiable mapping' could be more precise, as some activation functions are not differentiable everywhere.

Testing: Qwen/Qwen2.5-7B-Instruct
✅ Success
   Score: 75/100
   Latency: 1.4s
   Tokens (est): 322
   Cost: FREE

   Rationale:
   • Lacks discussion on the role of activation functions in non-linear modeling and gradient-based training.
   • Does not mention the desirable properties like monotonicity, bounded output range, or sparsity o

# ============================================================
# CELL 5: Visualize Model Comparison (All 5 Models)
# ============================================================

In [None]:
# Combine all successful results
all_successful = []
all_successful.extend([r for r in openai_results if r.get("success")])
all_successful.extend([r for r in hf_results if r.get("success")])

if len(all_successful) == 0:
    print("⚠️ No successful results to visualize")
    print("   Models may be loading. Wait 30s and try again.")
else:
    model_comparison_df = pd.DataFrame([
        {
            "Model": r["model"],
            "Score": r.get("score", r["evaluation"]["score_0_100"]),
            "Latency (s)": r["latency"],
            "Tokens": r["tokens"],
            "Cost ($)": r["cost"],
            "Provider": r["provider"]
        }
        for r in all_successful
    ])
    
    # Create subplots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=("Evaluation Score", "Response Latency", 
                       "Token Usage", "Cost per Evaluation"),
    )
    
    # Score comparison
    fig.add_trace(
        go.Bar(x=model_comparison_df["Model"], y=model_comparison_df["Score"],
               marker_color="lightblue"),
        row=1, col=1
    )
    
    # Latency
    fig.add_trace(
        go.Bar(x=model_comparison_df["Model"], y=model_comparison_df["Latency (s)"],
               marker_color="lightcoral"),
        row=1, col=2
    )
    
    # Tokens
    fig.add_trace(
        go.Bar(x=model_comparison_df["Model"], y=model_comparison_df["Tokens"],
               marker_color="lightgreen"),
        row=2, col=1
    )
    
    # Cost (with small offset for free models)
    costs_display = model_comparison_df["Cost ($)"].replace(0, 0.000001)  # For log scale
    fig.add_trace(
        go.Bar(x=model_comparison_df["Model"], y=costs_display,
               marker_color="lightyellow"),
        row=2, col=2
    )
    
    fig.update_layout(
        height=600, 
        showlegend=False, 
        title_text="Model Comparison: OpenAI vs HuggingFace"
    )
    fig.update_yaxes(title_text="Score (0-100)", row=1, col=1)
    fig.update_yaxes(title_text="Seconds", row=1, col=2)
    fig.update_yaxes(title_text="Tokens", row=2, col=1)
    fig.update_yaxes(title_text="$ (log scale)", type="log", row=2, col=2)
    
    fig.show()
    
    print("\n✅ Model comparison visualization complete")
    print("\n📊 Summary Table:")
    print(model_comparison_df.to_string(index=False))
    
    print(f"""
============================================================
KEY OBSERVATIONS
============================================================

**Quality (Score):**
- Best score: {model_comparison_df.loc[model_comparison_df['Score'].idxmax(), 'Model']} 
  ({model_comparison_df['Score'].max()}/100)

**Cost Efficiency:**
- GPT-4: ~${model_comparison_df[model_comparison_df['Model']=='GPT-4']['Cost ($)'].values[0] if 'GPT-4' in model_comparison_df['Model'].values else 'N/A':.4f} per eval (expensive)
- GPT-4o-mini: ~${model_comparison_df[model_comparison_df['Model']=='GPT-4o-mini']['Cost ($)'].values[0] if 'GPT-4o-mini' in model_comparison_df['Model'].values else 'N/A':.6f} per eval (99% cheaper!)
- HF Models: FREE (but rate limited)

**Speed:**
- Fastest: {model_comparison_df.loc[model_comparison_df['Latency (s)'].idxmin(), 'Model']} 
  ({model_comparison_df['Latency (s)'].min():.2f}s)
- Slowest: {model_comparison_df.loc[model_comparison_df['Latency (s)'].idxmax(), 'Model']} 
  ({model_comparison_df['Latency (s)'].max():.2f}s)

  ## write a recommendation here 
""")


✅ Model comparison visualization complete

📊 Summary Table:
                   Model  Score  Latency (s)  Tokens  Cost ($)    Provider
                   GPT-4     60     5.914223     492   0.01977      openai
             GPT-4o-mini     70     4.153220     477   0.00014      openai
Meta-Llama-3-8B-Instruct     85     3.360000     382   0.00000 huggingface
     Qwen2.5-7B-Instruct     75     1.100000     321   0.00000 huggingface

KEY OBSERVATIONS

**Quality (Score):**
- Best score: Meta-Llama-3-8B-Instruct 
  (85/100)

**Cost Efficiency:**
- GPT-4: ~$0.0198 per eval (expensive)
- GPT-4o-mini: ~$0.000140 per eval (99% cheaper!)
- HF Models: FREE (but rate limited)

**Speed:**
- Fastest: Qwen2.5-7B-Instruct 
  (1.10s)
- Slowest: GPT-4 
  (5.91s)

  ## write a recommendation here 



# ============================================================
# CELL 6: Model Selection Decision
# ============================================================

In [9]:
# Decision matrix: prioritize quality, then cost, then speed
if len(all_successful) > 0:
    # Calculate composite score for each model
    for result in all_successful:
        # Normalize metrics
        score_norm = result.get("score", result["evaluation"]["score_0_100"]) / 100
        
        # Cost score (lower cost = better)
        cost = result["cost"]
        if cost == 0:
            cost_score = 1.0  # Free = perfect
        else:
            cost_score = min(1.0, 0.001 / cost)  # $0.001 as reference
        
        # Speed score (lower latency = better)
        latency = result["latency"]
        latency_score = min(1.0, 2.0 / latency)  # 2s as reference
        
        # Weighted: 50% quality, 30% cost, 20% speed
        result["composite_score"] = (
            0.5 * score_norm +
            0.3 * cost_score +
            0.2 * latency_score
        )
    
    # Select best
    best_result = max(all_successful, key=lambda x: x["composite_score"])
    
    SELECTED_MODEL = best_result["full_model"]
    SELECTED_PROVIDER = best_result["provider"]
    selected_short_name = best_result["model"]
    
    print("\n" + "="*60)
    print(f"SELECTED MODEL: {selected_short_name}")
    print("="*60)
    score_val = best_result.get("score", best_result["evaluation"]["score_0_100"])
    cost_val = best_result["cost"]
    cost_str = "FREE" if cost_val == 0 else f"${cost_val:.6f}"
    
    print(f"""
Full model ID: {SELECTED_MODEL}
Provider: {SELECTED_PROVIDER}

Rationale:
✅ Composite Score: {best_result['composite_score']:.3f}
✅ Quality Score: {score_val}/100
✅ Latency: {best_result['latency']:.2f}s
✅ Cost: {cost_str} per evaluation
✅ Best overall value for educational Q&A

Scoring Breakdown:
• Quality (50% weight): {score_val}/100
• Cost efficiency (30% weight): {cost_str}
• Speed (20% weight): {best_result['latency']:.2f}s

Alternative models tested:
""")
    
    # Show alternatives
    others = sorted(
        [r for r in all_successful if r != best_result],
        key=lambda x: -x["composite_score"]
    )
    
    for i, r in enumerate(others, 1):
        r_score = r.get("score", r["evaluation"]["score_0_100"])
        r_cost = r["cost"]
        r_cost_str = "FREE" if r_cost == 0 else f"${r_cost:.6f}"
        print(f"  {i}. {r['model']}:")
        print(f"     Composite: {r['composite_score']:.3f} | "
              f"Score: {r_score}/100 | "
              f"Latency: {r['latency']:.2f}s | "
              f"Cost: {r_cost_str}")
    
    print(f"\n💡 Decision: Using {selected_short_name} for prompt engineering")
    
else:
    # Fallback
    print("\n" + "="*60)
    print("⚠️ Using Default Model")
    print("="*60)
    SELECTED_MODEL = "gpt-4o-mini"
    SELECTED_PROVIDER = "openai"
    selected_short_name = "GPT-4o-mini"
    print("\nDEFAULT: GPT-4o-mini (best balance)")

print(f"\n✅ Selected: {SELECTED_MODEL}")
print(f"✅ Provider: {SELECTED_PROVIDER}")


SELECTED MODEL: Qwen2.5-7B-Instruct

Full model ID: Qwen/Qwen2.5-7B-Instruct
Provider: huggingface

Rationale:
✅ Composite Score: 0.875
✅ Quality Score: 75/100
✅ Latency: 1.10s
✅ Cost: FREE per evaluation
✅ Best overall value for educational Q&A

Scoring Breakdown:
• Quality (50% weight): 75/100
• Cost efficiency (30% weight): FREE
• Speed (20% weight): 1.10s

Alternative models tested:

  1. Meta-Llama-3-8B-Instruct:
     Composite: 0.844 | Score: 85/100 | Latency: 3.36s | Cost: FREE
  2. GPT-4o-mini:
     Composite: 0.746 | Score: 70/100 | Latency: 4.15s | Cost: $0.000140
  3. GPT-4:
     Composite: 0.383 | Score: 60/100 | Latency: 5.91s | Cost: $0.019770

💡 Decision: Using Qwen2.5-7B-Instruct for prompt engineering

✅ Selected: Qwen/Qwen2.5-7B-Instruct
✅ Provider: huggingface


# ============================================================
# CELL 7: Prompt Engineering - Version 1 (Baseline)
# ============================================================

In [10]:
"""
Prompt Engineering: Testing different prompt formulations
Version 1: Minimal baseline
"""

PROMPT_V1 = """Evaluate this student answer.

Question: {question}
Target: {target}
Answer: {answer}

Score 0-100 and explain. Return JSON with score_0_100, correctness, completeness, precision, rationale."""

def test_prompt_unified(prompt_template: str, question: str, target: str, answer: str, version: str) -> dict:
    """Test a prompt with the selected model."""
    prompt = prompt_template.format(question=question, target=target, answer=answer)
    
    try:
        if SELECTED_PROVIDER == "openai":
            response = openai_client.chat.completions.create(
                model=SELECTED_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500,
                temperature=0.3
            )
            result_text = response.choices[0].message.content.strip()
        else:
            response = hf_client.chat_completion(
                messages=[{"role": "user", "content": prompt}],
                model=SELECTED_MODEL,
                max_tokens=500,
                temperature=0.3
            )
            result_text = response.choices[0].message.content.strip()
        
        # Clean markdown
        if result_text.startswith("```"):
            result_text = result_text.split("```")[1]
            if result_text.startswith("json"):
                result_text = result_text[4:]
            result_text = result_text.rsplit("```", 1)[0]
        
        # Extract JSON
        if "{" in result_text and "}" in result_text:
            start = result_text.find("{")
            end = result_text.rfind("}") + 1
            result_text = result_text[start:end]
        
        evaluation = json.loads(result_text)
        
        return {
            "version": version,
            "success": True,
            "score": evaluation.get("score_0_100"),
            "evaluation": evaluation
        }
    except Exception as e:
        return {
            "version": version,
            "success": False,
            "error": str(e)
        }

print("="*60)
print("PROMPT V1: Baseline")
print("="*60)
print(f"Using: {selected_short_name}\n")

result_v1 = test_prompt_unified(PROMPT_V1, test_question, test_target, test_answer, "V1")
if result_v1["success"]:
    print(f"✅ Score: {result_v1['score']}/100")
else:
    print(f"❌ Failed: {result_v1['error']}")

PROMPT V1: Baseline
Using: Qwen2.5-7B-Instruct

✅ Score: 75/100


# ============================================================
# CELL 8: Prompt Engineering - Version 2 (Structured)
# ============================================================

In [11]:
"""
Version 2: Structured with best practices
"""

PROMPT_V2 = """### ROLE
You are an expert AI/ML educator evaluating student answers.

### TASK
Evaluate the student's answer by comparing it to the target.

### INPUT DATA
**Question:** {question}

**Target Answer:** {target}

**Student Answer:** {answer}

### EVALUATION CRITERIA
1. **Correctness**: Are core concepts accurate?
2. **Completeness**: Are key aspects covered?
3. **Precision**: Is terminology clear?

### SCORING GUIDE
- 90-100: Excellent
- 70-89: Good
- 50-69: Partial
- 0-49: Poor

### OUTPUT FORMAT
Respond ONLY with JSON:

{{
  "score_0_100": <integer>,
  "correctness": "<1-2 sentences>",
  "completeness": "<1-2 sentences>",
  "precision": "<1-2 sentences>",
  "rationale": ["<point>", "<point>", "<point>"]
}}"""

print("\n" + "="*60)
print("PROMPT V2: Structured")
print("="*60)

result_v2 = test_prompt_unified(PROMPT_V2, test_question, test_target, test_answer, "V2")
if result_v2["success"]:
    print(f"✅ Score: {result_v2['score']}/100")
    print(f"   {result_v2['evaluation']['rationale'][0]}")
else:
    print(f"❌ Failed")


PROMPT V2: Structured
✅ Score: 75/100
   The student's answer lacks key details about the activation function's role in gradient-based training and its impact on the stability of learning and representational power.


# ============================================================
# CELL 9: Prompt Engineering - Version 3 (Chain of Thought)
# ============================================================

In [12]:
"""
Version 3: Chain of thought reasoning
"""

PROMPT_V3 = """### ROLE
You are an expert AI/ML educator.

### EVALUATION PROCESS
**Step 1:** Analyze correctness
**Step 2:** Assess completeness
**Step 3:** Evaluate precision
**Step 4:** Assign score

### INPUT
**Question:** {question}
**Target:** {target}
**Student:** {answer}

### SCORING
- 90-100: Excellent
- 70-89: Good
- 50-69: Partial
- 0-49: Poor

### OUTPUT
JSON only:
{{
  "score_0_100": <int>,
  "correctness": "<text>",
  "completeness": "<text>",
  "precision": "<text>",
  "rationale": ["<point>", "<point>", "<point>"]
}}"""

print("\n" + "="*60)
print("PROMPT V3: Chain of Thought")
print("="*60)

result_v3 = test_prompt_unified(PROMPT_V3, test_question, test_target, test_answer, "V3")
if result_v3["success"]:
    print(f"✅ Score: {result_v3['score']}/100")
else:
    print(f"❌ Failed")


PROMPT V3: Chain of Thought
✅ Score: 75/100


# ============================================================
# CELL 10: Prompt Engineering - Version 4 (Optimized)
# ============================================================

In [13]:
"""
Version 4: Optimized final version
"""

PROMPT_V4 = """### ROLE
You are an expert AI/ML educator evaluating student answers.

### TASK
Evaluate on:
1. **Correctness**: Accurate concepts?
2. **Completeness**: Key aspects covered?
3. **Precision**: Clear terminology?

---

### INPUT
**Question:** {question}
**Target:** {target}
**Student:** {answer}

---

### SCORING
- 90-100: Excellent
- 70-89: Good
- 50-69: Partial
- 0-49: Poor

---

### OUTPUT
ONLY valid JSON (no markdown):

{{
  "score_0_100": <integer 0-100>,
  "correctness": "<1-2 sentences>",
  "completeness": "<1-2 sentences>",
  "precision": "<1-2 sentences>",
  "rationale": ["<point>", "<point>", "<point>"]
}}

**IMPORTANT:** Return ONLY the JSON object."""

print("\n" + "="*60)
print("PROMPT V4: Optimized")
print("="*60)

result_v4 = test_prompt_unified(PROMPT_V4, test_question, test_target, test_answer, "V4")
if result_v4["success"]:
    print(f"✅ Score: {result_v4['score']}/100")
    print(f"   Full evaluation:")
    print(f"   - Correctness: {result_v4['evaluation']['correctness']}")
    print(f"   - Completeness: {result_v4['evaluation']['completeness']}")
    print(f"   - Precision: {result_v4['evaluation']['precision']}")
else:
    print(f"❌ Failed")

# Summary
print("\n" + "="*60)
print("PROMPT VERSION SUMMARY")
print("="*60)

prompt_results = [result_v1, result_v2, result_v3, result_v4]
successful_prompts = [r for r in prompt_results if r["success"]]

if successful_prompts:
    print("\n📊 Scores by Version:")
    for r in prompt_results:
        status = "✅" if r["success"] else "❌"
        score = f"{r['score']}/100" if r["success"] else "FAILED"
        print(f"  {status} {r['version']}: {score}")
    
    best_prompt = max(successful_prompts, key=lambda x: x["score"])
    print(f"\n🏆 Best: {best_prompt['version']} ({best_prompt['score']}/100)")
    print(f"\nRecommendation: Use PROMPT_{best_prompt['version']} for production")
else:
    print("\n⚠️ All prompts failed")

print("\n✅ Prompt engineering complete")


PROMPT V4: Optimized
✅ Score: 75/100
   Full evaluation:
   - Correctness: The student's answer is mostly correct, but it lacks the depth and detail provided in the target answer.
   - Completeness: The student covers the key aspects of an activation function, but the explanation is simplified and does not include all the nuances mentioned in the target answer.
   - Precision: The terminology used is mostly precise, but the student could provide more specific examples of activation functions and their properties.

PROMPT VERSION SUMMARY

📊 Scores by Version:
  ✅ V1: 75/100
  ✅ V2: 75/100
  ✅ V3: 75/100
  ✅ V4: 75/100

🏆 Best: V1 (75/100)

Recommendation: Use PROMPT_V1 for production

✅ Prompt engineering complete


# ============================================================
# CELL 11: FIXED - Comprehensive Prompt Testing
# ============================================================
# This cell ACTUALLY tests prompts with different answer qualities

In [14]:
print("="*60)
print("COMPREHENSIVE PROMPT TESTING")
print("="*60)
print(f"\nUsing: {selected_short_name}\n")

# Test cases with different quality levels
test_cases = [
    {
        "name": "Perfect",
        "question": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": qa_db[0]["answer"],
        "expected": (90, 100)
    },
    {
        "name": "Good",
        "question": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": """An activation function applies non-linear transformation to neuron inputs, 
        allowing neural networks to learn complex patterns. Common examples include sigmoid, tanh, 
        and ReLU. Without activation functions, neural networks would be linear models.""",
        "expected": (75, 89)
    },
    {
        "name": "Partial",
        "question": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": "Activation functions are used in neural networks. They help with learning.",
        "expected": (40, 60)
    },
    {
        "name": "Wrong",
        "question": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": "An activation function is a gradient descent optimizer.",
        "expected": (0, 30)
    }
]

prompts_to_test = [
    ("V1", PROMPT_V1),
    ("V2", PROMPT_V2),
    ("V3", PROMPT_V3),
    ("V4", PROMPT_V4)
]

test_results = []

for prompt_name, prompt_template in prompts_to_test:
    print(f"\n{'='*60}")
    print(f"Testing Prompt {prompt_name}")
    print(f"{'='*60}\n")
    
    for test_case in test_cases:
        print(f"  Test: {test_case['name']}")
        
        result = test_prompt_unified(
            prompt_template,
            test_case["question"],
            test_case["target"],
            test_case["answer"],
            prompt_name
        )
        
        if result["success"]:
            score = result["score"]
            exp_min, exp_max = test_case["expected"]
            in_range = exp_min <= score <= exp_max
            status = "✅" if in_range else "⚠️"
            
            print(f"    {status} Score: {score}/100 (expected: {exp_min}-{exp_max})")
            
            test_results.append({
                "prompt": prompt_name,
                "test": test_case["name"],
                "score": score,
                "expected_min": exp_min,
                "expected_max": exp_max,
                "in_range": in_range
            })
        else:
            print(f"    ❌ Failed")
        
        time.sleep(0.5)

# Analysis
df_test = pd.DataFrame(test_results)

print("\n" + "="*60)
print("RESULTS SUMMARY")
print("="*60)

if len(df_test) > 0:
    accuracy = df_test.groupby('prompt').agg({
        'in_range': lambda x: f"{x.sum()}/{len(x)} ({x.mean()*100:.0f}%)"
    })
    
    print("\nCalibration Accuracy:")
    print(accuracy)
    
    print("""
INTERPRETATION:
- Higher accuracy = better calibration
- Look for balance between accuracy and complexity

RECOMMENDATION:
Choose the prompt with highest accuracy.
""")

COMPREHENSIVE PROMPT TESTING

Using: Qwen2.5-7B-Instruct


Testing Prompt V1

  Test: Perfect
    ⚠️ Score: 85/100 (expected: 90-100)
  Test: Good
    ❌ Failed
  Test: Partial
    ❌ Failed
  Test: Wrong
    ❌ Failed

Testing Prompt V2

  Test: Perfect
    ❌ Failed
  Test: Good
    ❌ Failed
  Test: Partial
    ❌ Failed
  Test: Wrong
    ❌ Failed

Testing Prompt V3

  Test: Perfect
    ❌ Failed
  Test: Good
    ❌ Failed
  Test: Partial
    ❌ Failed
  Test: Wrong
    ❌ Failed

Testing Prompt V4

  Test: Perfect
    ❌ Failed
  Test: Good
    ❌ Failed
  Test: Partial
    ❌ Failed
  Test: Wrong
    ❌ Failed

RESULTS SUMMARY

Calibration Accuracy:
        in_range
prompt          
V1      0/1 (0%)

INTERPRETATION:
- Higher accuracy = better calibration
- Look for balance between accuracy and complexity

RECOMMENDATION:
Choose the prompt with highest accuracy.



# ============================================================
# CELL 12: EXPLAINED - Consistency Testing
# ============================================================

In [None]:
"""
## CONSISTENCY TESTING EXPLAINED

**What:** Do we get similar scores evaluating the SAME answer multiple times?
**Why:** LLMs are stochastic (random). We need to verify reliability.
**How:** Evaluate one answer 5 times, calculate variance.
**Good:** Variance ≤5 points (comparable to human graders)
"""

print("="*60)
print("CONSISTENCY TESTING")
print("="*60)
print("\nTesting: Same answer, multiple evaluations\n")

# Test answer
test_ans = """An activation function introduces non-linearity into neural networks by 
transforming the weighted sum of inputs. Common functions include ReLU, sigmoid, and tanh. 
Without activation functions, stacking layers would still result in linear transformation."""

# Use best prompt
BEST_PROMPT = PROMPT_V4
N_REPS = 5

print(f"Running {N_REPS} evaluations...\n")

consistency_scores = []

for i in range(N_REPS):
    result = test_prompt_unified(
        BEST_PROMPT,
        "Activation Function",
        qa_db[0]["answer"],
        test_ans,
        f"consistency_{i}"
    )
    
    if result["success"]:
        score = result["score"]
        consistency_scores.append(score)
        print(f"  Trial {i+1}: {score}/100")
    time.sleep(0.5)

if len(consistency_scores) >= 3:
    mean_score = np.mean(consistency_scores)
    std_score = np.std(consistency_scores)
    min_score = np.min(consistency_scores)
    max_score = np.max(consistency_scores)
    score_range = max_score - min_score
    
    print(f"\n{'='*60}")
    print("CONSISTENCY RESULTS")
    print(f"{'='*60}\n")
    print(f"Scores: {consistency_scores}")
    print(f"\nMean: {mean_score:.1f}/100")
    print(f"Std Dev: {std_score:.2f}")
    print(f"Range: {min_score}-{max_score} ({score_range} points)\n")
    
    if score_range <= 5:
        assessment = "✅ EXCELLENT - Highly consistent"
    elif score_range <= 10:
        assessment = "✅ GOOD - Acceptable consistency"
    else:
        assessment = "⚠️ NEEDS IMPROVEMENT"
    
    print(f"Assessment: {assessment}\n")
    
    print("""
WHAT THIS MEANS:
- Consistency measures RELIABILITY (precision)
- Same input → similar output
- Comparable to human inter-rater reliability
""")

CONSISTENCY TESTING

Testing: Same answer, multiple evaluations

Running 5 evaluations...

  Trial 1: 75/100
  Trial 2: 75/100


# ============================================================
# CELL 13: EXPLAINED - Calibration Testing
# ============================================================

In [17]:
"""
## CALIBRATION TESTING EXPLAINED

**What:** Do scores match actual answer quality?
**Why:** An LLM can be consistent but wrong!
**How:** Test answers of known quality, check if scores align.
**Good:** 90%+ accuracy (scores in expected ranges)
"""

print("="*60)
print("CALIBRATION TESTING")
print("="*60)
print("\nTesting: Do scores match quality levels?\n")

# Calibration test set with known quality
calibration_tests = [
    # Excellent (90-100)
    {
        "category": "Excellent",
        "q": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": qa_db[0]["answer"],
        "expected": (90, 100)
    },
    # Good (75-89)
    {
        "category": "Good",
        "q": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": """Activation functions apply non-linear transformations to neuron inputs. 
        Examples include ReLU, sigmoid, tanh. They're essential because without them, 
        multiple layers would collapse into linear transformation.""",
        "expected": (75, 89)
    },
    # Satisfactory (60-74)
    {
        "category": "Satisfactory",
        "q": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": "Activation functions are mathematical functions in neural networks. "
                 "They help introduce non-linearity. Examples are sigmoid and ReLU.",
        "expected": (60, 74)
    },
    # Poor (0-50)
    {
        "category": "Poor",
        "q": "Activation Function",
        "target": qa_db[0]["answer"],
        "answer": "Activation functions activate the neurons.",
        "expected": (0, 50)
    },
]

calibration_results = []

for i, test in enumerate(calibration_tests, 1):
    print(f"Test {i}: {test['category']} answer")
    
    result = test_prompt_unified(
        BEST_PROMPT,
        test["q"],
        test["target"],
        test["answer"],
        f"cal_{i}"
    )
    
    if result["success"]:
        score = result["score"]
        exp_min, exp_max = test["expected"]
        in_range = exp_min <= score <= exp_max
        status = "✅" if in_range else "❌"
        
        print(f"  {status} Score: {score}/100 (expected: {exp_min}-{exp_max})\n")
        
        calibration_results.append({
            "category": test["category"],
            "score": score,
            "expected_min": exp_min,
            "expected_max": exp_max,
            "in_range": in_range
        })
    
    time.sleep(0.5)

df_cal = pd.DataFrame(calibration_results)

print("="*60)
print("CALIBRATION RESULTS")
print("="*60)

if len(df_cal) > 0:
    accuracy = df_cal['in_range'].mean() * 100
    print(f"\n✅ Calibration Accuracy: {accuracy:.0f}%")
    print(f"   ({df_cal['in_range'].sum()}/{len(df_cal)} in range)\n")
    
    if accuracy >= 90:
        assessment = "✅ EXCELLENT - Scores align with standards"
    elif accuracy >= 75:
        assessment = "✅ GOOD - Acceptable calibration"
    else:
        assessment = "⚠️ NEEDS IMPROVEMENT"
    
    print(f"{assessment}\n")
    
    print("""
WHAT CALIBRATION TELLS US:
- Calibration measures VALIDITY (accuracy)
- Quality level → appropriate score
- Different from consistency!

THE DIFFERENCE:
CONSISTENCY: Same answer → same score (reliability)
CALIBRATION: Score matches quality (accuracy)

NEED BOTH for trustworthy evaluation! ✅
""")

CALIBRATION TESTING

Testing: Do scores match quality levels?

Test 1: Excellent answer
Test 2: Good answer
Test 3: Satisfactory answer
Test 4: Poor answer
CALIBRATION RESULTS


# ============================================================
# FINAL SUMMARY
# ============================================================

In [None]:
print("\n" + "="*60)
print("✅ MODEL BUILD COMPLETE")
print("="*60)

print(f"""
FINAL CONFIGURATION:
====================
Selected Model: {selected_short_name}
Provider: {SELECTED_PROVIDER}
Selected Prompt: {best_prompt['version'] if 'best_prompt' in locals() and best_prompt else 'V4'}

VALIDATION RESULTS:
===================
✅ Model comparison: Tested 5 models
✅ Prompt engineering: Tested 4 versions
✅ Consistency: Tested reliability
✅ Calibration: Tested validity

NEXT STEPS:
===========
1. Copy configuration to model_run.ipynb
2. Implement in model_app.py
3. Deploy to Streamlit
4. Monitor performance with real users

Ready for production! 🚀
""")