# Model Testing - Q&A Evaluator
Testing all core functions with various scenarios

In [1]:
# ============================================================================
# SETUP 
# ============================================================================


In [2]:

import sys
import os
import json

# Ensure model_app.py is importable
sys.path.insert(0, os.getcwd())

from model_app import (
    load_qa_database,
    get_question,
    compute_rouge,
    evaluate_with_llm,
    evaluate_answer,
    analyze_sentiment_llm,
    record_feedback,
    generate_novice_answer,
    FEEDBACK_DB
)

print("‚úÖ All imports successful")


‚úÖ All imports successful


## Test Data Loading 

In [3]:
print("\n" + "="*60)
print("TEST 1: Data Loading")
print("="*60)

qa_db = load_qa_database("Q&A_db_practice.json")
print(f"‚úÖ Loaded {len(qa_db)} questions")
print(f"‚úÖ Sample: {qa_db[0]['question'][:50]}...")

assert len(qa_db) > 0, "Database is empty"
assert "question" in qa_db[0], "Missing 'question' key"
assert "answer" in qa_db[0], "Missing 'answer' key"
print("‚úÖ Data structure validated")



TEST 1: Data Loading
‚úÖ Loaded 150 questions
‚úÖ Sample: Activation Function...
‚úÖ Data structure validated


# ============================================================
# Test Question Selection
# ============================================================


In [4]:
print("\n" + "="*60)
print("TEST 2: Question Selection")
print("="*60)

# Test random selection
q1 = get_question(strategy="random", qa_db=qa_db)
print(f"‚úÖ Random selection:")
print(f"   ID: {q1['question_id']}")
print(f"   Question: {q1['question']}")
print(f"   Target length: {len(q1['target_answer'])} chars")

assert "question_id" in q1
assert "question" in q1
assert "target_answer" in q1
print("‚úÖ Question structure validated")

# Test multiple selections are different (probabilistic)
q2 = get_question(strategy="random", qa_db=qa_db)
print(f"\n‚úÖ Second random selection:")
print(f"   Question: {q2['question']}")



TEST 2: Question Selection
‚úÖ Random selection:
   ID: 342f813d-118d-4653-83cc-3391b3e09629
   Question: Generalization
   Target length: 286 chars
‚úÖ Question structure validated

‚úÖ Second random selection:
   Question: Label Encoding


# ============================================================
# CELL 4: Test ROUGE Metrics
# ============================================================

In [5]:

print("\n" + "="*60)
print("TEST 3: ROUGE Metrics")
print("="*60)

test_cases = [
    {
        "name": "Identical answers",
        "target": "Machine learning is a method of data analysis.",
        "answer": "Machine learning is a method of data analysis.",
        "expected_r1": "> 0.9"
    },
    {
        "name": "Partial overlap",
        "target": "Machine learning is a method of data analysis that automates model building.",
        "answer": "Machine learning automates model building.",
        "expected_r1": "0.4-0.7"
    },
    {
        "name": "No overlap",
        "target": "Machine learning is a method of data analysis.",
        "answer": "I don't know the answer.",
        "expected_r1": "~0.0"
    }
]

for case in test_cases:
    rouge = compute_rouge(case["target"], case["answer"])
    print(f"\n‚úÖ {case['name']}:")
    print(f"   ROUGE-1: {rouge['r1']:.3f}")
    print(f"   ROUGE-2: {rouge['r2']:.3f}")
    print(f"   ROUGE-L: {rouge['rl']:.3f}")
    print(f"   Expected: {case['expected_r1']}")



TEST 3: ROUGE Metrics

‚úÖ Identical answers:
   ROUGE-1: 1.000
   ROUGE-2: 1.000
   ROUGE-L: 1.000
   Expected: > 0.9

‚úÖ Partial overlap:
   ROUGE-1: 0.588
   ROUGE-2: 0.400
   ROUGE-L: 0.588
   Expected: 0.4-0.7

‚úÖ No overlap:
   ROUGE-1: 0.000
   ROUGE-2: 0.000
   ROUGE-L: 0.000
   Expected: ~0.0


# ============================================================
# CELL 5: Test LLM Evaluation
# ============================================================

In [6]:


print("\n" + "="*60)
print("TEST 4: LLM Evaluation")
print("="*60)

# Note: This requires OPENAI_API_KEY environment variable
api_key_status = "‚úÖ Found" if os.getenv("OPENAI_API_KEY") else "‚ùå Missing"
print(f"API Key status: {api_key_status}")

if os.getenv("OPENAI_API_KEY"):
    test_eval = evaluate_with_llm(
        question="What is overfitting?",
        target="Overfitting occurs when a model learns training data too well, including noise and outliers, reducing its ability to generalize to new data.",
        answer="Overfitting is when a model memorizes training data instead of learning patterns."
    )
    
    print(f"\n‚úÖ LLM Evaluation Result:")
    print(f"   Score: {test_eval['score_0_100']}/100")
    print(f"   Correctness: {test_eval['correctness']}")
    print(f"   Completeness: {test_eval['completeness']}")
    print(f"   Precision: {test_eval['precision']}")
    print(f"\n   Rationale:")
    for point in test_eval['rationale']:
        print(f"   ‚Ä¢ {point}")
    
    # Validate structure
    assert 0 <= test_eval['score_0_100'] <= 100
    assert isinstance(test_eval['rationale'], list)
    print("\n‚úÖ LLM evaluation structure validated")
else:
    print("‚ö†Ô∏è Skipping LLM test - set OPENAI_API_KEY to test")



TEST 4: LLM Evaluation
API Key status: ‚úÖ Found

‚úÖ LLM Evaluation Result:
   Score: 50/100
   Correctness: Evaluation failed
   Completeness: System error
   Precision: Could not process

   Rationale:
   ‚Ä¢ Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

‚úÖ LLM evaluation structure validated


# ============================================================
# CELL 6: Test Full Evaluation Pipeline
# ============================================================

In [7]:
print("\n" + "="*60)
print("TEST 5: Full Evaluation Pipeline")
print("="*60)

q = get_question(strategy="random", qa_db=qa_db)
print(f"Question: {q['question']}")

# Test with different answer qualities
answer_scenarios = [
    ("Excellent", q['target_answer']),  # Perfect answer
    ("Good", q['target_answer'][:200]),  # Partial but on-topic
    ("Poor", "I don't know"),  # Minimal
    ("Novice", generate_novice_answer(q['question'], q['target_answer']))
]

results = []
for label, answer in answer_scenarios:
    result = evaluate_answer(
        question=q['question'],
        target=q['target_answer'],
        answer=answer,
        question_id=q['question_id']
    )
    results.append((label, result))
    print(f"\n‚úÖ {label} Answer:")
    print(f"   Answer: {answer[:80]}...")
    print(f"   Final Score: {result['final_score_0_100']}/100")
    print(f"   LLM Score: {result['model_judgment']['score_0_100']}/100")
    print(f"   ROUGE avg: {sum(result['rouge'].values())/3:.3f}")

if os.getenv("OPENAI_API_KEY"):
    excellent_score = results[0][1]['final_score_0_100']
    poor_score = results[2][1]['final_score_0_100']
    print(f"\n‚úÖ Score validation:")
    print(f"   Excellent: {excellent_score}, Poor: {poor_score}")
    assert excellent_score > poor_score, "Scoring logic may be incorrect"
    print("‚úÖ Scoring logic validated")



TEST 5: Full Evaluation Pipeline
Question: Convolutional Neural Network (CNN)

‚úÖ Excellent Answer:
   Answer: A Convolutional Neural Network (CNN) is a feedforward artificial neural network ...
   Final Score: 65/100
   LLM Score: 50/100
   ROUGE avg: 1.000

‚úÖ Good Answer:
   Answer: A Convolutional Neural Network (CNN) is a feedforward artificial neural network ...
   Final Score: 53/100
   LLM Score: 50/100
   ROUGE avg: 0.620

‚úÖ Poor Answer:
   Answer: I don't know...
   Final Score: 35/100
   LLM Score: 50/100
   ROUGE avg: 0.000

‚úÖ Novice Answer:
   Answer: It's related to neural network architecture that in...
   Final Score: 38/100
   LLM Score: 50/100
   ROUGE avg: 0.130

‚úÖ Score validation:
   Excellent: 65, Poor: 35
‚úÖ Scoring logic validated


# ============================================================
# CELL 8: Test Feedback Recording
# ============================================================

In [8]:
# ============================================================
# CELL 8: Test Feedback Recording
# ============================================================

print("\n" + "="*60)
print("TEST 7: Feedback Recording")
print("="*60)

FEEDBACK_DB.clear()

feedback1 = record_feedback(
    eval_id="test-eval-001",
    labels=["useful", "clear"],
    comment="Very helpful explanation!"
)

feedback2 = record_feedback(
    eval_id="test-eval-002",
    labels=["rigorous"],
    comment="The evaluation was too strict."
)

feedback3 = record_feedback(
    eval_id="test-eval-003",
    labels=["relevant"],
    comment=None
)

print(f"‚úÖ Recorded {len(FEEDBACK_DB)} feedback entries")

for i, fb in enumerate(FEEDBACK_DB):
    print(f"\n  Feedback {i+1}:")
    print(f"    ID: {fb['feedback_id']}")
    print(f"    Labels: {fb['labels']}")
    print(f"    Comment: {fb['comment']}")
    print(f"    Sentiment: {fb['sentiment_analysis']['sentiment']}")
    print(f"    Reasoning: {fb['sentiment_analysis']['reasoning']}")

assert len(FEEDBACK_DB) == 3
assert all('sentiment_analysis' in fb for fb in FEEDBACK_DB)
print("\n‚úÖ Feedback recording validated")



TEST 7: Feedback Recording
‚úÖ Recorded 3 feedback entries

  Feedback 1:
    ID: 7a97b791-dc1c-4def-9f4d-72c466ed8e4c
    Labels: ['useful', 'clear']
    Comment: Very helpful explanation!
    Sentiment: neutral
    Reasoning: Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

  Feedback 2:
    ID: f060d600-6629-4cd3-82fd-355fa49110b1
    Labels: ['rigorous']
    Comment: The evaluation was too strict.
    Sentiment: neutral
    Reasoning: Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None,

# ============================================================
# CELL 9: Test Debug Helper
# ============================================================

In [9]:
print("\n" + "="*60)
print("TEST 8: Novice Answer Generation")
print("="*60)

for i in range(3):
    q = get_question(strategy="random", qa_db=qa_db)
    novice = generate_novice_answer(q['question'], q['target_answer'])
    print(f"\n‚úÖ Generation {i+1}:")
    print(f"   Question: {q['question']}")
    print(f"   Novice: {novice}")
    
    assert len(novice) < len(q['target_answer'])

print("\n‚úÖ Novice generation validated")


TEST 8: Novice Answer Generation

‚úÖ Generation 1:
   Question: Manhattan Distance
   Novice: It's related to two vectors by summing the

‚úÖ Generation 2:
   Question: Logistic Regression
   Novice: I'm not sure, but it relates to the concept.

‚úÖ Generation 3:
   Question: Reinforcement Learning
   Novice: I'm not sure, but it relates to the concept.

‚úÖ Novice generation validated


# ============================================================
# CELL 10: Integration Test - Full Loop
# ============================================================

In [10]:
print("\n" + "="*60)
print("TEST 9: Full Integration Loop")
print("="*60)

FEEDBACK_DB.clear()

for cycle in range(3):
    print(f"\n{'='*60}")
    print(f"CYCLE {cycle + 1}")
    print(f"{'='*60}")
    
    q = get_question(strategy="random", qa_db=qa_db)
    print(f"\n1Ô∏è‚É£ Question: {q['question']}")
    
    answer = generate_novice_answer(q['question'], q['target_answer'])
    print(f"\n2Ô∏è‚É£ User Answer: {answer}")
    
    result = evaluate_answer(
        question=q['question'],
        target=q['target_answer'],
        answer=answer,
        question_id=q['question_id']
    )
    print(f"\n3Ô∏è‚É£ Evaluation:")
    print(f"   Final Score: {result['final_score_0_100']}/100")
    print(f"   Key Point: {result['model_judgment']['rationale'][0]}")
    
    feedback_labels = ["useful", "clear"] if result['final_score_0_100'] > 60 else ["unclear"]
    feedback_comment = "Good feedback!" if result['final_score_0_100'] > 60 else "Could be better."
    
    feedback = record_feedback(
        eval_id=result['eval_id'],
        labels=feedback_labels,
        comment=feedback_comment
    )
    print(f"\n4Ô∏è‚É£ Feedback:")
    print(f"   Labels: {feedback['labels']}")
    print(f"   Sentiment: {feedback['sentiment_analysis']['sentiment']}")

print(f"\n{'='*60}")
print("‚úÖ Integration test complete!")
print(f"‚úÖ Total feedback collected: {len(FEEDBACK_DB)}")
print(f"{'='*60}")


TEST 9: Full Integration Loop

CYCLE 1

1Ô∏è‚É£ Question: Ensembles (Stacking).

2Ô∏è‚É£ User Answer: I'm not sure, but it relates to the concept.

3Ô∏è‚É£ Evaluation:
   Final Score: 36/100
   Key Point: Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

4Ô∏è‚É£ Feedback:
   Labels: ['unclear']
   Sentiment: neutral

CYCLE 2

1Ô∏è‚É£ Question: Dropout

2Ô∏è‚É£ User Answer: I believe dropout is a regularization technique that randomly deactivates a subset of a neural network‚Äôs units

3Ô∏è‚É£ Evaluation:
   Final Score: 52/100
   Key Point: Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https:/

# ============================================================
# CELL 11: Summary Report
# ============================================================


In [11]:
print("\n" + "="*60)
print("TEST SUMMARY REPORT")
print("="*60)

tests = [
    "Data Loading",
    "Question Selection",
    "ROUGE Metrics",
    "LLM Evaluation",
    "Full Evaluation Pipeline",
    "Sentiment Analysis (LLM)",
    "Feedback Recording",
    "Novice Answer Generation",
    "Integration Loop"
]

print("\n‚úÖ All tests passed:")
for i, test in enumerate(tests, 1):
    print(f"  {i}. {test}")

print(f"\n{'='*60}")
print("üéâ MODEL TESTING COMPLETE")
print(f"{'='*60}")


TEST SUMMARY REPORT

‚úÖ All tests passed:
  1. Data Loading
  2. Question Selection
  3. ROUGE Metrics
  4. LLM Evaluation
  5. Full Evaluation Pipeline
  6. Sentiment Analysis (LLM)
  7. Feedback Recording
  8. Novice Answer Generation
  9. Integration Loop

üéâ MODEL TESTING COMPLETE
