# Q&A Evaluator - Core Implementation
Assignment 11.02 - LLM Applications


## Required Libraries
Run this cell first to install dependencies:

In [1]:
# !pip install rouge-score openai python-dotenv

import json
import uuid
import os
import random
from datetime import datetime
from typing import Optional

# ROUGE metrics
try:
    from rouge_score import rouge_scorer
    ROUGE_AVAILABLE = True
except ImportError:
    ROUGE_AVAILABLE = False
    print("‚ö†Ô∏è rouge_score not installed. Run: pip install rouge-score")

# LLM client (OpenAI example - adjust for your provider)
try:
    from openai import OpenAI
    LLM_AVAILABLE = True
except ImportError:
    LLM_AVAILABLE = False
    print("‚ö†Ô∏è openai not installed. Run: pip install openai")

print("‚úÖ Imports loaded successfully")



‚úÖ Imports loaded successfully


# ============================================================================
# 1. LOAD Q&A DATABASE
# ============================================================================


In [2]:
def load_qa_database(filepath: str = "Q&A_db_practice.json") -> list[dict]:
    """Load the question-answer database from JSON."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
    
# Test loading
qa_db = load_qa_database()
print(f"‚úÖ Loaded {len(qa_db)} questions")
print(f"Sample question: {qa_db[0]['question']}")

‚úÖ Loaded 150 questions
Sample question: Activation Function


# ============================================================================
# 2. QUESTION SELECTION
# ============================================================================

In [3]:

def get_question(strategy: str = "random", qa_db: Optional[list] = None) -> dict:
    """
    Select a question from the repository.
    
    Args:
        strategy: Selection method ("random" or "sequential")
        qa_db: Pre-loaded Q&A database (loads if None)
    
    Returns:
        {
            "question_id": "uuid",
            "question": "text",
            "target_answer": "text"
        }
    """
    if qa_db is None:
        qa_db = load_qa_database()
    
    if strategy == "random":
        selected = random.choice(qa_db)
    elif strategy == "sequential":
        selected = qa_db[0]  # Simple version - extend with state tracking
    else:
        raise ValueError(f"Unknown strategy: {strategy}")
    
    return {
        "question_id": str(uuid.uuid4()),
        "question": selected["question"],
        "target_answer": selected["answer"]
    }

# Test
test_q = get_question(qa_db=qa_db)
print(f"‚úÖ Question ID: {test_q['question_id']}")
print(f"‚úÖ Question: {test_q['question']}")


‚úÖ Question ID: f15a3c56-e3e6-4c5b-861e-f7cf3a2d25c3
‚úÖ Question: Recurrent Neural Network (RNN)


In [4]:
# ============================================================================
# 3. ROUGE METRICS
# ============================================================================

In [5]:
def compute_rouge(target: str, answer: str) -> dict:
    """
    Compute ROUGE-1, ROUGE-2, ROUGE-L F1 scores.
    
    Args:
        target: Reference answer
        answer: User's answer
    
    Returns:
        {"r1": float, "r2": float, "rl": float}
    """
    if not ROUGE_AVAILABLE:
        return {"r1": 0.0, "r2": 0.0, "rl": 0.0}
    
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], 
                                       use_stemmer=True)
    scores = scorer.score(target, answer)
    
    return {
        "r1": round(scores['rouge1'].fmeasure, 3),
        "r2": round(scores['rouge2'].fmeasure, 3),
        "rl": round(scores['rougeL'].fmeasure, 3)
    }

# Test
test_target = "Machine learning is a method of data analysis that automates analytical model building."
test_answer = "Machine learning automates model building using data analysis techniques."
rouge_result = compute_rouge(test_target, test_answer)
print(f"‚úÖ ROUGE scores: {rouge_result}")

‚úÖ ROUGE scores: {'r1': 0.636, 'r2': 0.3, 'rl': 0.455}


# ============================================================================
# 4. LLM-BASED EVALUATION
# ============================================================================

In [6]:
EVALUATION_PROMPT = """You are an expert AI/ML educator evaluating student answers.

**Question:** {question}

**Target Answer:** {target}

**Student Answer:** {answer}

Evaluate the student's answer on three dimensions:
1. **Correctness**: Are the core concepts accurate?
2. **Completeness**: Does it cover key aspects of the target?
3. **Precision**: Is the terminology and explanation clear?

Respond ONLY with valid JSON (no markdown, no extra text):

{{
  "score_0_100": <integer 0-100>,
  "correctness": "<1-2 sentence assessment>",
  "completeness": "<1-2 sentence assessment>",
  "precision": "<1-2 sentence assessment>",
  "rationale": ["<point 1>", "<point 2>", "<point 3>"]
}}

Scoring guide:
- 90-100: Excellent (accurate, comprehensive, precise)
- 70-89: Good (mostly correct, minor gaps)
- 50-69: Partial (some understanding, significant gaps)
- 0-49: Poor (fundamental errors or missing concepts)

Remember: Return ONLY the JSON object, nothing else."""

print("‚úÖ Prompt template defined")

‚úÖ Prompt template defined



# ============================================================================
# 5. MAIN EVALUATION FUNCTION
# ============================================================================


In [7]:
def evaluate_with_llm(question: str, target: str, answer: str) -> dict:
    """
    Use LLM to evaluate answer quality.
    
    Args:
        question: The question text
        target: Target answer
        answer: User's answer
    
    Returns:
        {
            "score_0_100": int,
            "correctness": str,
            "completeness": str,
            "precision": str,
            "rationale": list[str]
        }
    """
    if not LLM_AVAILABLE:
        return {
            "score_0_100": 50,
            "correctness": "LLM not available",
            "completeness": "Cannot assess without LLM",
            "precision": "Fallback mode",
            "rationale": ["LLM client not configured", "Set OPENAI_API_KEY environment variable"]
        }
    
    # Get API key from environment
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return {
            "score_0_100": 50,
            "correctness": "API key missing",
            "completeness": "Set OPENAI_API_KEY",
            "precision": "Cannot evaluate",
            "rationale": ["Set environment variable: OPENAI_API_KEY"]
        }
    
    client = OpenAI(api_key=api_key)
    
    prompt = EVALUATION_PROMPT.format(
        question=question,
        target=target,
        answer=answer
    )
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a precise evaluator. Return only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=500
        )
        
        result_text = response.choices[0].message.content.strip()
        
        # Clean potential markdown
        if result_text.startswith("```"):
            result_text = result_text.split("```")[1]
            if result_text.startswith("json"):
                result_text = result_text[4:]
            result_text = result_text.rsplit("```", 1)[0]
        
        evaluation = json.loads(result_text)
        
        # Validate required keys
        required = {"score_0_100", "correctness", "completeness", "precision", "rationale"}
        if not required.issubset(evaluation.keys()):
            raise ValueError("Missing required keys in LLM response")
        
        return evaluation
        
    except Exception as e:
        print(f"‚ùå LLM evaluation error: {e}")
        return {
            "score_0_100": 50,
            "correctness": "Evaluation failed",
            "completeness": "System error",
            "precision": "Could not process",
            "rationale": [f"Error: {str(e)}"]
        }

# Test (will use fallback if no API key)
test_eval = evaluate_with_llm(
    question="What is overfitting?",
    target="Overfitting occurs when a model learns the training data too well, including noise, reducing generalization.",
    answer="It's when the model memorizes training data."
)
print(f"‚úÖ LLM evaluation: {test_eval['score_0_100']}/100")

‚ùå LLM evaluation error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
‚úÖ LLM evaluation: 50/100


# ============================================================
# 6.  Main Evaluation Function
# ============================================================

In [8]:
def evaluate_answer(
    question: str, 
    target: str, 
    answer: str, 
    *, 
    rouge: bool = True,
    question_id: Optional[str] = None
) -> dict:
    """
    Comprehensive answer evaluation combining LLM judgment and ROUGE metrics.
    
    Args:
        question: Question text
        target: Reference answer
        answer: User's submitted answer
        rouge: Whether to compute ROUGE scores
        question_id: Optional question identifier
    
    Returns:
        {
            "eval_id": "uuid",
            "question_id": "string",
            "model_judgment": {...},
            "rouge": {"r1": float, "r2": float, "rl": float},
            "final_score_0_100": int,
            "timestamp": "iso-datetime"
        }
    """
    eval_id = str(uuid.uuid4())
    
    # 1. LLM evaluation
    llm_judgment = evaluate_with_llm(question, target, answer)
    
    # 2. ROUGE metrics
    rouge_scores = compute_rouge(target, answer) if rouge else {"r1": 0.0, "r2": 0.0, "rl": 0.0}
    
    # 3. Combined score (70% LLM, 30% ROUGE)
    rouge_avg = (rouge_scores["r1"] + rouge_scores["r2"] + rouge_scores["rl"]) / 3
    final_score = int(0.7 * llm_judgment["score_0_100"] + 0.3 * rouge_avg * 100)
    
    return {
        "eval_id": eval_id,
        "question_id": question_id or "unknown",
        "model_judgment": llm_judgment,
        "rouge": rouge_scores,
        "final_score_0_100": final_score,
        "timestamp": datetime.now().isoformat()
    }

# Test
test_result = evaluate_answer(
    question=test_q["question"],
    target=test_q["target_answer"],
    answer="It is a technique used in machine learning.",
    question_id=test_q["question_id"]
)
print(f"‚úÖ Evaluation complete: {test_result['final_score_0_100']}/100")


‚ùå LLM evaluation error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
‚úÖ Evaluation complete: 36/100


# ============================================================
# CELL 8: Sentiment Analysis via LLM
# ============================================================

In [9]:
SENTIMENT_PROMPT = """Analyze the sentiment of this user feedback comment.

**Comment:** {comment}

Classify the sentiment as one of: positive, negative, or neutral.

Respond ONLY with valid JSON (no markdown):

{{
  "sentiment": "<positive|negative|neutral>",
  "confidence": <float 0.0-1.0>,
  "reasoning": "<1 sentence explanation>"
}}"""

def analyze_sentiment_llm(comment: Optional[str]) -> dict:
    """
    Analyze sentiment of user comment using LLM.
    
    Args:
        comment: User's feedback text
    
    Returns:
        {
            "sentiment": "positive|negative|neutral",
            "confidence": float,
            "reasoning": str
        }
    """
    if not comment:
        return {
            "sentiment": "neutral",
            "confidence": 1.0,
            "reasoning": "No comment provided"
        }
    
    if not LLM_AVAILABLE:
        return {
            "sentiment": "neutral",
            "confidence": 0.0,
            "reasoning": "LLM not available for sentiment analysis"
        }
    
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return {
            "sentiment": "neutral",
            "confidence": 0.0,
            "reasoning": "API key not configured"
        }
    
    client = OpenAI(api_key=api_key)
    
    prompt = SENTIMENT_PROMPT.format(comment=comment)
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a sentiment analysis expert. Return only JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=150
        )
        
        result_text = response.choices[0].message.content.strip()
        
        # Clean markdown
        if result_text.startswith("```"):
            result_text = result_text.split("```")[1]
            if result_text.startswith("json"):
                result_text = result_text[4:]
            result_text = result_text.rsplit("```", 1)[0]
        
        sentiment = json.loads(result_text)
        
        # Validate
        if "sentiment" not in sentiment:
            raise ValueError("Missing sentiment field")
        
        return sentiment
        
    except Exception as e:
        print(f"‚ùå Sentiment analysis error: {e}")
        return {
            "sentiment": "neutral",
            "confidence": 0.0,
            "reasoning": f"Error: {str(e)}"
        }

# Test
test_sentiment = analyze_sentiment_llm("This evaluation was very helpful and clear!")
print(f"‚úÖ Sentiment: {test_sentiment['sentiment']} (confidence: {test_sentiment.get('confidence', 0)})")

‚ùå Sentiment analysis error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
‚úÖ Sentiment: neutral (confidence: 0.0)


# ============================================================
# 9. Feedback Recording Function
# ============================================================

In [10]:
FEEDBACK_DB = []  # In-memory storage

def record_feedback(
    eval_id: str, 
    labels: list[str], 
    comment: Optional[str] = None
) -> dict:
    """
    Record user feedback on an evaluation.
    
    Args:
        eval_id: Evaluation identifier
        labels: Selected feedback labels
        comment: Optional free-text feedback
    
    Returns:
        {
            "feedback_id": "uuid",
            "eval_id": str,
            "labels": list[str],
            "comment": str,
            "sentiment_analysis": dict,
            "timestamp": str
        }
    """
    feedback_id = str(uuid.uuid4())
    
    # Use LLM for sentiment analysis
    sentiment = analyze_sentiment_llm(comment)
    
    feedback_entry = {
        "feedback_id": feedback_id,
        "eval_id": eval_id,
        "labels": labels,
        "comment": comment,
        "sentiment_analysis": sentiment,
        "timestamp": datetime.now().isoformat()
    }
    
    FEEDBACK_DB.append(feedback_entry)
    
    return feedback_entry

# Test
test_feedback = record_feedback(
    eval_id=test_result["eval_id"],
    labels=["useful", "clear"],
    comment="Very helpful explanation of my mistakes!"
)
print(f"‚úÖ Feedback recorded: {test_feedback['sentiment_analysis']['sentiment']}")
print(f"   Reasoning: {test_feedback['sentiment_analysis']['reasoning']}")

‚ùå Sentiment analysis error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
‚úÖ Feedback recorded: neutral
   Reasoning: Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}


# ============================================================
# 10. Debug Helper: Generate Novice Answer
# ============================================================

In [11]:
def generate_novice_answer(question: str, target: str) -> str:
    """
    Generate a simplified/incomplete answer for testing.
    
    Args:
        question: The question
        target: Target answer
    
    Returns:
        Simulated novice answer
    """
    # Extract first sentence or first 100 chars
    first_part = target.split('.')[0] if '.' in target else target[:100]
    
    templates = [
        f"{first_part}... I think.",
        f"I believe {first_part.lower()}",
        f"It's related to {' '.join(first_part.split()[-5:])}",
        "I'm not completely sure, but it has something to do with the concept mentioned."
    ]
    
    return random.choice(templates)

# Test
novice = generate_novice_answer(test_q["question"], test_q["target_answer"])
print(f"‚úÖ Generated novice answer: {novice}")

‚úÖ Generated novice answer: A recurrent neural network (RNN) is a type of artificial neural network that processes sequential da... I think.


# ============================================================
# 11. Full Pipeline Test
# ============================================================

In [12]:
print("\n" + "="*60)
print("FULL PIPELINE TEST")
print("="*60)

# 1. Get question
q = get_question(strategy="random", qa_db=qa_db)
print(f"\nüìù Question: {q['question']}")

# 2. Simulate answer (debug mode)
user_answer = generate_novice_answer(q['question'], q['target_answer'])
print(f"\nüí≠ Simulated Answer: {user_answer}")

# 3. Evaluate
print("\n‚è≥ Evaluating...")
result = evaluate_answer(
    question=q['question'],
    target=q['target_answer'],
    answer=user_answer,
    question_id=q['question_id']
)

print(f"\nüìä Results:")
print(f"  Final Score: {result['final_score_0_100']}/100")
print(f"  LLM Score: {result['model_judgment']['score_0_100']}/100")
print(f"  ROUGE-1: {result['rouge']['r1']}")
print(f"  ROUGE-2: {result['rouge']['r2']}")
print(f"  ROUGE-L: {result['rouge']['rl']}")
print(f"\nüí° Rationale:")
for point in result['model_judgment']['rationale']:
    print(f"  ‚Ä¢ {point}")

# 4. Collect feedback
feedback = record_feedback(
    eval_id=result['eval_id'],
    labels=["useful", "rigorous"],
    comment="The evaluation helped me understand where I went wrong."
)

print(f"\n‚úÖ Feedback:")
print(f"  Sentiment: {feedback['sentiment_analysis']['sentiment']}")
print(f"  Reasoning: {feedback['sentiment_analysis']['reasoning']}")

print("\n" + "="*60)



FULL PIPELINE TEST

üìù Question: Data Science

üí≠ Simulated Answer: I believe data science is an interdisciplinary practice that extracts knowledge and actionable insights from s

‚è≥ Evaluating...
‚ùå LLM evaluation error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

üìä Results:
  Final Score: 50/100
  LLM Score: 50/100
  ROUGE-1: 0.52
  ROUGE-2: 0.5
  ROUGE-L: 0.52

üí° Rationale:
  ‚Ä¢ Error: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
‚ùå Sentiment a

# ============================================================
# 12. Export to model_app.py
# ============================================================

In [13]:
%%writefile model_app.py
"""
Q&A Evaluator Core Logic - Production Module
Assignment 11.02 - LLM Applications

This module provides core functions for:
- Question selection from repository
- Answer evaluation using LLM + ROUGE metrics
- Feedback collection with sentiment analysis
"""

import json
import uuid
import os
import random
from datetime import datetime
from typing import Optional

# ROUGE metrics
try:
    from rouge_score import rouge_scorer
    ROUGE_AVAILABLE = True
except ImportError:
    ROUGE_AVAILABLE = False

# LLM client
try:
    from openai import OpenAI
    LLM_AVAILABLE = True
except ImportError:
    LLM_AVAILABLE = False


# ============================================================
# DATA LOADING
# ============================================================

def load_qa_database(filepath: str = "Q&A_db_practice.json") -> list[dict]:
    """Load the question-answer database from JSON."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


# ============================================================
# QUESTION SELECTION
# ============================================================

def get_question(strategy: str = "random", qa_db: Optional[list] = None) -> dict:
    """
    Select a question from the repository.
    
    Args:
        strategy: Selection method ("random" or "sequential")
        qa_db: Pre-loaded Q&A database
    
    Returns:
        dict with question_id, question, target_answer
    """
    if qa_db is None:
        qa_db = load_qa_database()
    
    if strategy == "random":
        selected = random.choice(qa_db)
    elif strategy == "sequential":
        selected = qa_db[0]
    else:
        raise ValueError(f"Unknown strategy: {strategy}")
    
    return {
        "question_id": str(uuid.uuid4()),
        "question": selected["question"],
        "target_answer": selected["answer"]
    }


# ============================================================
# ROUGE METRICS
# ============================================================

def compute_rouge(target: str, answer: str) -> dict:
    """
    Compute ROUGE-1, ROUGE-2, ROUGE-L F1 scores.
    
    Returns:
        dict with r1, r2, rl keys
    """
    if not ROUGE_AVAILABLE:
        return {"r1": 0.0, "r2": 0.0, "rl": 0.0}
    
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], 
                                       use_stemmer=True)
    scores = scorer.score(target, answer)
    
    return {
        "r1": round(scores['rouge1'].fmeasure, 3),
        "r2": round(scores['rouge2'].fmeasure, 3),
        "rl": round(scores['rougeL'].fmeasure, 3)
    }


# ============================================================
# LLM EVALUATION
# ============================================================

EVALUATION_PROMPT = """You are an expert AI/ML educator evaluating student answers.

**Question:** {question}

**Target Answer:** {target}

**Student Answer:** {answer}

Evaluate the student's answer on three dimensions:
1. **Correctness**: Are the core concepts accurate?
2. **Completeness**: Does it cover key aspects of the target?
3. **Precision**: Is the terminology and explanation clear?

Respond ONLY with valid JSON (no markdown, no extra text):

{{
  "score_0_100": <integer 0-100>,
  "correctness": "<1-2 sentence assessment>",
  "completeness": "<1-2 sentence assessment>",
  "precision": "<1-2 sentence assessment>",
  "rationale": ["<point 1>", "<point 2>", "<point 3>"]
}}

Scoring guide:
- 90-100: Excellent (accurate, comprehensive, precise)
- 70-89: Good (mostly correct, minor gaps)
- 50-69: Partial (some understanding, significant gaps)
- 0-49: Poor (fundamental errors or missing concepts)

Remember: Return ONLY the JSON object, nothing else."""


def evaluate_with_llm(question: str, target: str, answer: str) -> dict:
    """Use LLM to evaluate answer quality."""
    if not LLM_AVAILABLE:
        return {
            "score_0_100": 50,
            "correctness": "LLM not available",
            "completeness": "Cannot assess",
            "precision": "Fallback mode",
            "rationale": ["LLM client not configured"]
        }
    
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return {
            "score_0_100": 50,
            "correctness": "API key missing",
            "completeness": "Set OPENAI_API_KEY",
            "precision": "Cannot evaluate",
            "rationale": ["Environment variable OPENAI_API_KEY required"]
        }
    
    client = OpenAI(api_key=api_key)
    prompt = EVALUATION_PROMPT.format(question=question, target=target, answer=answer)
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a precise evaluator. Return only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=500
        )
        
        result_text = response.choices[0].message.content.strip()
        
        # Clean markdown if present
        if result_text.startswith("```"):
            result_text = result_text.split("```")[1]
            if result_text.startswith("json"):
                result_text = result_text[4:]
            result_text = result_text.rsplit("```", 1)[0]
        
        evaluation = json.loads(result_text)
        
        required = {"score_0_100", "correctness", "completeness", "precision", "rationale"}
        if not required.issubset(evaluation.keys()):
            raise ValueError("Missing required keys")
        
        return evaluation
        
    except Exception as e:
        return {
            "score_0_100": 50,
            "correctness": "Evaluation failed",
            "completeness": "System error",
            "precision": "Could not process",
            "rationale": [f"Error: {str(e)}"]
        }


# ============================================================
# MAIN EVALUATION
# ============================================================

def evaluate_answer(
    question: str, 
    target: str, 
    answer: str, 
    *, 
    rouge: bool = True,
    question_id: Optional[str] = None
) -> dict:
    """
    Comprehensive answer evaluation.
    
    Returns:
        dict with eval_id, question_id, model_judgment, rouge, final_score_0_100, timestamp
    """
    eval_id = str(uuid.uuid4())
    
    llm_judgment = evaluate_with_llm(question, target, answer)
    rouge_scores = compute_rouge(target, answer) if rouge else {"r1": 0.0, "r2": 0.0, "rl": 0.0}
    
    rouge_avg = (rouge_scores["r1"] + rouge_scores["r2"] + rouge_scores["rl"]) / 3
    final_score = int(0.7 * llm_judgment["score_0_100"] + 0.3 * rouge_avg * 100)
    
    return {
        "eval_id": eval_id,
        "question_id": question_id or "unknown",
        "model_judgment": llm_judgment,
        "rouge": rouge_scores,
        "final_score_0_100": final_score,
        "timestamp": datetime.now().isoformat()
    }


# ============================================================
# SENTIMENT ANALYSIS
# ============================================================

SENTIMENT_PROMPT = """Analyze the sentiment of this user feedback comment.

**Comment:** {comment}

Classify the sentiment as one of: positive, negative, or neutral.

Respond ONLY with valid JSON (no markdown):

{{
  "sentiment": "<positive|negative|neutral>",
  "confidence": <float 0.0-1.0>,
  "reasoning": "<1 sentence explanation>"
}}"""


def analyze_sentiment_llm(comment: Optional[str]) -> dict:
    """Analyze sentiment using LLM."""
    if not comment:
        return {
            "sentiment": "neutral",
            "confidence": 1.0,
            "reasoning": "No comment provided"
        }
    
    if not LLM_AVAILABLE:
        return {
            "sentiment": "neutral",
            "confidence": 0.0,
            "reasoning": "LLM not available"
        }
    
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return {
            "sentiment": "neutral",
            "confidence": 0.0,
            "reasoning": "API key not configured"
        }
    
    client = OpenAI(api_key=api_key)
    prompt = SENTIMENT_PROMPT.format(comment=comment)
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a sentiment analysis expert. Return only JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=150
        )
        
        result_text = response.choices[0].message.content.strip()
        
        if result_text.startswith("```"):
            result_text = result_text.split("```")[1]
            if result_text.startswith("json"):
                result_text = result_text[4:]
            result_text = result_text.rsplit("```", 1)[0]
        
        sentiment = json.loads(result_text)
        
        if "sentiment" not in sentiment:
            raise ValueError("Missing sentiment field")
        
        return sentiment
        
    except Exception as e:
        return {
            "sentiment": "neutral",
            "confidence": 0.0,
            "reasoning": f"Error: {str(e)}"
        }


# ============================================================
# FEEDBACK RECORDING
# ============================================================

FEEDBACK_DB = []


def record_feedback(
    eval_id: str, 
    labels: list[str], 
    comment: Optional[str] = None
) -> dict:
    """
    Record user feedback with LLM sentiment analysis.
    
    Returns:
        dict with feedback_id, eval_id, labels, comment, sentiment_analysis, timestamp
    """
    feedback_id = str(uuid.uuid4())
    sentiment = analyze_sentiment_llm(comment)
    
    feedback_entry = {
        "feedback_id": feedback_id,
        "eval_id": eval_id,
        "labels": labels,
        "comment": comment,
        "sentiment_analysis": sentiment,
        "timestamp": datetime.now().isoformat()
    }
    
    FEEDBACK_DB.append(feedback_entry)
    return feedback_entry

# ============================================================
# DEBUG UTILITIES
# ============================================================

def generate_novice_answer(question: str, target: str) -> str:
    """Generate a simplified answer for testing."""
    first_part = target.split('.')[0] if '.' in target else target[:100]
    templates = [
        f"{first_part}... I think.",
        f"I believe {first_part.lower()}",
        f"It's related to {' '.join(first_part.split()[-5:])}",
        "I'm not sure, but it relates to the concept."
    ]
    return random.choice(templates)

Overwriting model_app.py


In [14]:
import os
if os.path.exists("model_app.py"):
    print("‚úÖ model_app.py created successfully!")
    with open("model_app.py", "r") as f:
        lines = f.readlines()
        print(f"‚úÖ File contains {len(lines)} lines")
else:
    print("‚ùå model_app.py was NOT created")

‚úÖ model_app.py created successfully!
‚úÖ File contains 360 lines
