# Capability Score Evaluation

Compute capability scores for each model/method using LLM-as-a-Judge.
This is a sanity check: it reports individual scores only and does not compute gap closure.

In [None]:
"""
Capability Score Evaluation using LLM-as-Judge
"""

import json
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# OpenAI installs
%pip install openai

import openai
from openai import OpenAI
import time
from tqdm import tqdm

In [None]:
# API key: read from the OPENAI_API_KEY environment variable so the secret is
# never stored in (or committed with) the notebook.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Initialize OpenAI client.
# An empty key is passed as None so the SDK applies its own key lookup and
# missing-key handling instead of sending requests with a blank credential.
client = OpenAI(api_key=OPENAI_API_KEY or None)

## Configuration Toggles

Toggle which inference runs to evaluate. Set to `True` to include in evaluation.

In [None]:
# ============================================================================
# TOGGLES - Select which inference runs to evaluate
# ============================================================================
# Each flag guards one evaluation cell in the "Run Evaluations" section; a run
# whose flag is False is skipped and adds no entry to `results`.

# Base model inference (scored with both the kidmode and the short judge)
EVAL_BASE_MODEL = True

# Fine-tuning training data (the target distribution)
EVAL_TRAINING_DATA_KIDMODE = True
EVAL_TRAINING_DATA_SHORT = True

# Neologism inference
EVAL_NEOLOGISM_KIDMODE = True
EVAL_NEOLOGISM_SHORT = True
EVAL_NEOLOGISM_COMBINED = True  # Combined/composition inference (scored with kidmode, short and combined judges)

# Fine-tuning inference (LoRA)
EVAL_FINETUNING_KIDMODE = True
EVAL_FINETUNING_SHORT = False  # Set to True when available (FINETUNING_SHORT_FILE must be filled in first)

# Prompting with synthetic data prompts
EVAL_PROMPTING_KIDMODE = True
EVAL_PROMPTING_SHORT = True

In [None]:
# ============================================================================
# FILE PATHS
# ============================================================================
# All inputs are JSONL files (one JSON object per line) read by
# score_inference_file / score_training_data.
# NOTE(review): the paths are relative, presumably to the notebook's working
# directory -- confirm before running from another location.

# Base model inference
BASE_MODEL_FILE = "../inference/base/base_mistral_inference_results.jsonl"

# Training data files (fine-tuning format)
TRAINING_DATA_KIDMODE_FILE = "../data-prep/kidmode/kidmode_ft.jsonl"
TRAINING_DATA_SHORT_FILE = "../data-prep/short/short_ft.jsonl"

# Neologism inference results
NEOLOGISM_KIDMODE_FILE = "../inference/neologism/kidmode/mistral_with_kidmode_inference_results.jsonl"
NEOLOGISM_SHORT_FILE = "../inference/neologism/short/mistral_with_short_inference_results.jsonl"
NEOLOGISM_COMBINED_FILE = "../inference/composition_inference_results.jsonl"

# Fine-tuning inference results
FINETUNING_KIDMODE_FILE = "../inference/fine-tuning/kidmode_rank8_inference_results.jsonl"
# Empty placeholder: opening "" fails, so keep EVAL_FINETUNING_SHORT False
# until a real path is set here.
FINETUNING_SHORT_FILE = ""  # Update when available

# Prompting inference results
PROMPTING_KIDMODE_FILE = "../inference/prompting/prompting_kidmode_inference.jsonl"
PROMPTING_SHORT_FILE = "../inference/prompting/prompting_short_inference.jsonl"

In [None]:
# ============================================================================
# SAMPLE SIZE CONFIGURATION
# ============================================================================

# Number of examples to score (to save API costs).
# Files with more examples than this are randomly subsampled; smaller files are
# scored in full.
# This value is captured as the default `sample_size` argument when the scoring
# functions are defined, so re-run their cell after changing it.
SAMPLE_SIZE = 100

## LLM Judge Functions

In [None]:
def judge_short_score(instruction, response):
    """
    Score a response with an LLM judge (OpenAI) and return a score from 1-10.

    NOTE: despite the function name, the judging prompt below rates
    correctness and relevance of the response, not brevity.

    TODO: Fill in the specific judging prompt for the 'short' concept.

    Args:
        instruction: The instruction/question the response was generated for.
        response: The model response to be judged.

    Returns:
        The first number found in the judge's reply, clamped to the range 1-10.

    Raises:
        ValueError: If the judge's reply contains no number.
    """
    import re

    judging_prompt = f"""Rate how correct and relevant this response is on a scale from 1-10.

    A score of 10 means:
    - The response is factually accurate
    - The response directly answers the question

    A score of 1 means:
    - The response is factually incorrect
    - The response discusses things other than the question posed

Instruction: {instruction}

Response: {response}

Provide ONLY a single number from 1-10, with no explanation:"""

    # temperature=0 keeps the judge as deterministic as the API allows;
    # max_tokens is small because only a single number is expected back.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": judging_prompt}
        ],
        max_tokens=10,
        temperature=0
    )

    # The message content is optional in the API response; treat a missing
    # body as empty text so the failure surfaces as the ValueError below
    # rather than an AttributeError on None.
    score_text = (completion.choices[0].message.content or "").strip()

    # Accept an optional decimal part so a reply such as "7.5" is not
    # silently truncated to 7.
    match = re.search(r'\d+(?:\.\d+)?', score_text)
    if match is None:
        raise ValueError(f"Could not parse score from: {score_text}")

    # Clamp out-of-range replies (e.g. "0" or "11") into the 1-10 scale.
    score = float(match.group())
    return min(max(score, 1), 10)

## Scoring Functions

In [None]:
import random

def score_inference_file(filepath, judge_fn, sample_size=SAMPLE_SIZE, response_key='response', prompt_key='prompt'):
    """
    Load a JSONL results file, score a sample of it with an LLM judge, and
    return summary statistics.

    Args:
        filepath: Path to a JSONL file with one JSON object per line.
        judge_fn: Callable taking (prompt, response) and returning a numeric score.
        sample_size: Maximum number of examples to score (to save API costs).
            Larger files are randomly subsampled without replacement.
        response_key: Key for the response in each record (default 'response');
            falls back to 'chosen' when the key is absent.
        prompt_key: Key for the prompt/instruction in each record (default 'prompt');
            falls back to 'instruction' when the key is absent.

    Returns:
        Dict with 'mean', 'median', 'std', 'count' and the raw 'scores' list.
        If no example could be scored, the three statistics are NaN and
        'count' is 0.
    """
    print(f"Loading {filepath}...")

    # Skip blank lines (e.g. a trailing newline at end of file) instead of
    # letting json.loads fail on an empty string.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    print(f"  Loaded {len(data)} examples")

    # Sample for scoring
    if len(data) > sample_size:
        print(f"  Sampling {sample_size} examples for scoring")
        sampled_data = random.sample(data, sample_size)
    else:
        sampled_data = data

    print(f"  Scoring {len(sampled_data)} examples...")

    scores = []
    for i, ex in enumerate(sampled_data):
        try:
            prompt = ex.get(prompt_key, ex.get('instruction', ''))
            response = ex.get(response_key, ex.get('chosen', ''))

            score = judge_fn(prompt, response)
            scores.append(score)

            if (i + 1) % 20 == 0:
                print(f"    Progress: {i+1}/{len(sampled_data)} | Current avg: {np.mean(scores):.2f}")
        except Exception as e:
            # Deliberate best-effort: one failed judge call (API error,
            # unparseable reply) must not abort the whole run. The example
            # is reported and left out of the statistics.
            print(f"    Warning: Failed to score example {i}: {e}")
            continue

    if scores:
        mean, median, std = np.mean(scores), np.median(scores), np.std(scores)
    else:
        # numpy would return NaN here too, but with RuntimeWarnings and no
        # hint at the cause; make the empty case explicit instead.
        print("  Warning: no examples were scored successfully")
        mean = median = std = float('nan')

    # Named `summary` (not `stats`) to avoid shadowing the scipy `stats` import.
    summary = {
        'mean': mean,
        'median': median,
        'std': std,
        'count': len(scores),
        'scores': scores
    }

    print(f"  Mean score: {summary['mean']:.2f} (std: {summary['std']:.2f})\n")

    return summary


def score_training_data(filepath, judge_fn, sample_size=SAMPLE_SIZE):
    """
    Score a fine-tuning-format training file with an LLM judge.

    Thin wrapper around score_inference_file that reads the response from the
    'chosen' key and the prompt from the 'prompt' key of each record.
    """
    # Training records store the target text under 'chosen', not 'response'.
    key_overrides = {'response_key': 'chosen', 'prompt_key': 'prompt'}
    return score_inference_file(filepath, judge_fn, sample_size=sample_size, **key_overrides)

## Run Evaluations

In [None]:
# ============================================================================
# RUN EVALUATIONS
# ============================================================================

# Maps a run name to the statistics dict returned by the scoring functions.
results = {}

# Banner: rule, title, rule followed by a blank line.
print("="*70, "CAPABILITY SCORE EVALUATION", "="*70 + "\n", sep="\n")

In [None]:
# Base model: the same inference file is scored once per concept.
# NOTE(review): judge_kidmode_score is not defined in the visible part of this
# notebook -- confirm it exists before enabling this cell.
if EVAL_BASE_MODEL:
    print("### BASE MODEL INFERENCE ###", "-" * 40, sep="\n")

    print("\nScoring for KIDMODE concept:")
    results['base_model_kidmode'] = score_inference_file(BASE_MODEL_FILE, judge_kidmode_score)

    print("Scoring for SHORT concept:")
    results['base_model_short'] = score_inference_file(BASE_MODEL_FILE, judge_short_score)

In [None]:
# Training data (kidmode): scores the 'chosen' targets of the fine-tuning set.
# NOTE(review): judge_kidmode_score is not defined in the visible part of this
# notebook -- confirm it exists before enabling this cell.
if EVAL_TRAINING_DATA_KIDMODE:
    print("### TRAINING DATA - KIDMODE ###", "-" * 40, sep="\n")

    results['training_data_kidmode'] = score_training_data(TRAINING_DATA_KIDMODE_FILE, judge_kidmode_score)

In [None]:
# Training data (short): scores the 'chosen' targets of the fine-tuning set.
if EVAL_TRAINING_DATA_SHORT:
    print("### TRAINING DATA - SHORT ###", "-" * 40, sep="\n")

    results['training_data_short'] = score_training_data(TRAINING_DATA_SHORT_FILE, judge_short_score)

In [None]:
# Neologism inference results for the kidmode concept.
# NOTE(review): judge_kidmode_score is not defined in the visible part of this
# notebook -- confirm it exists before enabling this cell.
if EVAL_NEOLOGISM_KIDMODE:
    print("### NEOLOGISM INFERENCE - KIDMODE ###", "-" * 40, sep="\n")

    results['neologism_kidmode'] = score_inference_file(NEOLOGISM_KIDMODE_FILE, judge_kidmode_score)

In [None]:
# Neologism inference results for the short concept.
if EVAL_NEOLOGISM_SHORT:
    print("### NEOLOGISM INFERENCE - SHORT ###", "-" * 40, sep="\n")

    results['neologism_short'] = score_inference_file(NEOLOGISM_SHORT_FILE, judge_short_score)

In [None]:
# Combined (composition) neologism inference: the same file is scored three
# times, once per judge.
# NOTE(review): judge_kidmode_score and judge_combined_score are not defined in
# the visible part of this notebook -- confirm they exist before enabling this cell.
if EVAL_NEOLOGISM_COMBINED:
    print("### NEOLOGISM INFERENCE - COMBINED ###", "-" * 40, sep="\n")

    # Kidmode judge
    print("\nScoring for KIDMODE concept:")
    results['neologism_combined_kidmode'] = score_inference_file(NEOLOGISM_COMBINED_FILE, judge_kidmode_score)

    # Short judge
    print("Scoring for SHORT concept:")
    results['neologism_combined_short'] = score_inference_file(NEOLOGISM_COMBINED_FILE, judge_short_score)

    # Combined judge
    print("Scoring for COMBINED concept:")
    results['neologism_combined_both'] = score_inference_file(NEOLOGISM_COMBINED_FILE, judge_combined_score)

In [None]:
# Fine-tuning (LoRA) inference results for the kidmode concept.
# NOTE(review): judge_kidmode_score is not defined in the visible part of this
# notebook -- confirm it exists before enabling this cell.
if EVAL_FINETUNING_KIDMODE:
    print("### FINE-TUNING INFERENCE - KIDMODE ###", "-" * 40, sep="\n")

    results['finetuning_kidmode'] = score_inference_file(FINETUNING_KIDMODE_FILE, judge_kidmode_score)

In [None]:
# Fine-tuning (LoRA) inference results for the short concept.
if EVAL_FINETUNING_SHORT:
    print("### FINE-TUNING INFERENCE - SHORT ###", "-" * 40, sep="\n")

    results['finetuning_short'] = score_inference_file(FINETUNING_SHORT_FILE, judge_short_score)

In [None]:
# Prompting baseline results for the kidmode concept.
# NOTE(review): judge_kidmode_score is not defined in the visible part of this
# notebook -- confirm it exists before enabling this cell.
if EVAL_PROMPTING_KIDMODE:
    print("### PROMPTING - KIDMODE ###", "-" * 40, sep="\n")

    results['prompting_kidmode'] = score_inference_file(PROMPTING_KIDMODE_FILE, judge_kidmode_score)

In [None]:
# Prompting baseline results for the short concept.
if EVAL_PROMPTING_SHORT:
    print("### PROMPTING - SHORT ###", "-" * 40, sep="\n")

    results['prompting_short'] = score_inference_file(PROMPTING_SHORT_FILE, judge_short_score)

## Results Summary

In [None]:
# ============================================================================
# SUMMARY TABLE
# ============================================================================
# Prints one row per evaluated run: mean, median, standard deviation and the
# number of successfully scored examples.

print("\n" + "="*70)
print("CAPABILITY SCORE SUMMARY")
print("="*70)

print(f"\n{'Method':<40} {'Mean':>8} {'Median':>8} {'Std':>8} {'N':>6}")
print("-"*70)

# The loop variable is named `summary` rather than `stats`: at cell level a
# variable called `stats` would rebind the `from scipy import stats` import
# for the rest of the notebook session.
for name, summary in results.items():
    print(f"{name:<40} {summary['mean']:>8.2f} {summary['median']:>8.2f} {summary['std']:>8.2f} {summary['count']:>6}")

print("="*70)

In [None]:
# ============================================================================
# VISUALIZATION
# ============================================================================
# Bar chart of the mean score per run, with the standard deviation as error
# bars and one colour per method family.

if results:
    fig, ax = plt.subplots(figsize=(14, 6))

    names = list(results)
    means = [results[run]['mean'] for run in names]
    stds = [results[run]['std'] for run in names]

    # Method family -> bar colour. Order matters: the first substring found in
    # the run name wins; names matching none of them are drawn in gray.
    palette = (
        ('base', 'lightcoral'),
        ('training', 'lightgreen'),
        ('neologism', 'lightblue'),
        ('finetuning', 'plum'),
        ('prompting', 'wheat'),
    )
    colors = [
        next((shade for family, shade in palette if family in run), 'gray')
        for run in names
    ]

    positions = range(len(names))
    bars = ax.bar(positions, means, color=colors, alpha=0.7, yerr=stds, capsize=3)

    ax.set_xticks(positions)
    ax.set_xticklabels(names, rotation=45, ha='right')
    ax.set_ylabel('Capability Score (1-10)')
    ax.set_title('Capability Scores by Method')
    ax.set_ylim(0, 11)
    ax.grid(True, alpha=0.3, axis='y')

    # Print each mean just above the top of its bar.
    for rect, value in zip(bars, means):
        label_x = rect.get_x() + rect.get_width()/2.
        ax.text(label_x, value + 0.3, f'{value:.1f}',
                ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig('capability_scores_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nVisualization saved to capability_scores_visualization.png")

In [None]:
# ============================================================================
# SAVE RESULTS
# ============================================================================

def convert_to_serializable(obj):
    """
    Recursively convert numpy values to JSON-serializable Python types.

    Args:
        obj: A scalar, numpy scalar, numpy array, dict, list or tuple
            (arbitrarily nested).

    Returns:
        An equivalent structure in which numpy booleans/integers/floats are
        Python bool/int/float, numpy arrays and tuples are lists, and dict
        values and list items are converted recursively. Any other object is
        returned unchanged.
    """
    # Booleans are checked first because Python's bool is a subclass of int.
    if isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    # Keep integers integral instead of turning e.g. a count of 3 into 3.0.
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    # Any remaining numpy number keeps the previous float conversion.
    if isinstance(obj, np.number):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    # Tuples are serialized by json as arrays anyway; convert their items too.
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(v) for v in obj]
    return obj

# Persist only the summary statistics of each run; the per-example 'scores'
# lists are dropped to keep the output file small.
results_summary = convert_to_serializable({
    run_name: {
        field: value
        for field, value in run_stats.items()
        if field != 'scores'
    }
    for run_name, run_stats in results.items()
})

with open('capability_scores_results.json', 'w') as f:
    f.write(json.dumps(results_summary, indent=2))

print("Results saved to capability_scores_results.json")