# Kid Mode Method Comparison Analysis

This notebook compares three methods for adapting model outputs to a "kid mode" style:
1. **Neologism** - Using a novel-terminology approach
2. **LoRA** - Low-Rank Adaptation fine-tuning
3. **Prompting** - Explicit instruction prompting

We analyze gap closure metrics, assess statistical significance, and visualize the results.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress all warnings so library notices do not clutter the cell output.
warnings.filterwarnings('ignore')

# Shared plotting defaults: whitegrid theme, 10x6 inch figures, 12 pt text.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

## 1. Load Data

In [None]:
# Load the gap closure results for each method
def _load_results(path):
    """Parse the JSON file at `path` and return the decoded object."""
    # JSON text is UTF-8; pin the encoding so decoding does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


neo_results = _load_results('kidmode_neo_gap_closure_results.json')
lora_results = _load_results('kidmode_lora_gap_closure_results.json')
prompting_results = _load_results('kidmode_prompting_gap_closure_results.json')

print("Data loaded successfully!")

## 2. Define Statistics

In [None]:
# Summary statistics for the baseline and the three adaptation methods
methods = ['Baseline', 'Neologism', 'LoRA', 'Prompting']

# The three loaded result sets, in the order used for every average below
result_sets = [neo_results, lora_results, prompting_results]

# Baseline: arithmetic mean of the per-file baseline mean and baseline std
# NOTE(review): baseline_std is a plain average of the three std values, not a
# pooled standard deviation -- confirm this is the intended summary.
baseline_mean = np.mean([r['baseline']['mean'] for r in result_sets])
baseline_std = np.mean([r['baseline']['std'] for r in result_sets])

# Per-method values are hard-coded literals rather than read from the JSON.
# NOTE(review): presumably copied from the experiment output -- confirm they
# still match the result files loaded above.
means = [baseline_mean, 6.136667, 5.763333, 6.970000]
stds = [baseline_std, 2.217654, 2.411221, 2.283805]

# Training target: arithmetic mean of the per-file training means
training_mean = np.mean([r['training']['mean'] for r in result_sets])

# Print one line per method, followed by the target value
print("Statistics:")
for method, mean, std in zip(methods, means, stds):
    print(f"  {method}: mean={mean:.2f}, std={std:.2f}")
print(f"  Target (Training): {training_mean:.2f}")

## 3. Bar Chart Comparison

In [None]:
def create_bar_chart(methods, means, stds, target, save_path=None,
                     colors=None, ylim=(0, 11)):
    """
    Create a bar chart with error bars, one bar per entry in `methods`.

    Parameters
    ----------
    methods : sequence of str
        Tick labels, one per bar.
    means : sequence of float
        Bar heights, aligned with `methods`.
    stds : sequence of float
        Symmetric error-bar sizes (passed as `yerr`), aligned with `methods`.
    target : float
        y-value of the dashed green horizontal reference line.
    save_path : str, optional
        When given, the figure is saved to this path at 300 dpi before it is
        shown.
    colors : sequence, optional
        Bar fill colors. Defaults to Gray, Red, Blue, Green (Baseline,
        Neologism, LoRA, Prompting). The palette repeats when there are more
        bars than colors.
    ylim : tuple or None, optional
        (lower, upper) y-axis limits; defaults to (0, 11). Pass None to keep
        matplotlib's automatic limits.

    Returns
    -------
    The matplotlib figure that was created.

    Raises
    ------
    ValueError
        If `methods`, `means` and `stds` do not all have the same length.
    """
    # Validate up front so a mismatch gives a clear message instead of a
    # shape error raised from inside matplotlib.
    n_bars = len(methods)
    if not (n_bars == len(means) == len(stds)):
        raise ValueError(
            "methods, means and stds must have the same length "
            f"(got {n_bars}, {len(means)}, {len(stds)})"
        )

    fig, ax = plt.subplots(figsize=(10, 7))

    # One x position per method, so the chart works for any number of bars.
    positions = list(range(n_bars))
    palette = list(colors) if colors else ['#95a5a6', '#e74c3c', '#3498db', '#2ecc71']  # Gray, Red, Blue, Green
    bar_colors = [palette[i % len(palette)] for i in range(n_bars)]

    # Create bar chart with error bars
    ax.bar(positions, means, yerr=stds, width=0.6,
           color=bar_colors, alpha=0.7, edgecolor='black', linewidth=1.5,
           capsize=5, error_kw={'linewidth': 1.5, 'capthick': 1.5})

    # Add target line - GREEN dashed line
    ax.axhline(y=target, color='green', linestyle='--', linewidth=2, label=f'Target ({target:.1f})')

    # Formatting
    ax.set_xticks(positions)
    ax.set_xticklabels(methods, fontsize=14)
    ax.set_ylabel('Score', fontsize=14)
    ax.set_title('Kidmode Method Comparison', fontsize=16, fontweight='bold')
    if ylim is not None:
        ax.set_ylim(*ylim)

    # Add legend with box outline
    legend = ax.legend(loc='upper right', fontsize=12, frameon=True,
                       fancybox=False, edgecolor='black')
    legend.get_frame().set_linewidth(1.5)

    # Add black outline around the graph (spines)
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color('black')
        spine.set_linewidth(1.5)

    # Add horizontal grid lines, drawn beneath the bars
    ax.yaxis.grid(True, linestyle='-', alpha=0.3)
    ax.set_axisbelow(True)

    plt.tight_layout()

    # Save before plt.show() so the file is written before the figure is displayed
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to {save_path}")

    plt.show()
    return fig

# Render the comparison figure and save it as a PNG
fig1 = create_bar_chart(
    methods,
    means,
    stds,
    target=training_mean,
    save_path='bar_chart_comparison.png',
)