# GPT-2 Training Monitor

This notebook helps you monitor and analyze your GPT-2 training progress.

In [None]:
import os
import json
import yaml
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import seaborn as sns

# Notebook-wide plotting defaults: seaborn's "whitegrid" style and a
# 12x8 default figure size for every figure created without an explicit size.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

# Visible confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

In [None]:
# Load configuration
# This cell only prints a few config values for reference, so a missing file,
# an empty file, or a missing key is reported instead of raising and halting
# a "Run All".
config_path = "../configs/config.yaml"


def _config_section(cfg, name):
    """Return the mapping stored under `name` in `cfg`, or {} if absent or not a mapping."""
    section = cfg.get(name)
    return section if isinstance(section, dict) else {}


if os.path.exists(config_path):
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(config_path, 'r', encoding='utf-8') as file:
        loaded_config = yaml.safe_load(file)

    # safe_load returns None for an empty document; normalise anything that is
    # not a mapping to an empty dict so the lookups below cannot fail.
    config = loaded_config if isinstance(loaded_config, dict) else {}

    model_cfg = _config_section(config, 'model')
    training_cfg = _config_section(config, 'training')

    print("Configuration loaded:")
    print(f"Model: {model_cfg.get('name', 'N/A')}")
    print(f"Training epochs: {training_cfg.get('num_train_epochs', 'N/A')}")
    print(f"Batch size: {training_cfg.get('per_device_train_batch_size', 'N/A')}")
    print(f"Learning rate: {training_cfg.get('learning_rate', 'N/A')}")
else:
    print(f"Configuration file not found: {config_path}")
    config = {}

In [None]:
# Find training runs
outputs_dir = "../outputs"

# Start from the "nothing to analyse" state; the values are only overwritten
# when at least one matching run directory is found.
training_runs = []
selected_run = None
run_path = None

if not os.path.exists(outputs_dir):
    print("Outputs directory not found.")
else:
    # A run is any sub-directory whose name starts with "gpt2_finetuned".
    candidates = []
    for entry in os.listdir(outputs_dir):
        is_directory = os.path.isdir(os.path.join(outputs_dir, entry))
        if is_directory and entry.startswith("gpt2_finetuned"):
            candidates.append(entry)

    if not candidates:
        print("No training runs found. Train a model first!")
    else:
        # Most recently modified directory first.
        training_runs = sorted(
            candidates,
            key=lambda name: os.path.getmtime(os.path.join(outputs_dir, name)),
            reverse=True,
        )
        print(f"Found {len(training_runs)} training runs:")
        for position, name in enumerate(training_runs):
            print(f"{position}: {name}")

        # The newest run is the default subject of the cells that follow.
        selected_run = training_runs[0]
        run_path = os.path.join(outputs_dir, selected_run)
        print(f"\nAnalyzing latest run: {selected_run}")

In [None]:
# Load training metrics
def _load_json_if_present(json_path, label):
    """Load a JSON file if it exists, printing a status line either way.

    Args:
        json_path: Path of the JSON file to read.
        label: Human-readable name used in the status message.

    Returns:
        The parsed JSON content, or None when the file does not exist.
    """
    if not os.path.exists(json_path):
        print(f"❌ {label} not found")
        return None
    # JSON is read as UTF-8 explicitly rather than with the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        content = json.load(f)
    print(f"✅ {label} loaded")
    return content


# Bind every name up front so later cells never hit a NameError, whichever
# branch runs below (previously `metrics_data` was unset without a run path).
metrics_data = {}
trainer_state = None
train_results = None
eval_results = None

if run_path:
    trainer_state_path = os.path.join(run_path, "trainer_state.json")
    train_results_path = os.path.join(run_path, "train_results.json")
    eval_results_path = os.path.join(run_path, "eval_results.json")

    trainer_state = _load_json_if_present(trainer_state_path, "Trainer state")
    train_results = _load_json_if_present(train_results_path, "Training results")
    eval_results = _load_json_if_present(eval_results_path, "Evaluation results")

    # Collect only the artifacts that were actually loaded.
    loaded_artifacts = (
        ('trainer_state', trainer_state),
        ('train_results', train_results),
        ('eval_results', eval_results),
    )
    metrics_data = {name: content for name, content in loaded_artifacts if content is not None}
else:
    print("No run path available for analysis")

In [None]:
# Training Summary
def _print_metric_block(heading, metrics):
    """Print `heading`, a 40-character rule, then one `key: value` line per metric.

    Float values are shown with four decimal places; all other values are
    printed with their default formatting.
    """
    print(heading)
    print("=" * 40)

    for name, val in metrics.items():
        shown = f"{val:.4f}" if isinstance(val, float) else f"{val}"
        print(f"{name}: {shown}")


# Each section is printed only when its results are present and non-empty.
if train_results:
    _print_metric_block("📊 Training Summary", train_results)

if eval_results:
    _print_metric_block("\n📈 Evaluation Summary", eval_results)

In [None]:
# Plot training curves
def _step_axis(logs):
    """Return x-values for a log frame: its 'step' column, or the row index if absent."""
    return logs['step'] if 'step' in logs.columns else logs.index


if trainer_state and trainer_state.get('log_history'):
    log_history = trainer_state['log_history']

    # Convert to DataFrame (one row per logged entry; missing keys become NaN)
    df = pd.DataFrame(log_history)

    # The Hugging Face Trainer writes the periodic training loss under 'loss'.
    # 'train_loss' only appears once, in the end-of-training summary entry,
    # which carries no learning rate. Prefer 'loss' and fall back to
    # 'train_loss' for histories that only contain the latter.
    if 'loss' in df.columns:
        train_loss_col = 'loss'
    elif 'train_loss' in df.columns:
        train_loss_col = 'train_loss'
    else:
        train_loss_col = None

    # Separate training and evaluation logs
    train_logs = df[df[train_loss_col].notna()].copy() if train_loss_col else pd.DataFrame()
    eval_logs = df[df['eval_loss'].notna()].copy() if 'eval_loss' in df.columns else pd.DataFrame()

    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(f'Training Progress: {selected_run}', fontsize=16)

    # Training Loss
    if not train_logs.empty:
        axes[0, 0].plot(_step_axis(train_logs), train_logs[train_loss_col], 'b-', linewidth=2)
        axes[0, 0].set_title('Training Loss')
        axes[0, 0].set_xlabel('Step')
        axes[0, 0].set_ylabel('Loss')
        axes[0, 0].grid(True)

    # Evaluation Loss
    if not eval_logs.empty:
        axes[0, 1].plot(_step_axis(eval_logs), eval_logs['eval_loss'], 'r-', linewidth=2)
        axes[0, 1].set_title('Evaluation Loss')
        axes[0, 1].set_xlabel('Step')
        axes[0, 1].set_ylabel('Loss')
        axes[0, 1].grid(True)

    # Learning Rate (only rows that actually recorded one)
    if not train_logs.empty and 'learning_rate' in train_logs.columns:
        lr_logs = train_logs[train_logs['learning_rate'].notna()]
        if not lr_logs.empty:
            axes[1, 0].plot(_step_axis(lr_logs), lr_logs['learning_rate'], 'g-', linewidth=2)
            axes[1, 0].set_title('Learning Rate')
            axes[1, 0].set_xlabel('Step')
            axes[1, 0].set_ylabel('Learning Rate')
            axes[1, 0].grid(True)

    # Combined Loss Comparison
    has_labelled_curve = False
    if not train_logs.empty:
        axes[1, 1].plot(_step_axis(train_logs), train_logs[train_loss_col], 'b-', label='Training', linewidth=2)
        has_labelled_curve = True
    if not eval_logs.empty:
        axes[1, 1].plot(_step_axis(eval_logs), eval_logs['eval_loss'], 'r-', label='Evaluation', linewidth=2)
        has_labelled_curve = True
    axes[1, 1].set_title('Loss Comparison')
    axes[1, 1].set_xlabel('Step')
    axes[1, 1].set_ylabel('Loss')
    # Only request a legend when something was drawn; an empty legend() call
    # emits a "no artists with labels" warning.
    if has_labelled_curve:
        axes[1, 1].legend()
    axes[1, 1].grid(True)

    plt.tight_layout()
    plt.show()

    # Display statistics
    print("\n📊 Training Statistics:")
    if not train_logs.empty:
        print(f"Final training loss: {train_logs[train_loss_col].iloc[-1]:.4f}")
        print(f"Min training loss: {train_logs[train_loss_col].min():.4f}")
    if not eval_logs.empty:
        print(f"Final evaluation loss: {eval_logs['eval_loss'].iloc[-1]:.4f}")
        print(f"Min evaluation loss: {eval_logs['eval_loss'].min():.4f}")
else:
    print("No training history available for plotting")

In [None]:
# Model file analysis
if run_path:
    print("📁 Model Files Analysis")
    print("=" * 40)

    # File names looked up directly inside the selected run directory.
    model_files = [
        'pytorch_model.bin',
        'config.json',
        'tokenizer.json',
        'tokenizer_config.json',
        'training_args.bin',
        'trainer_state.json',
    ]

    # Running byte total over the files that exist.
    total_size = 0
    for filename in model_files:
        candidate = os.path.join(run_path, filename)
        if not os.path.exists(candidate):
            print(f"❌ {filename}: Not found")
            continue

        n_bytes = os.path.getsize(candidate)
        total_size += n_bytes
        print(f"✅ {filename}: {n_bytes / 1024 / 1024:.1f} MB")

    print(f"\nTotal model size: {total_size / 1024 / 1024:.1f} MB")
    print(f"Model directory: {run_path}")

In [None]:
# Compare multiple training runs
def _prefix_once(metrics, prefix):
    """Return a copy of `metrics` in which every key starts with `prefix` exactly once.

    Keys that already begin with `prefix` (e.g. 'eval_loss' with prefix 'eval_')
    are kept as-is, so the result never contains names like 'eval_eval_loss'.
    """
    return {
        (key if key.startswith(prefix) else f"{prefix}{key}"): value
        for key, value in metrics.items()
    }


if len(training_runs) > 1:
    print("🔄 Comparing Multiple Training Runs")
    print("=" * 50)

    comparison_data = []

    for run in training_runs[:5]:  # Compare up to 5 runs
        run_dir = os.path.join(outputs_dir, run)
        run_data = {'run_name': run}

        # Merge whichever result files exist for this run into one flat record.
        for prefix, results_name in (('eval_', "eval_results.json"), ('train_', "train_results.json")):
            results_file = os.path.join(run_dir, results_name)
            if os.path.exists(results_file):
                with open(results_file, 'r', encoding='utf-8') as f:
                    run_data.update(_prefix_once(json.load(f), prefix))

        comparison_data.append(run_data)

    comparison_df = pd.DataFrame(comparison_data)

    # Show only the summary columns that are actually present; a run without
    # result files contributes no metric columns at all.
    summary_columns = [
        col for col in ('run_name', 'eval_loss', 'train_loss')
        if col in comparison_df.columns
    ]
    print(comparison_df[summary_columns].round(4))

    # Plot comparison (skipped when no run reported an evaluation loss)
    if 'eval_loss' in comparison_df.columns and comparison_df['eval_loss'].notna().any():
        plt.figure(figsize=(12, 6))
        plt.bar(range(len(comparison_df)), comparison_df['eval_loss'])
        plt.xlabel('Training Run')
        plt.ylabel('Evaluation Loss')
        plt.title('Evaluation Loss Comparison Across Runs')
        # Long run names are truncated to 20 characters to keep tick labels readable.
        plt.xticks(range(len(comparison_df)), [run[:20] + '...' if len(run) > 20 else run for run in comparison_df['run_name']], rotation=45)
        plt.tight_layout()
        plt.show()
elif training_runs:
    print("Only one training run available for comparison")
else:
    print("No training runs available for comparison")

## Training Tips

Compare the loss curves and statistics above against the patterns below:

### Good Signs:
- ✅ Training loss decreasing over time
- ✅ Evaluation loss decreasing (not increasing)
- ✅ Gap between training and evaluation loss is reasonable

### Warning Signs:
- ⚠️ Evaluation loss increasing while training loss decreases (overfitting)
- ⚠️ Very large gap between training and evaluation loss
- ⚠️ Loss plateauing too early

### Adjustments:
- **If overfitting**: Reduce the learning rate, add regularization (e.g. weight decay or dropout), or train for fewer epochs
- **If underfitting**: Increase the learning rate, train longer, or use a larger model
- **If loss plateaus**: Adjust the learning rate schedule or try a different optimizer