In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add project to path
project_root = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
sys.path.insert(0, str(project_root / 'src'))

print("🏝️ Survivor Model Training & Analysis")
print("="*50)

try:
    from model_trainer import SurvivorModelTrainer
    from model_evaluator import SurvivorModelEvaluator
    print("✅ All modules imported successfully")
    
    # Initialize trainer
    trainer = SurvivorModelTrainer()
    
    # Train all models (this may take a few minutes)
    print("🚀 Starting model training...")
    results = trainer.train_all_models()
    
    if results:
        print("✅ Training completed successfully!")
        print(f"✅ Trained {sum(len(task_results) for task_results in results.values())} models")
    else:
        print("❌ Training failed!")
        
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Make sure you're running from the notebooks/ directory")
except Exception as e:
    print(f"❌ Error during training: {e}")
    import traceback
    traceback.print_exc()

✅ SurvivorDataProcessor imported successfully
✅ All modules imported successfully


In [4]:
# Get performance comparison
if results:
    print("📊 MODEL PERFORMANCE COMPARISON")
    print("-" * 40)
    
    comparison_df = trainer.get_model_comparison()
    display(comparison_df)
    
    # Plot comparison
    trainer.plot_model_comparison()
    
    # Print insights
    print("\n💡 Key Insights:")
    
    # Best models for each task
    for task in comparison_df['task'].unique():
        task_data = comparison_df[comparison_df['task'] == task]
        best_model = task_data.loc[task_data['score'].idxmax()]
        print(f"   • Best {task.replace('_', ' ')}: {best_model['model']} ({best_model['score']:.3f})")

NameError: name 'results' is not defined

In [None]:
# Analyze feature importance for key predictions
key_tasks = ['merge_prediction', 'finale_prediction', 'winner_prediction']

for task in key_tasks:
    print(f"\n🔍 {task.replace('_', ' ').upper()} - TOP FEATURES")
    print("-" * 40)
    
    # Show Random Forest features (usually most interpretable)
    features = trainer.get_top_features(task, 'random_forest', top_n=10)
    if features is not None:
        display(features)
        trainer.plot_feature_importance(task, 'random_forest', top_n=15)
    else:
        print("No feature importance available")

In [None]:
# Initialize evaluator
evaluator = SurvivorModelEvaluator()

# Performance summary
summary = evaluator.create_performance_summary()
print("🎯 COMPREHENSIVE PERFORMANCE SUMMARY")
display(summary)

# Create detailed visualizations
print("\n📊 Creating detailed evaluation plots...")

# Confusion matrices
evaluator.plot_confusion_matrices()

# ROC curves
evaluator.plot_roc_curves()

# Prediction vs actual for regression
evaluator.plot_prediction_vs_actual()

In [None]:
# Analyze overall feature importance patterns
print("🧠 FEATURE IMPORTANCE ANALYSIS")
print("-" * 40)

importance_df = evaluator.analyze_feature_importance_patterns()

if importance_df is not None:
    print("Top 15 Most Important Features Across All Models:")
    display(importance_df[['mean_importance']].head(15))
    
    # Analyze feature categories
    print("\n📋 Feature Category Analysis:")
    
    def categorize_feature(feature_name):
        if any(x in feature_name.lower() for x in ['challenge', 'win_rate', 'tribal', 'individual']):
            return 'Challenge Performance'
        elif any(x in feature_name.lower() for x in ['age', 'gender', 'fitness', 'athletic', 'physical']):
            return 'Demographics/Physical'
        elif any(x in feature_name.lower() for x in ['strategic', 'alliance', 'advantage', 'votes']):
            return 'Strategic Gameplay'
        elif any(x in feature_name.lower() for x in ['occupation', 'home', 'region', 'relationship']):
            return 'Background/Social'
        elif any(x in feature_name.lower() for x in ['knowledge', 'survivor']):
            return 'Game Knowledge'
        else:
            return 'Other'
    
    # Apply categorization
    top_20_features = importance_df.head(20).copy()
    top_20_features['category'] = top_20_features.index.map(categorize_feature)
    
    # Group by category
    category_importance = top_20_features.groupby('category')['mean_importance'].agg(['sum', 'count', 'mean']).round(3)
    category_importance.columns = ['Total_Importance', 'Feature_Count', 'Avg_Importance']
    category_importance = category_importance.sort_values('Total_Importance', ascending=False)
    
    display(category_importance)
    
    # Plot category importance
    plt.figure(figsize=(10, 6))
    plt.bar(category_importance.index, category_importance['Total_Importance'], 
            alpha=0.7, color='steelblue')
    plt.title('Feature Importance by Category (Top 20 Features)')
    plt.xlabel('Feature Category')
    plt.ylabel('Total Importance Score')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()