In [None]:
# Cell 1: Setup and Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
import os
from pathlib import Path

# Locate the project root so that the modules under src/ can be imported.
#
# The notebook may be started from the project root, from notebooks/, or from
# a deeper folder, so walk upward from the working directory and take the
# first directory that actually contains src/data_processor.py (the module
# imported further down in this cell).
current_dir = Path.cwd()
project_root = next(
    (candidate for candidate in (current_dir, *current_dir.parents)
     if (candidate / 'src' / 'data_processor.py').is_file()),
    None,
)
if project_root is None:
    # Marker file not found: fall back to the original heuristic so the
    # diagnostics below still point at a sensible location.
    if 'notebooks' in str(current_dir):
        project_root = current_dir.parent
    else:
        project_root = current_dir

# Put the project root and src/ at the front of sys.path (src/ first).
# Any existing entry is removed before re-inserting so that re-running the
# cell keeps the same priority without piling up duplicate entries.
for import_dir in (str(project_root), str(project_root / 'src')):
    if import_dir in sys.path:
        sys.path.remove(import_dir)
    sys.path.insert(0, import_dir)

print(f"Project root: {project_root}")
print(f"Current working directory: {current_dir}")

# Import the project's data processor from src/ (made importable by the
# sys.path setup earlier in this cell).
try:
    from data_processor import SurvivorDataProcessor
    print("✅ Successfully imported SurvivorDataProcessor")
except ImportError as e:
    # List what src/ actually contains to help diagnose the failure.
    print(f"❌ Import error: {e}")
    print("Available files in src/:")
    src_dir = project_root / 'src'
    if src_dir.is_dir():
        # Sorted so the listing is stable between runs.
        for file in sorted(src_dir.iterdir()):
            print(f"  - {file.name}")
    else:
        print("  src/ directory not found!")
    # Re-raise instead of carrying on: without SurvivorDataProcessor nothing
    # below can work, and continuing only surfaces later as an unrelated
    # NameError that hides the real cause.
    raise

# Plot appearance: matplotlib's stock style sheet plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Banner announcing the start of the analysis, underlined with 50 '=' characters
banner_title = "🏝️ Survivor Prediction Model - EDA"
banner_rule = "=" * 50
print("\n" + banner_title)
print(banner_rule)

# Initialize processor and load data.
# Failures are reported here rather than raised, so the cell always finishes;
# note that `processor` / `df` may then be missing for the cells that follow.
try:
    processor = SurvivorDataProcessor()
    df = processor.load_data()

    # NOTE(review): load_data() appears to signal failure by returning None.
    if df is not None:
        print(f"✅ Dataset shape: {df.shape}")
        print(f"✅ Seasons covered: {df['Season'].min()} to {df['Season'].max()}")

        # Display first few rows
        print("\n📋 First 3 rows:")
        display(df.head(3))

        # Basic info
        print("\n📊 Dataset Info:")
        print(f"   Rows: {len(df)}")
        print(f"   Columns: {len(df.columns)}")
        print(f"   Missing values: {df.isnull().sum().sum()}")

    else:
        print("❌ Failed to load data!")

except NameError as e:
    # `processor` is assigned inside this block, so it is never the undefined
    # name; report the name Python actually complained about (typically
    # SurvivorDataProcessor after a failed import).
    print(f"❌ {e} - check imports above")
except Exception as e:
    print(f"❌ Unexpected error: {e}")
    import traceback
    traceback.print_exc()

Import error: No module named 'config.config'; 'config' is not a package
Falling back to direct imports...
Config import failed. Running from project root...


ModuleNotFoundError: No module named 'config.config'; 'config' is not a package

In [7]:
# Run the processor's full pipeline. It returns three values, unpacked here
# as X, targets and df_processed; only `targets` is used in this cell.
X, targets, df_processed = processor.process_full_pipeline(df)

# 2x3 grid: top row shows the continuous / tiered targets, bottom row the
# three yes/no outcomes.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Survivor Success Metrics Distribution', fontsize=16, fontweight='bold')

# Placement distribution
axes[0,0].hist(targets['placement'], bins=18, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,0].set_title('Final Placement Distribution')
axes[0,0].set_xlabel('Placement')
axes[0,0].set_ylabel('Count')

# Days lasted
axes[0,1].hist(targets['days_lasted'], bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
axes[0,1].set_title('Days Lasted Distribution')
axes[0,1].set_xlabel('Days')
axes[0,1].set_ylabel('Count')

# Success tier
success_counts = targets['success_tier'].value_counts()
axes[0,2].bar(success_counts.index, success_counts.values, alpha=0.7, color='coral')
axes[0,2].set_title('Success Tier Distribution')
axes[0,2].set_xlabel('Success Tier')
axes[0,2].set_ylabel('Count')
axes[0,2].tick_params(axis='x', rotation=45)

# Binary outcomes
binary_outcomes = ['made_merge', 'made_finale', 'won_game']
for i, outcome in enumerate(binary_outcomes):
    # value_counts() orders by frequency, not by label, so the counts are
    # re-indexed to a fixed [0, 1] order before being paired with the
    # 'No'/'Yes' labels. A class that never occurs gets a zero-height bar
    # instead of breaking bar() with mismatched lengths.
    # Assumes the outcome columns hold 0/1 (or boolean) flags - TODO confirm.
    counts = (
        targets[outcome]
        .dropna()
        .astype(int)
        .value_counts()
        .reindex([0, 1], fill_value=0)
    )
    axes[1,i].bar(['No', 'Yes'], counts.values, alpha=0.7,
                  color=['lightcoral', 'lightblue'])
    axes[1,i].set_title(f'{outcome.replace("_", " ").title()} Distribution')
    axes[1,i].set_ylabel('Count')

plt.tight_layout()
plt.show()

# Report the mean of each outcome column as a percentage
# (presumably 0/1 flags, so the mean is the share of "yes" rows - verify).
print("Success Rates:")
for rate_label, outcome_column in (
    ("Made Merge", 'made_merge'),
    ("Made Finale", 'made_finale'),
    ("Won Game", 'won_game'),
):
    print(f"{rate_label}: {targets[outcome_column].mean():.1%}")

NameError: name 'processor' is not defined

In [None]:
# Scatter each key numeric feature against final placement, overlay a linear
# trend line and show the Pearson correlation in the panel title.
key_features = ['Age', 'Self_Reported_Fitness', 'Alliance_Count', 
                'tribal_win_rate', 'individual_win_rate', 'Pre_Game_Target_Size']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Key Features vs Final Placement', fontsize=16, fontweight='bold')

for i, feature in enumerate(key_features):
    row, col = i // 3, i % 3
    ax = axes[row, col]

    # Keep only rows where both values are present: np.polyfit cannot fit
    # through NaNs, while scatter() and Series.corr() ignore incomplete pairs
    # anyway, so the plotted points and r value are unaffected.
    paired = df_processed[[feature, 'Final_Placement']].dropna()
    x_values = paired[feature]
    y_values = paired['Final_Placement']

    ax.scatter(x_values, y_values, alpha=0.6, color='steelblue')

    # A straight-line fit needs at least two distinct x values; skip the
    # trend line otherwise rather than fitting a degenerate system.
    if x_values.nunique() >= 2:
        z = np.polyfit(x_values, y_values, 1)
        p = np.poly1d(z)
        # Two end points are enough to draw a straight line.
        x_line = np.array([x_values.min(), x_values.max()])
        ax.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2)

    # Correlation
    correlation = x_values.corr(y_values)
    ax.set_title(f'{feature} (r = {correlation:.3f})')
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel('Final Placement')

plt.tight_layout()
plt.show()

In [None]:
# One bar chart per categorical feature: the mean of `made_merge_binary`
# within each category, annotated with the number of rows in that category.
categorical_features = ['Gender', 'Strategic_Archetype', 'Survivor_Knowledge', 'Athletic_Background']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Categorical Features vs Success Rate', fontsize=16, fontweight='bold')

# axes.flat walks the 2x2 grid row by row, the same order in which the
# features are listed.
for ax, feature in zip(axes.flat, categorical_features):
    merge_flag_by_group = df_processed.groupby(feature)['made_merge_binary']

    # Per-category mean and row count; categories with fewer than 3 rows are dropped
    success_by_cat = merge_flag_by_group.agg(['mean', 'count']).reset_index()
    success_by_cat = success_by_cat.loc[success_by_cat['count'] >= 3]

    bars = ax.bar(
        success_by_cat[feature],
        success_by_cat['mean'],
        alpha=0.7,
        color='lightsteelblue',
    )
    ax.set_title(f'{feature} vs Merge Success Rate')
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel('Merge Success Rate')
    ax.tick_params(axis='x', rotation=45)

    # Write the group size just above the top of each bar, centred on it
    for bar, count in zip(bars, success_by_cat['count']):
        label_x = bar.get_x() + bar.get_width()/2.
        label_y = bar.get_height() + 0.01
        ax.text(label_x, label_y, f'n={count}',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

# Pairwise correlations between the processor's numerical features and the
# two outcome columns
numerical_cols = processor.numerical_features + ['Final_Placement', 'Days_Lasted']
correlation_matrix = df_processed[numerical_cols].corr()

# Hide the diagonal and the upper triangle: each pair is then shown only once
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

# Annotated heatmap on its own figure, colour scale centred on zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'shrink': 0.8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Collect every distinct column pair (upper triangle, diagonal excluded)
# whose absolute correlation exceeds 0.7, in row-major order.
matrix_columns = correlation_matrix.columns
n_columns = len(matrix_columns)
high_corr_pairs = [
    (matrix_columns[first], matrix_columns[second],
     correlation_matrix.iloc[first, second])
    for first in range(n_columns)
    for second in range(first + 1, n_columns)
    if abs(correlation_matrix.iloc[first, second]) > 0.7  # High correlation threshold
]

print("Highly correlated feature pairs (|r| > 0.7):")
for left_name, right_name, r_value in high_corr_pairs:
    print(f"{left_name} - {right_name}: {r_value:.3f}")