# 06 - Ensemble & Advanced Models

**Objective:** Push beyond single-model performance with ensemble techniques

**Methods:**
1. Voting Regressor (combine RF, XGB, LightGBM)
2. Stacking Regressor (meta-learner)
3. CatBoost (gradient boosting variant)
4. Weighted Blending (manual ensemble)
5. Neural Network (simple feedforward)

**Target:** Beat Steph's 88.4% R² and the current best from notebook 04

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
import json
import joblib
import time
from datetime import datetime

from sklearn.ensemble import VotingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

import xgboost as xgb
import lightgbm as lgb

# Timestamped banner so the cell output records when this run was started
run_stamp = datetime.now().isoformat()
print(f"Ensemble & Advanced Models - {run_stamp}")

In [None]:
# Locate inputs and saved artefacts relative to the current working directory
ROOT = Path.cwd()
DATA_DIR, MODELS_DIR = ROOT / 'data', ROOT / 'models'

# Feature matrices are kept as DataFrames
X_train, X_test = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('X_train.csv', 'X_test.csv')
)

# Targets: the 'ClosePrice' column of each file, as a NumPy array
y_train, y_test = (
    pd.read_csv(DATA_DIR / csv_name)['ClosePrice'].values
    for csv_name in ('y_train.csv', 'y_test.csv')
)

# Best result recorded by notebook 04; later cells compare against it
prev_best = json.loads((MODELS_DIR / 'advanced_models_summary.json').read_text())

n_train, n_features = X_train.shape
n_test = X_test.shape[0]
print(f"Data: {n_train:,} train, {n_test:,} test")
print(f"Features: {n_features}")
print(f"\nPrevious Best: {prev_best['best_model']} with {prev_best['best_r2']:.4f} R²")
print(f"Target: Beat Steph's 88.4% R²")

In [None]:
# Evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name,
                   steph_r2=0.884, prev_best_r2=None):
    """Fit ``model``, score it on the train and test data, and print a report.

    Args:
        model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Features used to fit the model.
        X_test: Held-out features used for the reported test metrics.
        y_train: Targets matching ``X_train``.
        y_test: Targets matching ``X_test``.
        model_name: Label used in the printed report and in the result dict.
        steph_r2: Baseline test R² to beat; defaults to 0.884.
        prev_best_r2: Best test R² achieved so far. When ``None`` it is read
            from the notebook-level ``prev_best['best_r2']``, and only if the
            model did not beat ``steph_r2``.

    Returns:
        Tuple ``(results, model)`` where ``results`` is a dict with the keys
        ``model``, ``train_r2``, ``test_r2``, ``test_rmse``, ``test_mae``,
        ``test_mdape`` and ``train_time`` (seconds spent in ``fit``), and
        ``model`` is the same estimator object, now fitted.
    """
    start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Median absolute percentage error. Rows whose true value is zero are
    # left out: a percentage error is undefined there and the division would
    # otherwise inject inf/NaN into the median.
    y_true = np.asarray(y_test, dtype=float)
    abs_err = np.abs(y_true - np.asarray(y_pred_test, dtype=float))
    nonzero = y_true != 0
    if nonzero.any():
        test_mdape = np.median(abs_err[nonzero] / np.abs(y_true[nonzero])) * 100
    else:
        test_mdape = np.nan

    results = {
        'model': model_name,
        'train_r2': r2_score(y_train, y_pred_train),
        'test_r2': r2_score(y_test, y_pred_test),
        'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        'test_mae': mean_absolute_error(y_test, y_pred_test),
        'test_mdape': test_mdape,
        'train_time': train_time
    }

    print(f"\n{'='*70}")
    print(f"{model_name}")
    print(f"{'='*70}")
    print(f"Train R²: {results['train_r2']:.4f}")
    print(f"Test R²:  {results['test_r2']:.4f}", end='')

    if results['test_r2'] > steph_r2:
        print(" 🎉 BEATS STEPH!")
    else:
        # Resolve the previous best lazily so the notebook-level record is
        # only needed when the baseline was not beaten.
        if prev_best_r2 is None:
            prev_best_r2 = prev_best['best_r2']
        if results['test_r2'] > prev_best_r2:
            print(f" ⬆️ NEW BEST! (was {prev_best_r2:.4f})")
        else:
            gap = (steph_r2 - results['test_r2']) * 100
            print(f" (Gap to Steph: {gap:.2f}%)")

    print(f"RMSE:     ${results['test_rmse']:,.0f}")
    print(f"MAE:      ${results['test_mae']:,.0f}")
    print(f"MdAPE:    {results['test_mdape']:.2f}%")
    print(f"Time:     {train_time:.1f}s")

    return results, model

# Collects one result dict per model trained in the cells below
ensemble_results = []

## 1. Voting Regressor (Simple Ensemble)

Average predictions from multiple strong models

In [None]:
print("Building Voting Regressor...\n")

# Hyper-parameters for the random forest member
rf_params = dict(
    n_estimators=200, max_depth=25, min_samples_split=5,
    max_features='sqrt', random_state=42, n_jobs=-1,
)

# Settings shared by both gradient-boosting members
boost_params = dict(
    n_estimators=300, learning_rate=0.05, max_depth=7,
    subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1,
)

rf = RandomForestRegressor(**rf_params)
xgb_model = xgb.XGBRegressor(tree_method='hist', **boost_params)
lgb_model = lgb.LGBMRegressor(num_leaves=50, verbose=-1, **boost_params)

# The three members are combined by VotingRegressor. n_jobs stays at 1
# because every member already requests all cores.
voting_members = [('rf', rf), ('xgb', xgb_model), ('lgb', lgb_model)]
voting = VotingRegressor(estimators=voting_members, n_jobs=1)

voting_results, voting_model = evaluate_model(
    voting, X_train, X_test, y_train, y_test,
    "Voting Ensemble (RF + XGB + LGB)",
)
ensemble_results.append(voting_results)

## 2. Stacking Regressor (Meta-Learner)

Train a meta-model to optimally combine base models

In [None]:
print("Building Stacking Regressor...\n")

# Fresh base learners configured exactly like the voting ensemble's members
rf_stack = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_split=5,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

xgb_stack = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist',
    n_jobs=-1,
)

lgb_stack = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=50,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

base_learners = [('rf', rf_stack), ('xgb', xgb_stack), ('lgb', lgb_stack)]

# Ridge combines the base learners' predictions; the meta-features it is
# trained on come from 3-fold cross-validation.
meta_learner = Ridge(alpha=10.0)
stacking = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
    n_jobs=1,
)

stacking_results, stacking_model = evaluate_model(
    stacking, X_train, X_test, y_train, y_test,
    "Stacking Ensemble (Ridge Meta-Learner)",
)
ensemble_results.append(stacking_results)

## 3. CatBoost (Alternative Gradient Boosting)

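CatBoost is another gradient-boosting library. It can handle categorical features natively (no `cat_features` are passed in this notebook, so that capability is not used here) and is often competitive with XGBoost/LightGBM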

In [None]:
# CatBoost is an optional dependency. Only the import is guarded, so an
# ImportError raised later (for example while fitting) is not misreported
# as "CatBoost not installed".
try:
    from catboost import CatBoostRegressor
except ImportError:
    print("CatBoost not installed. Skipping. (Install with: pip install catboost)")
else:
    print("Training CatBoost...\n")

    catboost = CatBoostRegressor(
        iterations=500,
        learning_rate=0.05,
        depth=7,
        l2_leaf_reg=3,
        random_seed=42,
        verbose=False
    )

    catboost_results, catboost_model = evaluate_model(catboost, X_train, X_test, y_train, y_test,
                                                       "CatBoost")
    ensemble_results.append(catboost_results)

## 4. Weighted Blending

Manually weighted ensemble based on individual model performance

In [None]:
print("Building Weighted Blend...\n")

# Only this cell needs out-of-fold predictions, so the import is kept local.
from sklearn.model_selection import cross_val_predict

blend_start = time.time()

# Base learners, configured like the members of the voting ensemble
rf_blend = RandomForestRegressor(
    n_estimators=200, max_depth=25, min_samples_split=5,
    max_features='sqrt', random_state=42, n_jobs=-1
)
xgb_blend = xgb.XGBRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=7,
    subsample=0.8, colsample_bytree=0.8, random_state=42,
    tree_method='hist', n_jobs=-1
)
lgb_blend = lgb.LGBMRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=7,
    num_leaves=50, subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1, verbose=-1
)
blend_members = {'RF': rf_blend, 'XGB': xgb_blend, 'LGB': lgb_blend}

# Score each member on out-of-fold predictions over the training data.
# Deriving the weights from test-set R² would leak test information into the
# blend and make the blend's own test metrics optimistic.
cv_r2 = {}
for label, member in blend_members.items():
    oof_pred = cross_val_predict(member, X_train, y_train, cv=3)
    cv_r2[label] = r2_score(y_train, oof_pred)

# Fit every member on the full training set and predict once per split;
# the cached predictions are reused for both reporting and blending.
train_preds, test_preds = {}, {}
for label, member in blend_members.items():
    member.fit(X_train, y_train)
    train_preds[label] = member.predict(X_train)
    test_preds[label] = member.predict(X_test)
blend_train_time = time.time() - blend_start

print("Out-of-fold R² on train (used for weights):")
print(f"  RF:  {cv_r2['RF']:.4f}")
print(f"  XGB: {cv_r2['XGB']:.4f}")
print(f"  LGB: {cv_r2['LGB']:.4f}")

# Test-set R² of each member, for reporting only (not used for the weights)
rf_r2 = r2_score(y_test, test_preds['RF'])
xgb_r2 = r2_score(y_test, test_preds['XGB'])
lgb_r2 = r2_score(y_test, test_preds['LGB'])

print("\nIndividual R² scores on test (reporting only):")
print(f"  RF:  {rf_r2:.4f}")
print(f"  XGB: {xgb_r2:.4f}")
print(f"  LGB: {lgb_r2:.4f}")

# Weights proportional to out-of-fold R². Negative scores are clipped to
# zero so a poor member cannot receive a negative weight; if no member has a
# positive score, fall back to equal weights instead of dividing by zero.
clipped_r2 = {label: max(score, 0.0) for label, score in cv_r2.items()}
total = sum(clipped_r2.values())
if total > 0:
    blend_weights = {label: score / total for label, score in clipped_r2.items()}
else:
    blend_weights = {label: 1.0 / len(clipped_r2) for label in clipped_r2}

w_rf = blend_weights['RF']
w_xgb = blend_weights['XGB']
w_lgb = blend_weights['LGB']

print(f"\nWeights: RF={w_rf:.3f}, XGB={w_xgb:.3f}, LGB={w_lgb:.3f}")

# Blended predictions
y_pred_blend = sum(blend_weights[label] * test_preds[label] for label in blend_members)
y_pred_blend_train = sum(blend_weights[label] * train_preds[label] for label in blend_members)

blend_r2 = r2_score(y_test, y_pred_blend)
blend_results = {
    'model': 'Weighted Blend',
    'train_r2': r2_score(y_train, y_pred_blend_train),
    'test_r2': blend_r2,
    'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_blend)),
    'test_mae': mean_absolute_error(y_test, y_pred_blend),
    'test_mdape': np.median(np.abs((y_test - y_pred_blend) / y_test)) * 100,
    # Covers the cross-validated weight estimation plus the final fits
    'train_time': blend_train_time
}

print(f"\n{'='*70}")
print(f"Weighted Blend")
print(f"{'='*70}")
print(f"Test R²:  {blend_r2:.4f}", end='')
if blend_r2 > 0.884:
    print(f" 🎉 BEATS STEPH!")
elif blend_r2 > prev_best['best_r2']:
    print(f" ⬆️ NEW BEST!")
else:
    print(f" (Gap: {(0.884 - blend_r2)*100:.2f}%)")

ensemble_results.append(blend_results)

## 5. Neural Network (Simple Feedforward)

Test if deep learning can capture patterns tree models miss

In [None]:
print("Training Neural Network...\n")

# Standardise the inputs; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Three hidden layers (256 -> 128 -> 64) trained with Adam; early stopping
# holds out 10% of the training data and stops after 10 epochs without
# improvement.
mlp_settings = dict(
    hidden_layer_sizes=(256, 128, 64),
    activation='relu',
    solver='adam',
    alpha=0.001,
    batch_size=512,
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=200,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
    verbose=False,
)
mlp = MLPRegressor(**mlp_settings)

mlp_results, mlp_model = evaluate_model(
    mlp, X_train_scaled, X_test_scaled, y_train, y_test,
    "Neural Network (3-layer MLP)",
)
ensemble_results.append(mlp_results)

# Persist the fitted scaler next to the models so the same scaling can be
# reapplied to new data
joblib.dump(scaler, MODELS_DIR / 'scaler_nn.joblib')

## Final Results & Comparison

In [None]:
# Rank every model trained in this notebook by test R², best first
results_df = pd.DataFrame(ensemble_results).sort_values('test_r2', ascending=False)

print("\n" + "="*80)
print("ENSEMBLE MODELS SUMMARY")
print("="*80)
print(results_df.to_string(index=False))

# Best of this notebook; compared with notebook 04's best further down
best_r2 = results_df.iloc[0]['test_r2']
best_model_name = results_df.iloc[0]['model']
steph_r2 = 0.884

print(f"\n{'='*80}")
print("FINAL COMPARISON")
print("="*80)
print(f"\nNotebook 04 Best: {prev_best['best_r2']*100:.2f}% R² ({prev_best['best_model']})")
print(f"Notebook 06 Best: {best_r2*100:.2f}% R² ({best_model_name})")
print(f"Steph Baseline:   {steph_r2*100:.2f}% R² (Random Forest)")

# Determine absolute best (on a tie the notebook 04 model keeps the title)
absolute_best_r2 = max(best_r2, prev_best['best_r2'])
absolute_best_name = best_model_name if best_r2 > prev_best['best_r2'] else prev_best['best_model']

print(f"\n{'='*80}")
print(f"OVERALL BEST MODEL: {absolute_best_name}")
print(f"R²: {absolute_best_r2*100:.2f}%")
print(f"vs Steph: {(absolute_best_r2 - steph_r2)*100:+.2f} percentage points")
print("="*80)

if absolute_best_r2 > steph_r2:
    print(f"\n🎉🎉 SUCCESS! We BEAT Steph by {(absolute_best_r2 - steph_r2)*100:.2f}%! 🎉🎉")
else:
    gap = (steph_r2 - absolute_best_r2) * 100
    print(f"\n⚠️  Still {gap:.2f}% behind Steph")
    print(f"\nNext steps:")
    print("  1. Analyze feature importance from notebook 05")
    print("  2. Engineer more powerful features")
    print("  3. Try deeper neural networks or transformers")
    print("  4. Investigate data quality and outliers")
    print("  5. Consider domain-specific feature engineering")

# Save results
results_df.to_csv(MODELS_DIR / 'ensemble_models_results.csv', index=False)

# Pick the object to persist for this notebook's best model. The chain is
# evaluated lazily on purpose: catboost_model only exists when CatBoost was
# installed and trained.
best_artifact_name = 'best_ensemble_model.joblib'
if best_model_name == 'Voting Ensemble (RF + XGB + LGB)':
    best_ensemble = voting_model
elif best_model_name == 'Stacking Ensemble (Ridge Meta-Learner)':
    best_ensemble = stacking_model
elif 'CatBoost' in best_model_name:
    best_ensemble = catboost_model
elif 'Neural Network' in best_model_name:
    best_ensemble = mlp_model
elif best_model_name == 'Weighted Blend':
    # The blend is not a single estimator, so its fitted members are stored
    # together with the weights needed to recombine their predictions. It
    # goes to its own file because this dict has no predict() method, unlike
    # the estimators written to best_ensemble_model.joblib.
    best_ensemble = {
        'models': {'rf': rf_blend, 'xgb': xgb_blend, 'lgb': lgb_blend},
        'weights': {'rf': float(w_rf), 'xgb': float(w_xgb), 'lgb': float(w_lgb)},
    }
    best_artifact_name = 'best_ensemble_blend.joblib'
else:
    best_ensemble = None

if best_ensemble is not None:
    joblib.dump(best_ensemble, MODELS_DIR / best_artifact_name)
    print(f"\nBest notebook 06 model ({best_model_name}) saved as {best_artifact_name}")
else:
    best_artifact_name = None
    print(f"\n⚠️  No model object known for '{best_model_name}'; no model file was written")

# Save overall summary
summary = {
    'notebook_04_best': prev_best['best_model'],
    'notebook_04_r2': prev_best['best_r2'],
    'notebook_06_best': best_model_name,
    'notebook_06_r2': float(best_r2),
    # File under MODELS_DIR that holds notebook 06's best model, or None
    'notebook_06_artifact': best_artifact_name,
    'overall_best': absolute_best_name,
    'overall_best_r2': float(absolute_best_r2),
    'steph_r2': float(steph_r2),
    'beat_steph': bool(absolute_best_r2 > steph_r2),
    'improvement_over_steph': float(absolute_best_r2 - steph_r2),
    'timestamp': datetime.now().isoformat()
}

with open(MODELS_DIR / 'final_ensemble_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nModels saved to {MODELS_DIR}")
print("\n✅ Ensemble training complete!")