# 04 - Advanced Models & Hyperparameter Tuning

**Objective:** Train and tune advanced tree-based models to beat Steph's 88.4% R²

**Models:**
1. Random Forest (Tuned)
2. Gradient Boosting (Tuned)
3. XGBoost (Tuned)
4. LightGBM (Tuned)

**Current Best:** 83.91% R² (XGBoost basic)

**Input:** `data/X_train.csv`, `data/X_test.csv`, `data/y_train.csv`, `data/y_test.csv`

In [1]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the modelling libraries below.
import warnings
warnings.filterwarnings('ignore')

# Numerics, data frames, and standard-library helpers (paths, JSON, timing).
import numpy as np
import pandas as pd
from pathlib import Path
import json
import joblib
import time
from datetime import datetime

# scikit-learn estimators, metrics, and the randomized hyperparameter search.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Gradient-boosting libraries tuned later in the notebook.
import xgboost as xgb
import lightgbm as lgb

# Print the wall-clock time at which this cell was executed.
print(f"Notebook run: {datetime.now().isoformat()}")

Notebook run: 2025-10-24T09:55:24.208397


In [None]:
# Resolve every location from the current working directory so the notebook
# does not depend on an absolute, machine-specific path.
ROOT = Path.cwd()
DATA_DIR, MODELS_DIR = ROOT / 'data', ROOT / 'models'
MODELS_DIR.mkdir(exist_ok=True)  # no error if the directory is already there

# Feature matrices are kept as DataFrames.
X_train, X_test = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('X_train.csv', 'X_test.csv')
)

# Targets are reduced to the raw values of the 'ClosePrice' column.
y_train, y_test = (
    pd.read_csv(DATA_DIR / csv_name)['ClosePrice'].values
    for csv_name in ('y_train.csv', 'y_test.csv')
)

# Sanitize feature names: LightGBM doesn't accept special JSON characters in column names
import re
def _sanitize_columns(cols):
    return [re.sub(r'[^0-9A-Za-z_]', '_', str(c)) for c in cols]
# Rename both splits with the same rule so their columns stay aligned.
X_train.columns, X_test.columns = (
    _sanitize_columns(X_train.columns),
    _sanitize_columns(X_test.columns),
)

# Quick shape report for the loaded splits.
print(f"Training: {X_train.shape[0]:,} samples, {X_train.shape[1]} features")
print(f"Testing: {X_test.shape[0]:,} samples")

Training: 150,311 samples, 1022 features
Testing: 22,759 samples

Target: Beat Steph's 88.4% R²


In [None]:
# Evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name,
                   baseline_r2=0.884):
    """Fit ``model``, score it on the train and test splits, and print a report.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted by this call.
        X_train, X_test: Feature matrices for the two splits.
        y_train, y_test: Target values matching the feature matrices.
        model_name: Label used in the printed report and the results dict.
        baseline_r2: Test R² to compare against. Defaults to 0.884, the
            baseline this notebook is trying to beat.

    Returns:
        ``(results, model)``. ``results`` is a dict with keys ``model``,
        ``train_r2``, ``test_r2``, ``test_rmse``, ``test_mae``,
        ``test_mdape`` (in percent) and ``train_time`` (seconds spent in
        ``fit``); ``model`` is the same estimator object, now fitted.
    """
    start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # MdAPE divides by the true value, so zero targets would produce inf/nan
    # (silently, when warnings are suppressed). Score only non-zero targets
    # and report NaN when there are none.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_hat = np.asarray(y_pred_test, dtype=float).ravel()
    nonzero = y_true != 0
    if nonzero.any():
        test_mdape = np.median(
            np.abs((y_true[nonzero] - y_hat[nonzero]) / y_true[nonzero])
        ) * 100
    else:
        test_mdape = float('nan')

    results = {
        'model': model_name,
        'train_r2': r2_score(y_train, y_pred_train),
        'test_r2': r2_score(y_test, y_pred_test),
        'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        'test_mae': mean_absolute_error(y_test, y_pred_test),
        'test_mdape': test_mdape,
        'train_time': train_time
    }

    print(f"\n{'='*70}")
    print(f"{model_name}")
    print(f"{'='*70}")
    print(f"Train R²: {results['train_r2']:.4f}")
    # Build the status string separately to avoid nested f-string quoting.
    if results['test_r2'] > baseline_r2:
        status = '🎉 BEATS STEPH!'
    else:
        status = f"(Gap: {(baseline_r2 - results['test_r2']) * 100:.2f}% to Steph)"
    print(f"Test R²:  {results['test_r2']:.4f} {status}")
    print(f"RMSE:     ${results['test_rmse']:,.0f}")
    print(f"MAE:      ${results['test_mae']:,.0f}")
    print(f"MdAPE:    {results['test_mdape']:.2f}%")
    print(f"Time:     {train_time:.1f}s")

    return results, model

# Collects one metrics dict per tuned model; each tuning cell appends to it
# and the summary cell turns it into a DataFrame.
advanced_results = []

## 1. Random Forest (Hyperparameter Tuning)

Steph's best model was Random Forest with 88.4% R². Let's tune it aggressively.

In [None]:
print("Tuning Random Forest (this will take several minutes)...\n")

# Candidate values the randomized search samples from.
rf_param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[15, 20, 25, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', 0.3, 0.5],
    bootstrap=[True, False],
)

# NOTE(review): the forest's n_jobs=-1 is nested inside the search's n_jobs=4,
# which may oversubscribe CPU cores - confirm on the target machine.
rf_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=42),
    param_distributions=rf_param_dist,
    n_iter=20,        # 20 sampled configurations
    cv=3,             # 3-fold CV (chosen to save memory)
    scoring='r2',
    n_jobs=4,         # cap on concurrent CV fits
    random_state=42,
    verbose=2,
)
rf_search.fit(X_train, y_train)

print(f"\nBest Random Forest params: {rf_search.best_params_}")
print(f"Best CV R²: {rf_search.best_score_:.4f}")

# evaluate_model fits the estimator again on the training split, times that
# fit, and scores both splits.
rf_results, rf_model = evaluate_model(
    model=rf_search.best_estimator_,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest (Tuned)",
)
advanced_results.append(rf_results)

Tuning Random Forest (this will take several minutes)...

Fitting 3 folds for each of 20 candidates, totalling 60 fits


## 2. Gradient Boosting (Hyperparameter Tuning)

In [None]:
print("Tuning Gradient Boosting...\n")

# Candidate values the randomized search samples from.
gb_param_dist = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 5, 7, 9],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.7, 0.8, 0.9, 1.0],
    max_features=['sqrt', 'log2', 0.3, 0.5],
)

gb_search = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_distributions=gb_param_dist,
    n_iter=20,        # 20 sampled configurations
    cv=3,             # 3-fold cross-validation
    scoring='r2',
    n_jobs=4,         # cap on concurrent CV fits
    random_state=42,
    verbose=2,
)
gb_search.fit(X_train, y_train)

print(f"\nBest Gradient Boosting params: {gb_search.best_params_}")
print(f"Best CV R²: {gb_search.best_score_:.4f}")

# evaluate_model fits the estimator again on the training split, times that
# fit, and scores both splits.
gb_results, gb_model = evaluate_model(
    model=gb_search.best_estimator_,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Gradient Boosting (Tuned)",
)
advanced_results.append(gb_results)

## 3. XGBoost (Aggressive Hyperparameter Tuning)

Current best: 83.91% R². Let's push it higher!

In [None]:
print("Tuning XGBoost (current best: 83.91%)...\n")

# Candidate values the randomized search samples from.
xgb_param_dist = dict(
    n_estimators=[200, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[5, 7, 9, 11],
    min_child_weight=[1, 3, 5],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    gamma=[0, 0.1, 0.2],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[1, 1.5, 2],
)

# NOTE(review): the booster's n_jobs=-1 is nested inside the search's n_jobs=4;
# the XGBoost docs warn that this kind of thread contention can slow both down -
# confirm on the target machine.
xgb_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(tree_method='hist', n_jobs=-1, random_state=42),
    param_distributions=xgb_param_dist,
    n_iter=25,        # more sampled configurations than the sklearn models
    cv=3,             # 3-fold cross-validation
    scoring='r2',
    n_jobs=4,         # cap on concurrent CV fits
    random_state=42,
    verbose=2,
)
xgb_search.fit(X_train, y_train)

print(f"\nBest XGBoost params: {xgb_search.best_params_}")
print(f"Best CV R²: {xgb_search.best_score_:.4f}")

# evaluate_model fits the estimator again on the training split, times that
# fit, and scores both splits.
xgb_results, xgb_model = evaluate_model(
    model=xgb_search.best_estimator_,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="XGBoost (Tuned)",
)
advanced_results.append(xgb_results)

## 4. LightGBM (Fast Gradient Boosting)

LightGBM typically trains faster than XGBoost and often reaches comparable or better accuracy.

In [None]:
print("Tuning LightGBM...\n")

# Candidate values the randomized search samples from.
lgb_param_dist = {
    'n_estimators': [200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 7, 9, -1],
    'num_leaves': [31, 50, 100],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

lgb_search = RandomizedSearchCV(
    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is non-zero; with the default of 0 the
    # `subsample` values searched above would have no effect. Re-sample rows
    # at every iteration so that dimension of the search is real.
    lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1, subsample_freq=1),
    param_distributions=lgb_param_dist,
    n_iter=25,   # 25 sampled configurations
    cv=3,        # 3-fold cross-validation
    scoring='r2',
    random_state=42,
    verbose=2,
    n_jobs=4     # cap on concurrent CV fits
)

lgb_search.fit(X_train, y_train)

print(f"\nBest LightGBM params: {lgb_search.best_params_}")
print(f"Best CV R²: {lgb_search.best_score_:.4f}")

# evaluate_model fits the estimator again on the training split, times that
# fit, and scores both splits.
lgb_results, lgb_model = evaluate_model(lgb_search.best_estimator_, X_train, X_test,
                                        y_train, y_test, "LightGBM (Tuned)")
advanced_results.append(lgb_results)

## Results Summary & Comparison to Steph

In [None]:
# Fail early with a clear message; otherwise an empty list yields a DataFrame
# without a 'test_r2' column and sort_values raises a confusing KeyError.
if not advanced_results:
    raise RuntimeError("No model results collected - run the tuning cells first.")

# Re-running a tuning cell appends another row for the same model. Keep only
# the most recent row per model so the ranking and the saved CSV hold one
# entry each, then rank by test R² (best first).
results_df = (
    pd.DataFrame(advanced_results)
    .drop_duplicates(subset='model', keep='last')
    .sort_values('test_r2', ascending=False)
)

print("\n" + "="*80)
print("ADVANCED MODELS SUMMARY")
print("="*80)
print(results_df.to_string(index=False))

# Compare to Steph
best_r2 = results_df.iloc[0]['test_r2']
steph_r2 = 0.884

print(f"\n{'='*80}")
print("COMPARISON TO STEPH'S BASELINE")
print("="*80)
print(f"Our Best:    {best_r2*100:.2f}% R² ({results_df.iloc[0]['model']})")
print(f"Steph Best:  {steph_r2*100:.2f}% R² (Random Forest)")
print(f"Difference:  {(best_r2 - steph_r2)*100:+.2f} percentage points")

if best_r2 > steph_r2:
    print(f"\n🎉 SUCCESS! We BEAT Steph's baseline by {(best_r2 - steph_r2)*100:.2f}%!")
else:
    print(f"\n⚠️  Still {(steph_r2 - best_r2)*100:.2f}% behind Steph. Consider:")
    print("   - Ensemble methods (stacking, blending)")
    print("   - Feature engineering improvements")
    print("   - More aggressive hyperparameter tuning")

# Save results
results_df.to_csv(MODELS_DIR / 'advanced_models_results.csv', index=False)

# Save best model. The branches are evaluated lazily so only the winning
# model's variable has to exist (a skipped tuning cell does not break this).
best_model_name = results_df.iloc[0]['model']
if 'Random Forest' in best_model_name:
    best_model = rf_model
elif 'Gradient Boosting' in best_model_name:
    best_model = gb_model
elif 'XGBoost' in best_model_name:
    best_model = xgb_model
elif 'LightGBM' in best_model_name:
    best_model = lgb_model
else:
    # Refuse to guess: saving an arbitrary model under the winner's name
    # would make the artifact disagree with the summary written below.
    raise ValueError(f"No fitted model registered for {best_model_name!r}")

joblib.dump(best_model, MODELS_DIR / 'best_advanced_model.joblib')

# Save summary JSON
summary = {
    'best_model': best_model_name,
    'best_r2': float(best_r2),
    'steph_r2': float(steph_r2),
    'improvement': float(best_r2 - steph_r2),
    'timestamp': datetime.now().isoformat(),
    'n_features': X_train.shape[1],
    'n_train_samples': X_train.shape[0],
    'n_test_samples': X_test.shape[0]
}

# Explicit encoding keeps the file identical across platforms.
with open(MODELS_DIR / 'advanced_models_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\nModels saved to {MODELS_DIR}")
print("\n✅ Advanced models training complete!")