# 04.5 - Dedicated XGBoost Tuning

This notebook isolates XGBoost tuning. It contains the original XGBoost tuning cell (kept as a reference) and an improved, safer tuning cell designed to avoid timeouts and failures while still searching a useful region of the hyperparameter space.

Run this on an interactive node (Amarel) with a conda environment that has papermill/jupyter installed. Saved artifacts are written to `models/`.

In [None]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries imported below — consider narrowing it when debugging version issues.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
import json
import joblib
import time
from datetime import datetime

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

import xgboost as xgb

# Stamp the cell output with the wall-clock time of this run.
print(f"Notebook run: {datetime.now().isoformat()}")

In [None]:
# Resolve every path from the current working directory so the notebook stays portable
ROOT = Path.cwd()
DATA_DIR, MODELS_DIR = ROOT / 'data', ROOT / 'models'
MODELS_DIR.mkdir(exist_ok=True)

# Feature matrices first, then the 'ClosePrice' target as plain NumPy arrays
X_train, X_test = (
    pd.read_csv(DATA_DIR / csv_name) for csv_name in ('X_train.csv', 'X_test.csv')
)
y_train, y_test = (
    pd.read_csv(DATA_DIR / csv_name)['ClosePrice'].values
    for csv_name in ('y_train.csv', 'y_test.csv')
)

# Sanitize column names for tree-based libraries (LightGBM/XGBoost)
import re
def _clean_col(c):
    """Return ``str(c)`` with every character outside ``[0-9a-zA-Z_]`` replaced by ``_``."""
    disallowed = re.compile(r'[^0-9a-zA-Z_]')
    label = str(c)
    return disallowed.sub('_', label)

# Compare the raw training column labels with their sanitized form; rename only on a difference
orig_cols = list(X_train.columns)
new_cols = list(map(_clean_col, orig_cols))
if orig_cols != new_cols:
    print('Sanitizing feature names: replacing special characters with underscores')
    # The test frame is renamed from its own labels, using the same rule
    X_test.columns = list(map(_clean_col, X_test.columns))
    X_train.columns = new_cols

# Report dataset sizes
print(f"Training: {X_train.shape[0]:,} samples, {X_train.shape[1]} features")
print(f"Testing: {X_test.shape[0]:,} samples")

In [None]:
# Evaluation function (copied from main notebook)
import time
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name, baseline_r2=0.884):
    """Fit ``model`` on the training data, score it, print a report, and return the metrics.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted inside this function on ``(X_train, y_train)``.
    X_train, X_test : feature matrices accepted by ``model``
    y_train, y_test : array-like regression targets
    model_name : str
        Label stored in the results and used as the report heading.
    baseline_r2 : float, default 0.884
        Reference test R² the report compares against (labelled "Steph" in the output).

    Returns
    -------
    tuple(dict, model)
        The metrics dict (keys: ``model``, ``train_r2``, ``test_r2``, ``test_rmse``,
        ``test_mae``, ``test_mdape``, ``train_time``) and the fitted model.
    """
    # perf_counter is monotonic, so the duration cannot be skewed by clock adjustments
    start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # MdAPE is undefined where the true value is 0; skip those rows instead of letting
    # inf/NaN ratios leak into the median.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_hat = np.asarray(y_pred_test, dtype=float).ravel()
    nonzero = y_true != 0
    if nonzero.any():
        test_mdape = np.median(np.abs((y_true[nonzero] - y_hat[nonzero]) / y_true[nonzero])) * 100
    else:
        test_mdape = float('nan')

    results = {
        'model': model_name,
        'train_r2': r2_score(y_train, y_pred_train),
        'test_r2': r2_score(y_test, y_pred_test),
        'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        'test_mae': mean_absolute_error(y_test, y_pred_test),
        'test_mdape': test_mdape,
        'train_time': train_time
    }

    print(f"\n{'='*70}")
    print(f"{model_name}")
    print(f"{'='*70}")
    print(f"Train R²: {results['train_r2']:.4f}")
    if results['test_r2'] > baseline_r2:
        status = '🎉 BEATS STEPH!'
    else:
        # Gap is expressed in R² percentage points
        status = f"(Gap: {(baseline_r2 - results['test_r2']) * 100:.2f}% to Steph)"
    print(f"Test R²:  {results['test_r2']:.4f} {status}")
    print(f"RMSE:     ${results['test_rmse']:,.0f}")
    print(f"MAE:      ${results['test_mae']:,.0f}")
    print(f"MdAPE:    {results['test_mdape']:.2f}%")
    print(f"Time:     {train_time:.1f}s")

    return results, model

# Start with an empty collection of per-run metric dicts and no fitted model
advanced_results = list()
xgb_model = None

## Original XGBoost tuning cell (reference)

The cell below is copied verbatim from `04_advanced_models_tuning.ipynb` so you can compare behavior and parameters.

In [None]:
# Reference cell: randomized hyperparameter search for XGBoost with two fallbacks
# (a smaller search, then a fixed default model), followed by a retrain with early
# stopping and a final evaluation through evaluate_model(). Results are appended to
# `advanced_results`; any uncaught failure records a sentinel row instead.
print("Tuning XGBoost (current best: 83.91%)...\n")

xgb_model = None
try:
    from sklearn.model_selection import train_test_split
    import traceback
    import gc
    import numpy as _np

    # Discrete candidate values; RandomizedSearchCV samples combinations from these lists.
    xgb_param_dist = {
        'n_estimators': [200, 300, 500],
        'learning_rate': [0.01, 0.03, 0.05, 0.1],
        'max_depth': [5, 7, 9, 11],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'gamma': [0, 0.1, 0.2],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0.5, 1, 1.5, 2]
    }

    # Set estimator n_jobs=1 to avoid nested threading with RandomizedSearchCV outer parallelism
    xgb_search = RandomizedSearchCV(
        xgb.XGBRegressor(random_state=42, tree_method='hist', n_jobs=1, use_label_encoder=False, verbosity=0),
        param_distributions=xgb_param_dist,
        n_iter=80,  # keep or reduce for faster dev runs
        cv=3,  # keep 3 for now; switch to 5 for final runs
        scoring='r2',
        random_state=42,
        verbose=2,
        n_jobs=4,  # outer parallelism
        error_score=float('nan')
    )

    # Run the search WITHOUT passing eval_set / early stopping (to avoid leakage & CV weirdness)
    # A failure here is printed but not re-raised; the checks below detect the missing result.
    try:
        xgb_search.fit(X_train, y_train)
    except Exception as fit_exc:
        print('RandomizedSearchCV.fit raised an exception during XGBoost search:')
        traceback.print_exc()
    
    # If the search succeeded but produced no valid best estimator, fall back to a smaller, safer search
    best = None
    # getattr default covers the case where fit raised and best_score_ was never set
    best_score = getattr(xgb_search, 'best_score_', _np.nan)
    if hasattr(xgb_search, 'best_estimator_') and xgb_search.best_estimator_ is not None and not _np.isnan(best_score):
        best = xgb_search.best_estimator_
        print(f"\nBest XGBoost params: {xgb_search.best_params_}")
        print(f"Best CV R²: {xgb_search.best_score_:.4f}")
    else:
        print('\nXGBoost search did not return a valid best estimator (NaN or exception). Trying a small fallback search...')
        # small, conservative fallback grid to avoid rare failure cases or bad param combos
        try:
            # Same keys as the primary grid minus 'gamma', with fewer values per key
            fallback_dist = {
                'n_estimators': [200, 300],
                'learning_rate': [0.03, 0.05],
                'max_depth': [5, 7],
                'min_child_weight': [1, 3],
                'subsample': [0.8, 0.9],
                'colsample_bytree': [0.8, 0.9],
                'reg_alpha': [0, 0.1],
                'reg_lambda': [0.5, 1]
            }
            fallback_search = RandomizedSearchCV(
                xgb.XGBRegressor(random_state=42, tree_method='hist', n_jobs=1, use_label_encoder=False, verbosity=0),
                param_distributions=fallback_dist,
                n_iter=10,
                cv=3,
                scoring='r2',
                random_state=42,
                verbose=2,
                n_jobs=2,
                error_score=float('nan')
            )
            fallback_search.fit(X_train, y_train)
            if hasattr(fallback_search, 'best_estimator_') and fallback_search.best_estimator_ is not None and not _np.isnan(getattr(fallback_search, 'best_score_', _np.nan)):
                best = fallback_search.best_estimator_
                print(f"Fallback best params: {fallback_search.best_params_}")
                print(f"Fallback CV R²: {fallback_search.best_score_:.4f}")
            else:
                print('Fallback search also failed or returned NaN. Will train a safe default XGBoost model.')
        except Exception as fe:
            print('Fallback search failed with exception:')
            traceback.print_exc()
        
    if best is None:
        # Final fallback: train a conservative default XGBoost model to keep pipeline moving
        # Note: unlike the two searches above, this estimator does not set tree_method='hist'.
        try:
            print('\nTraining default XGBoost (conservative settings) as last-resort fallback')
            best = xgb.XGBRegressor(random_state=42, n_jobs=1, use_label_encoder=False, verbosity=0,
                                     n_estimators=200, learning_rate=0.05, max_depth=7)
            best.fit(X_train, y_train)
        except Exception as final_exc:
            print('Final fallback training also failed:')
            traceback.print_exc()
            raise final_exc

    # Retrain best estimator with early stopping using a small validation split from the training set
    # NOTE(review): evaluate_model() below calls model.fit(X_train, y_train) again, so whatever
    # is fitted in this section is replaced before scoring.
    # NOTE(review): passing early_stopping_rounds to fit() appears to be version-dependent in
    # XGBoost; if the installed release rejects it, the except branch below runs — confirm.
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)
    try:
        best.set_params(n_jobs=1, use_label_encoder=False, verbosity=0)
        best.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
    except Exception as retrain_exc:
        print('Retraining best estimator with early stopping failed; proceeding without early stopping for final fit:')
        traceback.print_exc()
        try:
            best.fit(X_train, y_train)
        except Exception as final_fit_exc:
            print('Final best.fit failed:')
            traceback.print_exc()
            raise final_fit_exc

    # Fit on the full training set, score on the test set, and record the metrics
    xgb_results, xgb_model = evaluate_model(best, X_train, X_test, y_train, y_test, "XGBoost (Tuned)")
    advanced_results.append(xgb_results)

    # cleanup
    # Only the primary search object is released; fallback_search (if created) is left in place.
    del xgb_search
    gc.collect()

except Exception as e:
    # Anything that escaped the inner handlers ends up here; a sentinel row (test_r2 = -999)
    # is recorded so the run still leaves an entry in advanced_results.
    print('XGBoost tuning failed (outer):', e)
    import traceback
    traceback.print_exc()
    advanced_results.append({'model': 'XGBoost (failed)', 'test_r2': -999})

## Improved XGBoost tuning cell (safer defaults)

This cell uses a narrower search, fewer iterations, and robust fallbacks. `N_ITER` is configurable — reduce it during development to save time.

In [None]:
# Improved XGBoost tuning (safer)
#
# Flow: small randomized search -> smaller fallback search -> fixed default estimator,
# then an early-stopping probe picks the tree count, and evaluate_model() performs the
# final fit on the full training set before the model and results are saved.
print("Running improved XGBoost tuning...\n")
import gc
from sklearn.model_selection import train_test_split
import traceback

# Config: keep tuning tiny to avoid long runs / kernel problems on Amarel
# Reduce N_ITER aggressively for a fast, safe run. Increase only on an interactive node when needed.
N_ITER = 8  # << small, safe default for quick runs (use 30+ only on an interactive node)
CV_FOLDS = 3  # keep small to save time; increase to 5 for final runs
EARLY_STOPPING_ROUNDS = 30  # patience (boosting rounds) for the tree-count probe below

xgb_model = None
try:
    param_dist_safe = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.03, 0.05],
        'max_depth': [5, 7, 9],
        'min_child_weight': [1, 3],
        'subsample': [0.7, 0.85, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [0.5, 1.0]
    }

    # Estimator: force single-threaded training to avoid nested threading on cluster nodes
    estimator = xgb.XGBRegressor(random_state=42, tree_method='hist', n_jobs=1, use_label_encoder=False, verbosity=0)
    search = RandomizedSearchCV(
        estimator,
        param_distributions=param_dist_safe,
        n_iter=N_ITER,
        cv=CV_FOLDS,
        scoring='r2',
        random_state=42,
        verbose=0,  # mute noisy output in cluster runs
        n_jobs=1,  # avoid outer parallelism to prevent kernel crashes on shared nodes
        error_score=float('nan')
    )

    # A failed search is reported but not fatal; the fallbacks below take over.
    try:
        search.fit(X_train, y_train)
    except Exception:
        print('Search.fit raised an exception:')
        traceback.print_exc()

    best = None
    # best_score_ / best_estimator_ are missing when fit raised, hence the getattr defaults
    best_score = getattr(search, 'best_score_', np.nan)
    if getattr(search, 'best_estimator_', None) is not None and not np.isnan(best_score):
        best = search.best_estimator_
        print(f"Found best params: {search.best_params_}")
        print(f"Best CV R²: {search.best_score_:.4f}")
    else:
        print('Primary search did not return a usable best estimator (NaN or exception). Trying a small fallback search...')
        try:
            fallback = RandomizedSearchCV(
                estimator,
                param_distributions={
                    'n_estimators': [100, 200],
                    'learning_rate': [0.03, 0.05],
                    'max_depth': [5, 7],
                    'min_child_weight': [1, 3],
                    'subsample': [0.8, 0.9],
                    'colsample_bytree': [0.8, 0.9],
                },
                n_iter=6, cv=CV_FOLDS, scoring='r2', random_state=42, verbose=0, n_jobs=1, error_score=float('nan')
            )
            fallback.fit(X_train, y_train)
            fallback_score = getattr(fallback, 'best_score_', np.nan)
            if getattr(fallback, 'best_estimator_', None) is not None and not np.isnan(fallback_score):
                best = fallback.best_estimator_
                print(f"Fallback best params: {fallback.best_params_}")
                print(f"Fallback CV R²: {fallback.best_score_:.4f}")
            else:
                print('Fallback search also failed or returned NaN. Training a conservative default XGBoost model.')
        except Exception:
            print('Fallback failed:')
            traceback.print_exc()

    if best is None:
        print('Training conservative default XGBoost as final fallback')
        # Not fitted here: evaluate_model() below fits the estimator on the full training set.
        best = xgb.XGBRegressor(random_state=42, n_jobs=1, use_label_encoder=False, verbosity=0, n_estimators=150, learning_rate=0.05, max_depth=7)

    best.set_params(n_jobs=1, verbosity=0)

    # Early stopping on a small validation split.
    # evaluate_model() refits `best` on all of X_train, so fitting `best` itself with early
    # stopping would be thrown away. Instead, a copy is fitted with early stopping and only
    # the selected number of boosting rounds is carried over to `best`.
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)
    try:
        probe = xgb.XGBRegressor(**best.get_params())
        fit_kwargs = {'eval_set': [(X_val, y_val)], 'verbose': False}
        if 'early_stopping_rounds' in probe.get_params():
            # Releases that expose early stopping as an estimator parameter
            probe.set_params(early_stopping_rounds=EARLY_STOPPING_ROUNDS)
        else:
            # Older releases only accept it as a fit() keyword
            fit_kwargs['early_stopping_rounds'] = EARLY_STOPPING_ROUNDS
        probe.fit(X_tr, y_tr, **fit_kwargs)

        best_iteration = getattr(probe, 'best_iteration', None)  # 0-based index of the best round
        if best_iteration is not None:
            tuned_rounds = int(best_iteration) + 1
            print(f"Early stopping selected {tuned_rounds} of {probe.get_params()['n_estimators']} boosting rounds")
            best.set_params(n_estimators=tuned_rounds)
        del probe
    except Exception:
        # Best effort: keep the searched n_estimators and let the final fit proceed.
        print('Retrain with early stopping failed, falling back to full-train:')
        traceback.print_exc()

    # Evaluate and save (evaluate_model performs the final fit on the full training set)
    xgb_results, xgb_model = evaluate_model(best, X_train, X_test, y_train, y_test, 'XGBoost (Improved Tuned)')
    advanced_results.append(xgb_results)

    # Save model and results
    joblib.dump(xgb_model, MODELS_DIR / 'best_xgb_tuned.joblib')
    pd.DataFrame(advanced_results).to_csv(MODELS_DIR / 'xgb_tuning_results.csv', index=False)

    # cleanup
    del search
    gc.collect()
except Exception:
    # Record a sentinel row so the run still leaves an entry in advanced_results.
    print('Improved XGBoost tuning cell failed:')
    traceback.print_exc()
    advanced_results.append({'model': 'XGBoost (failed)', 'test_r2': -999})

In [None]:
# Print every recorded run, best test R² first; say so explicitly when nothing was recorded
if not len(advanced_results) > 0:
    print('No results recorded in advanced_results')
else:
    results_table = pd.DataFrame(advanced_results)
    df = results_table.sort_values('test_r2', ascending=False)
    print('\nXGBoost tuning summary:')
    print(df.to_string(index=False))
    print('Models saved to', MODELS_DIR)