XGBoost for Regression

In [None]:
%pip install xgboost

In [3]:
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# Create synthetic regression data
X, y = make_regression(n_samples=1000, n_features=208, noise=0.2, random_state=42)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train regressor
regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
regressor.fit(X_train, y_train)

# Predict and evaluate
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error: {rmse:.4f}")

Mean Squared Error: 5263.2106
Root Mean Squared Error: 72.5480


XGBoost with Hyperparametric Tuning: 
RandomizedSearchCV to narrow the region, then GridSearchCV in that narrowed region for fine-tuning

In [4]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import joblib
from scipy.stats import uniform, randint

# RandomizedSearchCV: Broad search
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

random_search = RandomizedSearchCV(
    xgb_reg, param_distributions=param_dist,
    n_iter=20, scoring='neg_mean_squared_error', cv=3, verbose=1, random_state=42
)
random_search.fit(X_train, y_train)

print("Best params from RandomizedSearchCV:", random_search.best_params_)

# GridSearchCV: Fine-tune in narrowed region
narrowed_params = {
    'n_estimators': [random_search.best_params_['n_estimators'] - 20, random_search.best_params_['n_estimators'], random_search.best_params_['n_estimators'] + 20],
    'learning_rate': [random_search.best_params_['learning_rate'] * f for f in [0.8, 1.0, 1.2]],
    'max_depth': [max(1, random_search.best_params_['max_depth'] - 1), random_search.best_params_['max_depth'], random_search.best_params_['max_depth'] + 1],
}

grid_search = GridSearchCV(
    xgb_reg, param_grid=narrowed_params,
    scoring='neg_mean_squared_error', cv=3, verbose=1
)
grid_search.fit(X_train, y_train)

print("Best params from GridSearchCV:", grid_search.best_params_)

# Evaluate on test data
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Final Test MSE: {mse:.4f}")
print(f"Final Test RMSE: {np.sqrt(mse):.4f}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params from RandomizedSearchCV: {'colsample_bytree': 0.5370223258670452, 'learning_rate': 0.11753971856328177, 'max_depth': 3, 'n_estimators': 77, 'subsample': 0.9315517129377968}
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best params from GridSearchCV: {'learning_rate': 0.14104766227593812, 'max_depth': 2, 'n_estimators': 97}
Final Test MSE: 2098.7017
Final Test RMSE: 45.8116


Incase of High Dimensional data: Using PCA to reduce dimensions

In [5]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline: StandardScaler -> PCA -> XGBoost
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),  # Reduce dimensions to prevent overfitting
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])

# Grid Search on selected hyperparameters
param_grid = {
    'pca__n_components': [5, 10, 15],  # Ensure n_components <= min(n_samples, n_features)
    'xgb__n_estimators': [50, 286],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.15],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Best Model Evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {np.sqrt(mse):.4f}")
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Test MSE: 28025.3628
Test RMSE: 167.4078
Best Parameters: {'pca__n_components': 15, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 286}


Using Support Vectors

In [6]:
from sklearn.svm import SVR

# Build the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),
    ('svr', SVR())
])

# Define hyperparameters to tune
param_grid = {
    'pca__n_components': [30, 50, 80],
    'svr__C': [0.1, 1, 10],
    'svr__epsilon': [0.01, 0.1, 0.5],
    'svr__kernel': ['rbf']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"SVR Test MSE: {mse:.4f}")
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
SVR Test MSE: 26973.6954
Best Parameters: {'pca__n_components': 30, 'svr__C': 10, 'svr__epsilon': 0.01, 'svr__kernel': 'rbf'}


Using ElasticNet

In [9]:
from sklearn.linear_model import ElasticNet

# Build the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),
    ('elastic', ElasticNet(max_iter=10000))
])

# Define hyperparameters to tune
param_grid = {
    'pca__n_components': [30, 50, 80],
    'elastic__alpha': [0.01, 0.1, 1.0],
    'elastic__l1_ratio': [0.2, 0.5, 0.8],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"ElasticNet Test MSE: {mse:.4f}")
print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
ElasticNet Test MSE: 18678.2579
Best Parameters: {'elastic__alpha': 0.01, 'elastic__l1_ratio': 0.2, 'pca__n_components': 80}


Save the model

In [7]:
import joblib

# Save the model
joblib.dump(grid_search.best_estimator_, "xgb_regression_model.pkl")
print("Model saved to xgb_regression_model.pkl")

Model saved to xgb_regression_model.pkl


KFold for Class Imbalance

In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold

# Base model for feature selection
base_selector_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42
)

# Define pipeline: Standardization -> Feature Selection -> XGBoost
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_select', SelectFromModel(base_selector_model, threshold="median")),
    ('xgb', xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42
    ))
])

# K-Fold Cross-validation
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter tuning grid
param_grid = {
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.05, 0.1]
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=kfold_cv, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {np.sqrt(mse):.4f}")
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Test MSE: 2140.5240
Test RMSE: 46.2658
Best Parameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 150}


SMOTE for Class Imbalance

In [11]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
import pandas as pd

# Bin the target variable to simulate stratification and enable SMOTE
bins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
y_binned = bins.fit_transform(y.reshape(-1, 1)).ravel()

# Train/test split
X_train, X_test, y_train, y_test, y_binned_train, _ = train_test_split(
    X, y, y_binned, test_size=0.2, stratify=y_binned, random_state=42
)

# Apply SMOTE on binned targets
sm = SMOTE(random_state=42)
X_resampled, y_binned_resampled = sm.fit_resample(X_train, y_binned_train)

# Recover original regression targets after SMOTE using index mapping
# Map from binned label back to original y
# Note: Here we average y_train within each bin to approximate back
bin_means = pd.Series(y_train).groupby(y_binned_train).mean()
y_resampled = [bin_means[int(lbl)] for lbl in y_binned_resampled]
y_resampled = np.array(y_resampled)

# Pipeline with Feature Selection and XGBoost Regressor
base_selector_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_select', SelectFromModel(base_selector_model, threshold="median")),
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])

# KFold Cross-Validation (not stratified because regression, but 5-fold)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Grid Search
param_grid = {
    'xgb__n_estimators': [100, 150],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.05, 0.1],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_resampled, y_resampled)

# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {np.sqrt(mse):.4f}")
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Test MSE: 2626.3936
Test RMSE: 51.2484
Best Parameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 150}


Use the model in future

In [8]:
import joblib
from sklearn.metrics import mean_squared_error

# Load model
model = joblib.load("xgb_regression_model.pkl")

# Load data
# X, y

# Predict
preds = model.predict(X)

# Evaluate
mse = mean_squared_error(y, preds)
print(f"MSE from loaded model: {mse:.4f}")
print(f"RMSE from loaded model: {np.sqrt(mse):.4f}")

MSE from loaded model: 26048.7442
RMSE from loaded model: 161.3962
