XGBoost for Regression

In [None]:
# Install XGBoost from within the notebook; the %pip magic (rather than !pip)
# targets the environment of the running kernel.
%pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# Build a seeded synthetic regression problem: 12,000 samples x 208 features.
X, y = make_regression(
    n_samples=12000,
    n_features=208,
    noise=0.2,
    random_state=42,
)

# Keep 20% of the rows aside as a test set (seeded, so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: 100 boosted trees, learning rate 0.1, squared-error objective.
regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
)
regressor.fit(X_train, y_train)

# Score the baseline on the held-out rows, then report both metrics.
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")

Mean Squared Error: 513.5613
Root Mean Squared Error: 22.6619


XGBoost with Hyperparameter Tuning:
Use RandomizedSearchCV to narrow the search region, then GridSearchCV within that narrowed region for fine-tuning.

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import joblib
from scipy.stats import uniform, randint

# Stage 1 - RandomizedSearchCV: broad, seeded sweep over the hyperparameter space.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# scipy.stats conventions: uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from [low, high).
param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

random_search = RandomizedSearchCV(
    xgb_reg, param_distributions=param_dist,
    n_iter=20, scoring='neg_mean_squared_error', cv=3, verbose=1, random_state=42
)
random_search.fit(X_train, y_train)

best_random = random_search.best_params_
print("Best params from RandomizedSearchCV:", best_random)

# Stage 2 - GridSearchCV: fine-tune in a small neighbourhood of the stage-1 optimum.
narrowed_params = {
    'n_estimators': [best_random['n_estimators'] - 20, best_random['n_estimators'], best_random['n_estimators'] + 20],
    'learning_rate': [best_random['learning_rate'] * f for f in [0.8, 1.0, 1.2]],
    'max_depth': [max(1, best_random['max_depth'] - 1), best_random['max_depth'], best_random['max_depth'] + 1],
    # Pin the sampling ratios at the values stage 1 selected. Leaving them out
    # would silently reset them to XGBoost's defaults, so the grid would no
    # longer be centred on the configuration the random search actually found.
    # Single-value lists keep the grid at 3 * 3 * 3 = 27 candidates.
    'subsample': [best_random['subsample']],
    'colsample_bytree': [best_random['colsample_bytree']],
}

grid_search = GridSearchCV(
    xgb_reg, param_grid=narrowed_params,
    scoring='neg_mean_squared_error', cv=3, verbose=1
)
grid_search.fit(X_train, y_train)

print("Best params from GridSearchCV:", grid_search.best_params_)

# Evaluate on test data; predict() uses the best estimator refit on the full
# training set (GridSearchCV's default refit=True).
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Final Test MSE: {mse:.4f}")
print(f"Final Test RMSE: {np.sqrt(mse):.4f}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In case of high-dimensional data: using PCA to reduce dimensions

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline: StandardScaler -> PCA -> XGBoost
# Scaling and PCA live inside the pipeline so they are re-fit on each CV
# training fold rather than on the full training set.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # n_components=50 is only the starting value; the grid below overrides it.
    # random_state is set because PCA's default svd_solver='auto' can select a
    # randomized solver, which would make the results vary between runs.
    ('pca', PCA(n_components=50, random_state=42)),  # Reduce dimensions to prevent overfitting
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])

# Grid Search on selected hyperparameters: 3 * 2 * 2 * 4 = 48 candidates.
# Keys use the "<step name>__<parameter>" form to address pipeline steps.
param_grid = {
    'pca__n_components': [5, 10, 15],  # Ensure n_components <= min(n_samples, n_features)
    'xgb__n_estimators': [50, 286],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.15],
}

# NOTE: this rebinds `grid_search`, replacing the search object created in the
# hyperparameter-tuning cell; later cells see this pipeline search.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Best Model Evaluation on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {np.sqrt(mse):.4f}")
print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Test MSE: 20334.6999
Test RMSE: 142.5998
Best Parameters: {'pca__n_components': 15, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 3, 'xgb__n_estimators': 286}


Save the model

In [None]:
# Persist the best estimator found by the most recent grid search.
# NOTE(review): `grid_search` is assigned in two cells of this notebook (the
# XGBoost-only search and the PCA pipeline search); when the notebook is run
# top to bottom, the object saved here is the PCA pipeline's best estimator.
model_path = "xgb_boston_model.pkl"
joblib.dump(grid_search.best_estimator_, model_path)
print(f"Model saved to {model_path}")

Use the model in the future

In [None]:
import joblib
import numpy as np  # needed for np.sqrt below when this cell runs in a fresh session
from sklearn.metrics import mean_squared_error

# Load model
# joblib.load unpickles the file, which can execute arbitrary code:
# only load model files from a source you trust.
model = joblib.load("xgb_boston_model.pkl")

# Load data
# X (features) and y (targets) must be defined before this point. When the
# notebook is run top to bottom they are the arrays created in the first cell;
# in a new session, load your own data here.

# Predict
preds = model.predict(X)

# Evaluate
mse = mean_squared_error(y, preds)
print(f"MSE from loaded model: {mse:.4f}")
print(f"RMSE from loaded model: {np.sqrt(mse):.4f}")