# XGBoost Model Development With Optuna Optimization

# Import Libraries and Root Configuration

In [1]:
""" Configure the utilities module path for imports """
import sys
import os
from pathlib import Path

# get project root as parent of current working directory
project_root = Path(os.getcwd()).parent

if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
""" Import libraries to develop XGBoost model """
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from src.utilities import StockDataProcessor, Evaluator, ModelPersister
import joblib

# Feature and Training Setup

## Artifacts Setup

In [3]:
# read dataset from file
file = Path(r'../data/AAPL_preprocessed.csv')
data = pd.read_csv(file)

In [4]:
# split data into train, validation, and test sets
train, test = StockDataProcessor.time_based_split(data)

y_train_data = train['Close'].copy()
y_test_data = test['Close'].copy()

## Prepare ML data

In [5]:
# function to extract train and test data
def prepare_ml_data(y_train, lags=10, test_size=0.2, random_state=42):
    processor = StockDataProcessor()
    df = processor.create_lagged_features(y_train, lags)
    
    # Split data into train and test sets (80/20)
    x = df.iloc[:, 1:].copy()
    y = df.iloc[:, 0].copy()
    
    x_train, x_test, y_train, y_test = train_test_split(
        x.values, y.values, test_size=test_size, shuffle=False, random_state=random_state
    )
    
    return x_train, y_train, x_test, y_test

In [6]:
# Example usage
x_train, y_train, x_test, y_test = prepare_ml_data(y_train_data)

In [7]:
# XGBoost objective function for Optuna optimization
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'objective': 'reg:squarederror',
        'random_state': 42,
        'verbosity': 0
    }

    # Create and train the model
    model = xgb.XGBRegressor(**params)
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)

    # Evaluate performance on test data
    y_pred = model.predict(x_test)
    return mean_squared_error(y_test, y_pred)


# Model Training with Optuna Optimization

In [8]:
# create optuna study and optimize hyperparameters
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction='minimize')
study.optimize(objective_xgb, n_trials=50, show_progress_bar=True)
best_params = study.best_params

  0%|          | 0/50 [00:00<?, ?it/s]

Best trial: 9. Best value: 3545.33: 100%|██████████| 50/50 [00:24<00:00,  2.00it/s]


In [9]:
# fit the best model
model = xgb.XGBRegressor(**best_params, random_state=42)
model.fit(x_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9807161176809884
,device,
,early_stopping_rounds,
,enable_categorical,False


## Apply Model to make predictions

In [10]:
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

## Evaluating The Model Performance

In [11]:
# return MSE, MAE, RMSE, R2 and MAPE results as a list
train_metrics = Evaluator.calculate_metrics(y_train, train_pred)
test_metrics = Evaluator.calculate_metrics(y_test, test_pred)

In [12]:
# unpack metrics
train_mse, train_mae, train_rmse, train_r2, train_mape = train_metrics
test_mse, test_mae, test_rmse, test_r2, test_mape = test_metrics

In [13]:
model_performance = Evaluator.print_evaluation_tables("XGBoost", train_metrics, test_metrics)
model_performance

--- Performance Comparison: Train vs Test (XGBoost) ---


Unnamed: 0,Metric,Training,Test
0,MSE,0.278,3545.334
1,MAE,0.347,53.843
2,RMSE,0.528,59.543
3,R2 Score,0.999,-4.367
4,MAPE,1.153,38.446


# Cross-Validation, Summary and Overfitting Analysis

## Cross Validation with TimeSeriesSplit

In [14]:
# TimeSeriesSplit CV on training set as diagnostic.
tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(x_train), start=1):
    x_tr, x_tst = x_train[train_idx], x_train[test_idx]
    y_tr, y_tst = y_train[train_idx], y_train[test_idx]

    model = xgb.XGBRegressor(**best_params, random_state=42)
    model.fit(x_tr, y_tr)

    preds = model.predict(x_tst)
    cv_metrics = Evaluator.calculate_metrics(y_tst, preds)

In [15]:
# unpack metrics
cv_mse, cv_mae, cv_rmse, cv_r2, cv_mape = cv_metrics

# cross validation performance
model_cv = pd.DataFrame({
    'Model': ['XGBoost'],
    'CV_MSE': [cv_mse],
    'CV_MAE': [cv_mae],
    'CV_RMSE':[cv_rmse],
    'CV_R2': [cv_r2],
    'CV_MAPE': [cv_mape]
}).round(3)

In [16]:
print("Cross-Validation Metrics (Training folds):")
model_cv

Cross-Validation Metrics (Training folds):


Unnamed: 0,Model,CV_MSE,CV_MAE,CV_RMSE,CV_R2,CV_MAPE
0,XGBoost,102.716,6.307,10.135,0.283,9.441


## Summary of the Model Performance

In [17]:
perf_summary = pd.DataFrame({
    'Metrics' : ['MSE', 'MAE', 'RMSE', 'R2-Score', 'MAPE'],
    'Train': train_metrics,
    'CV': cv_metrics,
    'Test': test_metrics
}).round(3)

In [18]:
print("=== Summary of The Model Evaluation ===")
perf_summary

=== Summary of The Model Evaluation ===


Unnamed: 0,Metrics,Train,CV,Test
0,MSE,0.278,102.716,3545.334
1,MAE,0.347,6.307,53.843
2,RMSE,0.528,10.135,59.543
3,R2-Score,0.999,0.283,-4.367
4,MAPE,1.153,9.441,38.446


## Overfitting Analysis

In [19]:
# Overfitting analysis (compare CV_RMSE to Test_RMSE)

overfit = {
    'Model': 'XGBoost',
    'CV_RMSE': float(cv_rmse),
    'Test_RMSE': float(test_rmse),
    'RMSE_Increase': float(test_rmse - cv_rmse) if (not np.isnan(cv_rmse) and not np.isnan(test_rmse)) else np.nan,
    'Overfitting_Ratio': float(test_rmse / (cv_rmse + 1e-8)) if not np.isnan(cv_rmse) else np.nan
}

overfit_df = pd.DataFrame([overfit]).round(3)

In [20]:
print("=== Overfitting Analysis (XGBoost Model) ===")
overfit_df

=== Overfitting Analysis (XGBoost Model) ===


Unnamed: 0,Model,CV_RMSE,Test_RMSE,RMSE_Increase,Overfitting_Ratio
0,XGBoost,10.135,59.543,49.408,5.875


In [23]:
# aggrageted model performance
agg_perf = pd.DataFrame({
    'Model': ['XGBoost'],
    'Test MAE' : test_mae,
    'Test R2-Score': test_r2,
    'Test MAPE' : test_mape,
    'CV MAE' : cv_mae,
    'CV R2' : cv_r2,
    'CV MAPE' : cv_mape,
    'RMSE Increase' : overfit.get('RMSE_Increase', np.nan),
    'Overfitting Ratio' : overfit.get('Overfitting_Ratio', np.nan)
}).round(3)

In [24]:
agg_perf

Unnamed: 0,Model,Test MAE,Test R2-Score,Test MAPE,CV MAE,CV R2,CV MAPE,RMSE Increase,Overfitting Ratio
0,XGBoost,53.843,-4.367,38.446,6.307,0.283,9.441,49.408,5.875


# Model Performance and Persistence

In [25]:
# model persistor object
persister = ModelPersister(model_name="XGBoost")

In [26]:
# aggregate model performance
persister.aggregated_performance(agg_perf)

EmptyDataError: No columns to parse from file

In [27]:
# save ariXGBoost model performance
persister.save_performance(perf_summary)

XGBoost performance saved: ..\artifacts\model-performance\xgboostPerformance.csv


In [28]:
# save overfitting analysis
persister.append_overfitting(overfit_df)

EmptyDataError: No columns to parse from file

In [29]:
# save model
persister.save_model(model)

Model saved: ..\artifacts\models/xgboost.pkl
