In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
import pandas as pd

In [2]:
df2 = pd.read_csv('cleaned_data.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    df2.drop(columns='Sales (Global Ultimate Total USD)'),
    df2['Sales (Global Ultimate Total USD)'],
    test_size=0.10,
    random_state=42
  )

In [4]:
model = GradientBoostingRegressor()

model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred)

3.016186534053956e+20

In [6]:
model.score(X_test, y_test)

0.2645059277434969

# Hyperparameter Tuning

In [8]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m92.2/100.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.12.0 scikit-optimize-0.9.0


In [27]:
from skopt import BayesSearchCV

search_space = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [0.000001, 0.0001, 0.1, 1, 10],
    'n_estimators': [50, 100, 125, 150, 175, 200, 225, 250, 275, 300],
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_split': [10, 50, 100, 125, 150, 200, 250, 300, 350, 400],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60, 70]
}

opt = BayesSearchCV(
    GradientBoostingRegressor(),
    search_space,
    n_iter=5,
    verbose=3,
    cv=3
)

In [28]:
opt.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END criterion=squared_error, learning_rate=10.0, loss=absolute_error, max_depth=6, min_samples_split=100, n_estimators=100;, score=-68741727578441240211087196677096058171694260898629917701710430430893060588897835171147130551258789227618115499541167479482195945051132401364781647446998006205234800076831661725272443453440.000 total time=   6.0s
[CV 2/3] END criterion=squared_error, learning_rate=10.0, loss=absolute_error, max_depth=6, min_samples_split=100, n_estimators=100;, score=-7377039634629066078539971403853541341202904377730660578318440364295679407926477401783750948880119822265292665751601339885229129720711307914102305534838436370345060595305605782881653751808.000 total time=   5.3s
[CV 3/3] END criterion=squared_error, learning_rate=10.0, loss=absolute_error, max_depth=6, min_samples_split=100, n_estimators=100;, score=-52973824597113087130999039371581203589067255919896743452580413118374449430577070109278843264265

  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


[CV 1/3] END criterion=friedman_mse, learning_rate=1.0, loss=quantile, max_depth=10, min_samples_split=300, n_estimators=50;, score=-2.483 total time=   4.8s
[CV 2/3] END criterion=friedman_mse, learning_rate=1.0, loss=quantile, max_depth=10, min_samples_split=300, n_estimators=50;, score=-1.430 total time=   5.3s
[CV 3/3] END criterion=friedman_mse, learning_rate=1.0, loss=quantile, max_depth=10, min_samples_split=300, n_estimators=50;, score=-1.861 total time=   4.9s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


[CV 1/3] END criterion=friedman_mse, learning_rate=0.0001, loss=huber, max_depth=9, min_samples_split=200, n_estimators=100;, score=-0.034 total time=  11.3s
[CV 2/3] END criterion=friedman_mse, learning_rate=0.0001, loss=huber, max_depth=9, min_samples_split=200, n_estimators=100;, score=-0.029 total time=  11.6s
[CV 3/3] END criterion=friedman_mse, learning_rate=0.0001, loss=huber, max_depth=9, min_samples_split=200, n_estimators=100;, score=-0.030 total time=  11.0s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


[CV 1/3] END criterion=friedman_mse, learning_rate=0.1, loss=absolute_error, max_depth=8, min_samples_split=250, n_estimators=150;, score=0.165 total time=  13.0s
[CV 2/3] END criterion=friedman_mse, learning_rate=0.1, loss=absolute_error, max_depth=8, min_samples_split=250, n_estimators=150;, score=0.218 total time=  14.8s
[CV 3/3] END criterion=friedman_mse, learning_rate=0.1, loss=absolute_error, max_depth=8, min_samples_split=250, n_estimators=150;, score=0.100 total time=  12.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits


  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


[CV 1/3] END criterion=friedman_mse, learning_rate=10.0, loss=quantile, max_depth=40, min_samples_split=350, n_estimators=225;, score=-inf total time=  14.8s


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)


[CV 2/3] END criterion=friedman_mse, learning_rate=10.0, loss=quantile, max_depth=40, min_samples_split=350, n_estimators=225;, score=-inf total time=  16.1s


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
             -inf]
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


[CV 3/3] END criterion=friedman_mse, learning_rate=10.0, loss=quantile, max_depth=40, min_samples_split=350, n_estimators=225;, score=-inf total time=  14.6s


In [20]:
opt.best_params_

OrderedDict([('criterion', 'squared_error'),
             ('learning_rate', 0.1),
             ('loss', 'squared_error'),
             ('max_depth', 8),
             ('min_samples_split', 300),
             ('n_estimators', 250)])

In [21]:
model_final = GradientBoostingRegressor(
    criterion='squared_error',
    learning_rate=0.1,
    loss='squared_error',
    max_depth=8,
    min_samples_split=300,
    n_estimators=250
)

In [22]:
model_final.fit(X_train, y_train)

In [33]:
y_pred = model_final.predict(X_test)
mean_squared_error(y_test, y_pred)

2.13583190761426e+20

In [35]:
import joblib

# Save the base model to an HDF5 file
joblib.dump(model_final, 'final_model.h5')

['final_model.h5']