In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from make_datasets_w_categorical import make_datasets_w_categorical
from xgboost.sklearn import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import itertools
from sklearn.model_selection import KFold

In [2]:
### TABULARIZE TIME SERIES SALES DATA
data = pd.read_csv('norway_new_car_sales_by_make1.csv')
data['Date'] = data['Year'].astype(str) + '-' + data['Month'].astype(str).str.zfill(2)

df = pd.pivot_table(data=data, values='Quantity', index='Make', columns='Date', aggfunc='sum', fill_value=0)

# add the car brand as encoded categorical column - this is one main power of ML models; they can include more features like exogenous information etc.
df['Brand'] = df.index.astype('category').codes

In [3]:
X_train, y_train, X_test, y_test = make_datasets_w_categorical(df, x_len=24, y_len=12, test_loops=12, cat_names=['Brand'])
print('The training set has {} rows and {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('The test set has {} rows and {} columns'.format(X_test.shape[0], X_test.shape[1]))
print('The target vector consists of {} values meaning that the model predicts one year ahead'.format(y_train.shape[1]))
print(X_train)
print(y_train)

print(X_test)
print(y_test)

The training set has 4810 rows and 25 columns
The test set has 780 rows and 25 columns
The target vector consists of 12 values meaning that the model predicts one year ahead
[[   0   16    9 ...    9    7    7]
 [   1    0    0 ...    0    0    0]
 [   2  599  498 ...  578  522  625]
 ...
 [  62 1592 1440 ... 1920 2019 2057]
 [  63  826  826 ...  950 2072  321]
 [  64    0    0 ...    0    0    0]]
[[   6    2    9 ...    0    4    0]
 [   0    0    0 ...    0    0    0]
 [ 221  325  323 ...  510  549  677]
 ...
 [1895 2274 2667 ... 2346 1881 1743]
 [ 438  875  729 ...  937 1512  643]
 [   0    0    0 ...    0    0    0]]
[[   0    9   13 ...    1    9    1]
 [   1    0    1 ...    0    0    0]
 [   2  488  987 ...  604  302  474]
 ...
 [  62 1379 1722 ... 2346 1881 1743]
 [  63  723  915 ...  937 1512  643]
 [  64    0    0 ...    0    0    0]]
[[   3    3    3 ...    3    0    2]
 [   0    2    2 ...    0    0    0]
 [ 598  665  585 ...  496  559  531]
 ...
 [2044 2236 3017 ... 2106 

During development there was a problem with the scikit-learn cross-validation library that was used for the random forest model.
It was not so straightfoward to combine the library with an xgboostregressor that produces a vector of predictions (12 values for the upcoming year).
Thus, the KFold module was used to implement our cross-validation procedure.

Read about KFold split: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

In [4]:
##############    SHOWCASE KFOLD SPLIT

kf = KFold(n_splits=3)
for i, (train_index, test_index) in enumerate(kf.split(X_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    print(len(train_index))
    print(len(test_index))


Fold 0:
  Train: index=[1604 1605 1606 ... 4807 4808 4809]
  Test:  index=[   0    1    2 ... 1601 1602 1603]
3206
1604
Fold 1:
  Train: index=[   0    1    2 ... 4807 4808 4809]
  Test:  index=[1604 1605 1606 ... 3204 3205 3206]
3207
1603
Fold 2:
  Train: index=[   0    1    2 ... 3204 3205 3206]
  Test:  index=[3207 3208 3209 ... 4807 4808 4809]
3207
1603


In [5]:
### hyperparameters
max_depth = [5,7,10,15, None]
eta = [0.001, 0.005, 0.01, 0.05, 0.1]
subsample = [0.5, 0.6, 0.7, 0.8]
colsample_bytree = [0.4, 0.5, 0.6, 0.7, 0.8]
min_child_weight = [1,3,5,8,10]
num_boost_round = [50,100,200,300]

In [6]:
xgb_results= pd.DataFrame(columns=['max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'min_child_weight', 'num_boost_round','Avg MAE', 'Avg RMSE'])

### IMPORTANT NOTE: The procedure is really long-running, so if you want to reproduce it better let it run during the night

In [None]:
for hparameter in list(itertools.product(max_depth, eta, subsample, colsample_bytree, min_child_weight, num_boost_round)):

    try:

        hyperparameters_maes = []
        hyperparameters_rmses = []
        kf = KFold(n_splits=3, shuffle=True, random_state=33)

        for i, (train_index, test_index) in enumerate(kf.split(X_train)):

           dtrain = xgb.DMatrix(data=X_train[train_index], label=y_train[train_index])
           dtest = xgb.DMatrix(data=X_train[test_index], label=y_train[test_index])
           watchlist = [(dtrain, 'train')]
           xgb_model = xgb.train({
                                 'max_depth' : hparameter[0],
                                 'eta' :  hparameter[1],
                                 'subsample' : hparameter[2],
                                 'colsample_bytree' : hparameter[3],
                                 'min_child_weight' : hparameter[4],
                                 'num_boost_round' : hparameter[5],
                                 'num_target' : y_train.shape[1],
                                 'tree_method': 'hist',
                                 'eval_metric': ['rmse', 'mae'],
                                 'nthread': 8,
                                 'seed': 33,
                                 'verbosity': 1
                               },
                               dtrain,
                               evals=watchlist)

           preds = xgb_model.predict(dtest).astype(int)

           mae = np.mean(np.abs(preds - y_train[test_index]))
           rmse = np.sqrt(np.mean((preds - y_train[test_index])**2))
           hyperparameters_maes.append(mae)
           hyperparameters_rmses.append(rmse)

        # incorporate results to df
        xgb_results_append = pd.DataFrame(index=range(1),columns=['max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'min_child_weight', 'num_boost_round','Avg MAE', 'Avg RMSE'])
        xgb_results_append.loc[:, 'max_depth'] = hparameter[0]
        xgb_results_append.loc[:, 'learning_rate'] = hparameter[1]
        xgb_results_append.loc[:, 'subsample'] = hparameter[2]
        xgb_results_append.loc[:, 'colsample_bytree'] = hparameter[3]
        xgb_results_append.loc[:, 'min_child_weight'] = hparameter[4]
        xgb_results_append.loc[:, 'num_boost_round'] = hparameter[5]
        xgb_results_append.loc[:, 'Avg MAE'] = np.mean(hyperparameters_maes)
        xgb_results_append.loc[:, 'Avg RMSE'] = np.mean(hyperparameters_rmses)

        xgb_results = xgb_results.append(xgb_results_append)

    except:
        continue

Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:388.89082	train-mae:175.83695
[1]	train-rmse:388.53695	train-mae:175.66743
[2]	train-rmse:388.18144	train-mae:175.49764
[3]	train-rmse:387.82928	train-mae:175.32887
[4]	train-rmse:387.47795	train-mae:175.16038
[5]	train-rmse:387.12672	train-mae:174.99179
[6]	train-rmse:386.77420	train-mae:174.82274
[7]	train-rmse:386.41792	train-mae:174.65291
[8]	train-rmse:386.06769	train-mae:174.48513
[9]	train-rmse:385.71611	train-mae:174.31735
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:387.84340	train-mae:176.20647
[1]	train-rmse:387.49039	train-mae:176.03604
[2]	train-rmse:387.13850	train-mae:175.86685
[3]	train-rmse:386.78582	train-mae:175.69707
[4]	train-rmse:386.43187	train-mae:175.52687
[5]	train-rmse:386.08138	train-mae:175.35787
[6]	train-rmse:385.73188	train-mae:175.18901
[7]	train-rmse:385.38332	train-mae:175.02061
[8]	train-rmse:385.03216	train-mae:174.85152
[9]	train-rmse:384.68187	train-mae:174.68241
Pa

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[8]	train-rmse:373.57591	train-mae:168.51224
[9]	train-rmse:371.87283	train-mae:167.70120
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:386.43520	train-mae:175.52478
[1]	train-rmse:384.67495	train-mae:174.67507
[2]	train-rmse:382.92452	train-mae:173.83326
[3]	train-rmse:381.17109	train-mae:172.99177
[4]	train-rmse:379.42254	train-mae:172.15218
[5]	train-rmse:377.70236	train-mae:171.32271
[6]	train-rmse:375.99394	train-mae:170.49619
[7]	train-rmse:374.29424	train-mae:169.67739
[8]	train-rmse:372.58703	train-mae:168.85802
[9]	train-rmse:370.89216	train-mae:168.04259
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:380.01072	train-mae:171.39466
[1]	train-rmse:378.29132	train-mae:170.56983
[2]	train-rmse:376.55925	train-mae:169.74253
[3]	train-rmse:374.84729	train-mae:168.92262
[4]	train-rmse:373.15585	train-mae:168.11000
[5]	train-rmse:371.46855	train-mae:167.30035
[6]	train-rmse:369.78623	train-mae:166.49571
[7]	train-rmse:368.11093	train-mae:165.69289
[8

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[0]	train-rmse:347.13162	train-mae:155.65236
[1]	train-rmse:315.92749	train-mae:140.78716
[2]	train-rmse:287.62399	train-mae:127.27830
[3]	train-rmse:262.24424	train-mae:115.12919
[4]	train-rmse:239.83574	train-mae:104.29515
[5]	train-rmse:219.57421	train-mae:94.54772
[6]	train-rmse:201.41862	train-mae:85.87088
[7]	train-rmse:185.11372	train-mae:78.11732
[8]	train-rmse:170.51262	train-mae:71.22370
[9]	train-rmse:157.37473	train-mae:65.07928
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:353.67828	train-mae:159.05693
[1]	train-rmse:321.91181	train-mae:143.85073
[2]	train-rmse:293.17919	train-mae:130.15815
[3]	train-rmse:267.36204	train-mae:117.85151
[4]	train-rmse:244.34559	train-mae:106.80955
[5]	train-rmse:223.66401	train-mae:96.86336
[6]	train-rmse:205.02160	train-mae:87.94094
[7]	train-rmse:188.10057	train-mae:79.96735
[8]	train-rmse:173.02987	train-mae:72.90585
[9]	train-rmse:159.69975	train-mae:66.68022
Parameters: { "num_boost_round" } are not used.

[0]	train-rm

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[3]	train-rmse:381.07609	train-mae:172.98038
[4]	train-rmse:379.31611	train-mae:172.14101
[5]	train-rmse:377.56543	train-mae:171.30614
[6]	train-rmse:375.82590	train-mae:170.47413
[7]	train-rmse:374.10488	train-mae:169.65232
[8]	train-rmse:372.37944	train-mae:168.83044
[9]	train-rmse:370.66869	train-mae:168.01299
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:379.99315	train-mae:171.39416
[1]	train-rmse:378.24294	train-mae:170.56426
[2]	train-rmse:376.50399	train-mae:169.73836
[3]	train-rmse:374.76540	train-mae:168.91268
[4]	train-rmse:373.04691	train-mae:168.09548
[5]	train-rmse:371.33245	train-mae:167.28157
[6]	train-rmse:369.63133	train-mae:166.47287
[7]	train-rmse:367.93459	train-mae:165.66671
[8]	train-rmse:366.24607	train-mae:164.86592
[9]	train-rmse:364.56455	train-mae:164.06909
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:387.45030	train-mae:175.15526
[1]	train-rmse:385.66335	train-mae:174.30754
[2]	train-rmse:383.87804	train-mae:173.46089
[3

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[0]	train-rmse:371.31597	train-mae:167.47782
[1]	train-rmse:354.37949	train-mae:159.39849
[2]	train-rmse:338.21616	train-mae:151.71908
[3]	train-rmse:322.98191	train-mae:144.46601
[4]	train-rmse:308.50525	train-mae:137.55415
[5]	train-rmse:294.70099	train-mae:130.98908
[6]	train-rmse:281.55380	train-mae:124.70769
[7]	train-rmse:268.91870	train-mae:118.71417
[8]	train-rmse:257.12946	train-mae:113.09454
[9]	train-rmse:245.94631	train-mae:107.75475
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:370.34222	train-mae:167.80555
[1]	train-rmse:353.37150	train-mae:159.68492
[2]	train-rmse:337.36136	train-mae:152.01211
[3]	train-rmse:322.11827	train-mae:144.72260
[4]	train-rmse:307.55312	train-mae:137.74705
[5]	train-rmse:293.80091	train-mae:131.14135
[6]	train-rmse:280.64825	train-mae:124.82555
[7]	train-rmse:268.28492	train-mae:118.90233
[8]	train-rmse:256.51130	train-mae:113.24714
[9]	train-rmse:245.43558	train-mae:107.92287
Parameters: { "num_boost_round" } are not used.

[0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[7]	train-rmse:181.27571	train-mae:77.24859
[8]	train-rmse:166.21268	train-mae:70.16954
[9]	train-rmse:152.54728	train-mae:63.80078
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:353.24516	train-mae:158.98274
[1]	train-rmse:320.81353	train-mae:143.64223
[2]	train-rmse:291.60441	train-mae:129.81225
[3]	train-rmse:265.35037	train-mae:117.40201
[4]	train-rmse:241.89276	train-mae:106.24492
[5]	train-rmse:220.79430	train-mae:96.24420
[6]	train-rmse:201.72390	train-mae:87.20471
[7]	train-rmse:184.41674	train-mae:79.04709
[8]	train-rmse:169.03660	train-mae:71.77424
[9]	train-rmse:155.25144	train-mae:65.29490
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:352.26826	train-mae:159.26579
[1]	train-rmse:319.97277	train-mae:143.88236
[2]	train-rmse:290.96925	train-mae:130.07638
[3]	train-rmse:264.79669	train-mae:117.63746
[4]	train-rmse:241.35330	train-mae:106.41411
[5]	train-rmse:220.28466	train-mae:96.34084
[6]	train-rmse:201.20272	train-mae:87.23726
[7]	train-rm

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[9]	train-rmse:384.61328	train-mae:174.66647
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:381.39497	train-mae:172.06112
[1]	train-rmse:381.04118	train-mae:171.89333
[2]	train-rmse:380.68898	train-mae:171.72623
[3]	train-rmse:380.33703	train-mae:171.55903
[4]	train-rmse:379.98624	train-mae:171.39244
[5]	train-rmse:379.63605	train-mae:171.22578
[6]	train-rmse:379.28579	train-mae:171.05922
[7]	train-rmse:378.93521	train-mae:170.89252
[8]	train-rmse:378.58547	train-mae:170.72654
[9]	train-rmse:378.23472	train-mae:170.56019
Parameters: { "num_boost_round" } are not used.

[0]	train-rmse:388.88420	train-mae:175.83541
[1]	train-rmse:388.52362	train-mae:175.66449
[2]	train-rmse:388.16328	train-mae:175.49354
[3]	train-rmse:387.80396	train-mae:175.32313
[4]	train-rmse:387.44605	train-mae:175.15317
[5]	train-rmse:387.08894	train-mae:174.98345
[6]	train-rmse:386.73057	train-mae:174.81336
[7]	train-rmse:386.37102	train-mae:174.64309
[8]	train-rmse:386.01403	train-mae:174.47361
[9

In [None]:
print(xgb_results.sort_values(by=['Avg MAE']))
print(xgb_results.sort_values(by=['Avg MAE']).iloc[0])

We see that the optimal hyperparameters found are max_depth=5, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8 min_child_weight=5, and num_boost_round=30.0# This configuration gives us a cross validation MAE of 67 and an RMSE of 162. Thus, we are going to use the random fores## model as our production model as it provides us with more robust performance metrics