In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from make_datasets_w_categorical import make_datasets_w_categorical
import pickle

In [7]:
data = pd.read_csv('norway_new_car_sales_by_make1.csv')

In [8]:
data['Date'] = data['Year'].astype(str) + '-' + data['Month'].astype(str).str.zfill(2)

In [9]:
### Bring dataset to a tabular form
df = pd.pivot_table(data=data, values='Quantity', index='Make', columns='Date', aggfunc='sum', fill_value=0)

In [11]:
df.head()

Date,2007-01,2007-02,2007-03,2007-04,2007-05,2007-06,2007-07,2007-08,2007-09,2007-10,...,2016-04,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12,2017-01
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alfa Romeo,16,9,21,20,17,21,14,12,15,10,...,3,1,2,1,6,15,3,4,3,6
Aston Martin,0,0,1,0,4,3,3,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Audi,599,498,682,556,630,498,562,590,393,554,...,685,540,551,687,794,688,603,645,827,565
BMW,352,335,365,360,431,477,403,348,271,562,...,1052,832,808,636,1031,1193,1096,1663,866,1540
Bentley,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0


In [13]:
# add the car brand as encoded categorical column - this is one main power of ML models; they can include more features like exogenous information etc.
df['Brand'] = df.index.astype('category').codes

In [14]:
df.head()

Date,2007-01,2007-02,2007-03,2007-04,2007-05,2007-06,2007-07,2007-08,2007-09,2007-10,...,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12,2017-01,Brand
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alfa Romeo,16,9,21,20,17,21,14,12,15,10,...,1,2,1,6,15,3,4,3,6,0
Aston Martin,0,0,1,0,4,3,3,0,0,0,...,0,1,0,0,0,0,0,0,0,1
Audi,599,498,682,556,630,498,562,590,393,554,...,540,551,687,794,688,603,645,827,565,2
BMW,352,335,365,360,431,477,403,348,271,562,...,832,808,636,1031,1193,1096,1663,866,1540,3
Bentley,0,0,0,0,0,1,0,0,0,0,...,0,1,1,1,0,0,0,0,0,4


In [15]:
 ### save our model because we are going to need it for our forecasting flask app
df.to_csv('tabular_sales_forML.csv')                

Random Forests algorithm does not recognize the historical aspect and the temporal order of the dataItt uses the typical ML format fg(x) = y, so the date columns that we created 2007-1, 2007-2, ... and Brand  are just typical Χ features for the algoritm


## Train-test split

In [16]:
X_train, y_train, X_test, y_test = make_datasets_w_categorical(df, x_len=24, y_len=12, test_loops=12, cat_names = ['Brand'])
print('The training set has {} rows and {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('The test set has {} rows and {} columns'.format(X_test.shape[0], X_test.shape[1]))
print('The target vector consists of {} values meaning that the model predicts one year ahead'.format(y_train.shape[1]))

The training set has 4810 rows and 25 columns
The test set has 780 rows and 25 columns
The target vector consists of 12 values meaning that the model predicts one year ahead


In [17]:
print(X_train)

[[   0   16    9 ...    9    7    7]
 [   1    0    0 ...    0    0    0]
 [   2  599  498 ...  578  522  625]
 ...
 [  62 1592 1440 ... 1920 2019 2057]
 [  63  826  826 ...  950 2072  321]
 [  64    0    0 ...    0    0    0]]


In [18]:
print(y_train)

[[   6    2    9 ...    0    4    0]
 [   0    0    0 ...    0    0    0]
 [ 221  325  323 ...  510  549  677]
 ...
 [1895 2274 2667 ... 2346 1881 1743]
 [ 438  875  729 ...  937 1512  643]
 [   0    0    0 ...    0    0    0]]


## Model Tuning

In [19]:
### hyperparameters
max_depth = [range(5,11)] + [None]
min_samples_split = range(5,20)
min_samples_leaf = range(2,15)
n_estimators = range(10,50,10)

In [20]:
params = {'max_depth':max_depth,
          'n_estimators':n_estimators,
          'min_samples_split':min_samples_split,
          'min_samples_leaf':min_samples_leaf}

As stated before the random forest model does not know anything about the temporal structure of the time series so we can use a classic cross-validation technique that is going to randomly pick records from the training set

More Info on cross-validation: https://www.geeksforgeeks.org/cross-validation-machine-learning
                               https://scikit-learn.org/stable/modules/cross_validation.html

In [29]:
rf = RandomForestRegressor(n_jobs=1, random_state=33)
rf_cv = RandomizedSearchCV(rf, params, cv=3, n_jobs=-1, verbose=1, n_iter=500, scoring='neg_mean_absolute_error', random_state=33)
rf_cv.fit(X_train, y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


717 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
717 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\vabal\anaconda3\envs\ml-zoomcamp\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\vabal\anaconda3\envs\ml-zoomcamp\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "C:\Users\vabal\anaconda3\envs\ml-zoomcamp\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\vabal\anaconda3\envs\ml-zoomcamp\Lib\site-packages\sklearn\utils\_param_validation.

In [30]:
print('Best Random Forest parameters:', rf_cv.best_params_)

Best Random Forest parameters: {'n_estimators': 40, 'min_samples_split': 14, 'min_samples_leaf': 6, 'max_depth': None}


In [31]:
# let's see the results for the optimal model
train_preds = rf_cv.predict(X_train)
test_preds  = rf_cv.predict(X_test)

In [32]:
results = pd.DataFrame(columns = ['MAE', 'RMSE'], index=['Train','Test'])
results.loc['Train', 'MAE'] = np.mean(abs(y_train - train_preds))
results.loc['Train', 'RMSE'] = np.sqrt(np.mean((y_train - train_preds)**2))
results.loc['Test', 'MAE'] = np.mean(abs(y_test - test_preds))
results.loc['Test', 'RMSE'] = np.sqrt(np.mean((y_test - test_preds)**2))
results = results.astype(float).round(1)

In [33]:
print(results)
# We can see the power of the Random Forest algorithm. For 780 test samples of mixed car brands it achieved a MAE of 41,2 units.

        MAE   RMSE
Train  29.0   72.1
Test   41.2  100.6


## Train the final model

Finally, train the best model on full dataset - of course we leave out the last 24 observations/months (time window) for each car brand because the corresponding sales do not exist; they will be the actual forecasts when we deploy our model# to production as they are the future sales that are not part of our dataset

In [34]:
X_train_full, y_train_full, X_test_future, y_test_future = make_datasets_w_categorical(df, x_len=24, y_len=12, test_loops=0, cat_names = ['Brand'])

In [35]:
rf_final = RandomForestRegressor(**rf_cv.best_params_, n_jobs=-1, random_state=33)
rf_final = rf_final.fit(X_train_full, y_train_full)

## Save the model

In [36]:
with open('random_forest.bin', 'wb') as f_out:
    pickle.dump(rf_final, f_out)