In [18]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

from cslib import fetch_ts, engineer_features

# 1. Loading Data

In [19]:
data_dir = os.path.join("data","cs-train")
ts_all = fetch_ts(data_dir,clean=False)
ts_all['all'].head()

... loading ts data from files


Unnamed: 0,date,purchases,unique_invoices,unique_streams,total_views,year_month,revenue
0,2017-11-01,0,0,0,0,2017-11,0.0
1,2017-11-02,0,0,0,0,2017-11,0.0
2,2017-11-03,0,0,0,0,2017-11,0.0
3,2017-11-04,0,0,0,0,2017-11,0.0
4,2017-11-05,0,0,0,0,2017-11,0.0


# 2. Feature Engineering and Train Test Split

### Generate features using 7, 14, 28, 70-day time window wraping, the monthly sum of previous year, the average number of invoices and the total views in rencent 30 days

In [20]:
X,y,dates = engineer_features(ts_all['all'])
        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

In [21]:
X.shape

(549, 7)

In [22]:
y.shape

(549,)

# 3. Model Training and Perforamcne Comparision  

### 3.1. Gradient Boosting Regressor

In [49]:
param_grid_gb = {
    'gb__criterion': ['mse','mae'],
    'gb__n_estimators': [10,15,20,25,50,100]
    }

time_start = time.time()
pipe_gb = Pipeline(steps=[('scaler', StandardScaler()), ('gb', GradientBoostingRegressor())])

grid = GridSearchCV(pipe_gb, param_grid=param_grid_gb, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

gb_mae =  mean_absolute_error(y_test, y_pred)
gb_mse =  mean_squared_error(y_test, y_pred)
gb_r2_score = r2_score(y_test, y_pred)
gb_explained_variance_score = explained_variance_score(y_test, y_pred)

print("mae = {:.2f}".format(gb_mae))
print("mse = {:.2f}".format(gb_mse))
print("r2_score = {:.3f}".format(gb_r2_score))
print("explained_variance_score = {:.3f}".format(gb_explained_variance_score))
print("best params =", grid.best_params_)
print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("--------------------------------------------------------------------------------------")

mae = 16087.66
mse = 448470837.35
r2_score = 0.93
explained_variance_score = 0.93
best params = {'gb__criterion': 'mse', 'gb__n_estimators': 100}
--------------------------------------------------------------------------------------


### 3.2. Random Forest Regressor

In [66]:
param_grid_rf = {
    'rf__criterion': ['mse','mae'],
    'rf__n_estimators': [10,15,20,25,50,100]
    }

time_start = time.time()
pipe_rf = Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

rf_mae =  mean_absolute_error(y_test, y_pred)
rf_mse =  mean_squared_error(y_test, y_pred)
rf_r2_score = r2_score(y_test, y_pred)
rf_explained_variance_score = explained_variance_score(y_test, y_pred)

print("mae = {:.2f}".format(rf_mae))
print("mse = {:.2f}".format(rf_mse))
print("r2_score = {:.3f}".format(rf_r2_score))
print("explained_variance_score = {:.3f}".format(rf_explained_variance_score))
print("best params =", grid.best_params_)
print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("--------------------------------------------------------------------------------------")

mae = 11463.90
mse = 297141752.80
r2_score = 0.955
explained_variance_score = 0.955
best params = {'rf__criterion': 'mse', 'rf__n_estimators': 15}
train time =  00:00:08
--------------------------------------------------------------------------------------


### 3.3. Multilayer Perceptron (MLP) Regressor
A multilayer perceptron (MLP) is also known as a vanilla neural network because it is the core example of an architecture. The vanilla neural networks often only have a single hidden layer, but a MLP can have many more. The number of hidden layers and the size (number of nodes in each) are configurable parameters that you will need to keep in mind when building neural networks.

In [65]:
from sklearn.neural_network import MLPRegressor
rs = 5
param_grid = {
    'nn__activation': ['relu'],
    'nn__solver': ['lbfgs', 'sgd','adam'],
    'nn__hidden_layer_sizes': [(10,10), (50,50), (64,64)]
    }

time_start = time.time()
pipe  = Pipeline(steps=[('scaler', StandardScaler()),
                            ('nn', MLPRegressor(alpha=1e-5, random_state=rs, max_iter=1000))])


grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

nn_mae =  mean_absolute_error(y_test, y_pred)
nn_mse =  mean_squared_error(y_test, y_pred)
nn_r2_score = r2_score(y_test, y_pred)
nn_explained_variance_score = explained_variance_score(y_test, y_pred)

print("mae = {:.2f}".format(nn_mae))
print("mse = {:.2f}".format(nn_mse))
print("r2_score = {:.3f}".format(nn_r2_score))
print("explained_variance_score = {:.3f}".format(nn_explained_variance_score))
print("best params =", grid.best_params_)
print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("--------------------------------------------------------------------------------------")


  array_means[:, np.newaxis]) ** 2,


mae = 10653.67
mse = 286336577.54
r2_score = 0.956
explained_variance_score = 0.956
best params = {'nn__activation': 'relu', 'nn__hidden_layer_sizes': (64, 64), 'nn__solver': 'lbfgs'}
train time =  00:00:38
--------------------------------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
