# AAVAIL Modelling

### Dataset

To sum up, AAVAIL managers have asked us to build a service that, at any point in time, will predict the revenue for the following month, either overall or for a specific country. To keep the development time reasonable, the model should be limited to the ten countries with the most revenue.

The available data is stored in a set of JSON files containing monthly records of AAVAIL's transactions for different countries.

### Attribute Information:

The features found in the data are: 

- country
- customer_id
- day
- invoice
- month
- stream_id
- times_viewed
- total_price
- year

### Tasks

- State the different modeling approaches that you will compare to address the business opportunity.
- Iterate on your suite of possible models by modifying data transformations, pipeline architectures, hyperparameters and other relevant factors.
- Re-train your model on all of the data using the selected approach and prepare it for deployment.
- Articulate your findings in a summary report.


In [1]:
import os
import re
import time
import json
import numpy as np
import pandas as pd

from scipy.stats import truncnorm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from cslib import fetch_ts, engineer_features

In [2]:
# Training data is read from ../data/cs-train, relative to the working directory.
data_dir = os.path.join(os.pardir, "data", "cs-train")

print("...fetching data")

# fetch_ts returns a mapping of series name -> DataFrame (iterated with
# .items() in a later cell); clean=False is passed through to cslib as-is.
ts_data = fetch_ts(
    data_dir,
    clean=False,
)

...fetching data
CsLib: Loading Timeseries Data from CSV Files


In [3]:
# Largest value in the `date` column of the 'all' series, i.e. the last day covered.
max(ts_data['all'].date)

'2019-06-30'

In [4]:
# Print the (rows, columns) shape of every series returned by fetch_ts.
for country in ts_data:
    df = ts_data[country]
    print(country, df.shape)

all (607, 7)
eire (607, 7)
france (607, 7)
germany (607, 7)
hong_kong (426, 7)
netherlands (607, 7)
norway (577, 7)
portugal (607, 7)
singapore (456, 7)
spain (607, 7)
united_kingdom (607, 7)


In [5]:
# Build the modelling inputs from the 'all' series: feature matrix X, target y
# and a third return value `dates` (see cslib.engineer_features for the exact
# definitions of each).
X,y,dates = engineer_features(ts_data['all'])

In [6]:
## Hold out 25% of the rows as a test set (shuffled, fixed seed)
# NOTE(review): the rows appear to be consecutive dates with rolling-window
# features (previous_7 ... previous_70), so a shuffled split can place
# neighbouring days on both sides of the split -- confirm this is acceptable
# or consider a chronological hold-out.
split_kwargs = {'test_size': 0.25, 'shuffle': True, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((411, 7), (138, 7), (411,), (138,))

In [7]:
# Eyeball the held-out feature rows produced by the split above.
print(X_test)

     previous_7  previous_14  previous_28  previous_70  previous_year  \
195   47618.900    87134.610   154257.180   344024.442          0.000   
79    28658.491    52755.531   149632.744   336015.705          0.000   
479   33452.170    78700.720   146724.940   342694.890     168209.691   
109   74230.301   112441.281   175926.804   410792.319          0.000   
473   45248.550    83186.210   154930.280   345660.190     198752.871   
..          ...          ...          ...          ...            ...   
402   57369.120    57369.120   138866.160   793503.392     183479.624   
69    28902.811    89895.143   171139.024   400303.044          0.000   
261   27755.500    60623.000   129151.260   362049.200          0.000   
516   18799.510    51075.470   123990.861   351331.131     142885.530   
436   25980.220    53845.610   126857.530   601370.620     141194.594   

     recent_invoices  recent_views  
195        79.066667   6556.200000  
79         72.633333   5775.933333  
479        5

### RandomForestRegressor

In [14]:
import sklearn

# scikit-learn 1.0 renamed the split criteria ('mse' -> 'squared_error',
# 'mae' -> 'absolute_error') and 1.2 removed the old spellings, so use the
# names understood by the installed version.
if int(sklearn.__version__.split('.')[0]) >= 1:
    rf_criteria = ['squared_error', 'absolute_error']
else:
    rf_criteria = ['mse', 'mae']

# Hyperparameters searched for the 'rf' step of the pipeline.
param_grid_rf = {
    'rf__criterion': rf_criteria,
    'rf__n_estimators': [10,15,20,25],
    'rf__max_depth': [5,10,15]
    }

time_start = time.time()
# Seed the forest so that re-running this cell reproduces the same scores.
pipe_rf = Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor(random_state=42))])

# 5-fold grid search on the training split; predictions for the held-out
# split come from the best parameter combination.
grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

# Held-out metrics, kept under rf_* names for the model comparison.
rf_mae =  mean_absolute_error(y_test, y_pred)
rf_mse =  mean_squared_error(y_test, y_pred)
rf_r2_score = r2_score(y_test, y_pred)

# The reported time covers both the grid search and the test-set prediction.
print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("mae = {:.0f}".format(rf_mae))
print("mse = {:.0f}".format(rf_mse))
print("r2_score = {:.3f}".format(rf_r2_score))
print("best params =", grid.best_params_)

train time =  00:00:01
mae = 11002
mse = 272711018
r2_score = 0.958
best params = {'rf__criterion': 'mse', 'rf__max_depth': 15, 'rf__n_estimators': 15}


### GradientBoostingRegressor

In [None]:
import sklearn

# GradientBoostingRegressor's `criterion` values changed across releases:
# 'mae' was deprecated in 0.24 and removed in 1.1, and 'mse' was renamed to
# 'squared_error' in 1.0 and removed in 1.2. On scikit-learn >= 1.0 search the
# two criteria that remain valid; on older releases keep the original grid.
if int(sklearn.__version__.split('.')[0]) >= 1:
    gb_criteria = ['friedman_mse', 'squared_error']
else:
    gb_criteria = ['mse', 'mae']

# Hyperparameters searched for the 'gb' step of the pipeline.
param_grid_gb = {
    'gb__criterion': gb_criteria,
    'gb__n_estimators': [10,15,20,25,50,100]
    }

time_start = time.time()
# Seed the booster so that re-running this cell reproduces the same scores.
pipe_gb = Pipeline(steps=[('scaler', StandardScaler()), ('gb', GradientBoostingRegressor(random_state=42))])

# 5-fold grid search on the training split; predictions for the held-out
# split come from the best parameter combination.
grid = GridSearchCV(pipe_gb, param_grid=param_grid_gb, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

# Held-out metrics, kept under gb_* names for the model comparison.
gb_mae =  mean_absolute_error(y_test, y_pred)
gb_mse =  mean_squared_error(y_test, y_pred)
gb_r2_score = r2_score(y_test, y_pred)

# The reported time covers both the grid search and the test-set prediction.
print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("mae = {:.0f}".format(gb_mae))
print("mse = {:.0f}".format(gb_mse))
print("r2_score = {:.3f}".format(gb_r2_score))
print("best params =", grid.best_params_)

### LGBMRegressor

In [None]:
# Hyperparameters searched for the 'lgbm' step of the pipeline.
# `objective` is the documented LGBMRegressor parameter; `application` is
# only an alias for it.
param_grid_lgbm = {
    'lgbm__learning_rate': [0.60,0.75,0.90],
    'lgbm__objective': ['regression'],
    'lgbm__max_depth': [3,4,5]
    }

time_start = time.time()
# Seeded for consistency with the fixed seed used for the train/test split.
pipe_lgbm = Pipeline(steps=[('scaler', StandardScaler()), ('lgbm', LGBMRegressor(random_state=42))])

# 5-fold grid search on the training split; predictions for the held-out
# split come from the best parameter combination.
grid = GridSearchCV(pipe_lgbm, param_grid=param_grid_lgbm, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

# Store the scores under lgbm_* names: the DecisionTree cell assigns dt_*,
# so reusing those names here would lose the LightGBM results.
lgbm_mae =  mean_absolute_error(y_test, y_pred)
lgbm_mse =  mean_squared_error(y_test, y_pred)
lgbm_r2_score = r2_score(y_test, y_pred)

# The reported time covers both the grid search and the test-set prediction.
print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("mae = {:.0f}".format(lgbm_mae))
print("mse = {:.0f}".format(lgbm_mse))
print("r2_score = {:.3f}".format(lgbm_r2_score))
print("best params =", grid.best_params_)

### DecisionTreeRegressor

In [None]:
import sklearn

# scikit-learn 1.0 renamed the split criteria ('mse' -> 'squared_error',
# 'mae' -> 'absolute_error') and 1.2 removed the old spellings, so use the
# names understood by the installed version.
if int(sklearn.__version__.split('.')[0]) >= 1:
    dt_criteria = ['squared_error', 'absolute_error']
else:
    dt_criteria = ['mse', 'mae']

# Hyperparameters searched for the 'dt' step of the pipeline.
param_grid_dt = {
    'dt__criterion': dt_criteria,
    'dt__max_depth': [5,10,20,50],
    'dt__min_samples_leaf': [1,2,3,4,5]
    }

time_start = time.time()
# Seed the tree so that re-running this cell reproduces the same scores.
pipe_ts = Pipeline(steps=[('scaler', StandardScaler()), ('dt', DecisionTreeRegressor(random_state=42))])

# 5-fold grid search on the training split; predictions for the held-out
# split come from the best parameter combination.
grid = GridSearchCV(pipe_ts, param_grid=param_grid_dt, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

# Held-out metrics, kept under dt_* names for the model comparison.
dt_mae =  mean_absolute_error(y_test, y_pred)
dt_mse =  mean_squared_error(y_test, y_pred)
dt_r2_score = r2_score(y_test, y_pred)

# The reported time covers both the grid search and the test-set prediction.
print("train time = ", time.strftime('%H:%M:%S', time.gmtime(time.time()-time_start)))
print("mae = {:.0f}".format(dt_mae))
print("mse = {:.0f}".format(dt_mse))
print("r2_score = {:.3f}".format(dt_r2_score))
print("best params =", grid.best_params_)