# Ensemble

## Load data

In [43]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib 
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
from collections import defaultdict
import datetime
import pickle
import math
import os
from xgboost import plot_importance

from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import Lasso,Ridge
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


pd.set_option('display.max_columns', None) # no truncate columns

In [44]:
# Environment settings
data_path_out = 'Data/output/'
    
# Deserialize previously saved data from "data-visualization"
with open(data_path_out + 'train_pp.obj', 'rb') as file:
    all_train = pickle.load(file)


In [45]:
all_train_orig = all_train.copy()
all_train = all_train.drop('Region',axis=1)

### Drop now useless variables

In [46]:
all_train = all_train.drop(labels = ['NumberOfCustomers'],axis=1)
all_train = all_train.drop('Date',axis=1)

## Averaging model

In [47]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

## Stacked Model

In [48]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [49]:
# define the model
    
# model = Lasso(alpha=50)
# model2 = Ridge(alpha=1)
# model3 =XGBRegressor(max_depth=4,
#                             gamma=0.05, 
#                             learning_rate=0.05, 
#                                  n_estimators=500,
#                                  subsample=0.3, silent=1,
#                                  random_state =7, nthread = -1)

# model4 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
#                                        max_depth=2,loss='lad',random_state =5)


# model = AveragingModels(models = (model1,model2,model3,model4))

## Ensemble model

## Cross-validation store by store

In [50]:
# results = {}
# for storeid in all_train.StoreID.unique():
#     train = all_train[all_train.StoreID == storeid]
#     y_train = train.NumberOfSales
#     x_train = train.drop('NumberOfSales',axis = 1)
#     kfold = KFold(n_splits=10,shuffle = True, random_state=7)
#     results[storeid] = cross_val_score(model, x_train, y_train, scoring='r2', cv=kfold)
#     print('Cross-validation for {} -> score: {:.4f} with +/- {:.4f}'\
#           .format(storeid,results[storeid].mean(),results[storeid].std()))

In [51]:
# df_results = pd.DataFrame.from_dict(results).T
# df_results_mean = df_results.mean(axis=1)
# df_results_mean[df_results_mean < 0.8]

In [52]:
all_train = all_train.drop("Differential",axis=1)

## Lanzi Error

### Fit model and make predictions

In [53]:
# train by month
def split_dataset_bymonth(test_year, test_months, train_set):
    test_mask = (train.year == test_year) & train.month.isin(test_months)
    
    # define the train set
    train_dataset = train[~test_mask]
    x_train = train_dataset.drop('NumberOfSales', axis=1)
    y_train = train_dataset.NumberOfSales
    
    # define the test set
    test_dataset = train[test_mask]
    x_test = test_dataset.drop('NumberOfSales', axis=1)
    y_test = test_dataset.NumberOfSales
    
    return (x_train, y_train, x_test, y_test)

In [81]:
# from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import plot_importance
from matplotlib import pyplot




scores = {}
predictions = defaultdict(dict)
store_pred = {}
shopping_center_ids = [1129,1267,1280,1307,1330,1339,1357,1387,1676]
ids = all_train.StoreID.unique()
for storeid in ids:
    
    model1 = Lasso(alpha=20)
    model2 = Ridge(alpha=7)
    model3 =XGBRegressor()

    model4 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                       max_depth=2,loss='lad',random_state =5)


    model5 = ExtraTreesRegressor()
    
    model = AveragingModels(models = (model1,model2,model3,model4,model5)) 
    
    # split the dataset
    train = all_train[all_train.StoreID == storeid]
        
    x_train, y_train, x_test, y_test =\
    split_dataset_bymonth(2017, [3,4], train)
    
    # train the model with the training set
    model.fit(x_train, y_train)
    
    # scoring
    scores[storeid] = r2_score(y_test, model.predict(x_test))
    print('store {} -> {:.4f}'.format(storeid, scores[storeid]))
    store_pred[storeid] = scores[storeid]
    # predict the test set with the trained model
    for month in x_test.month.unique():
        # get daily predictions for each month in the test set
        month_prediction1 =model.predict(x_test[x_test.month == month])
        month_actual = y_test.loc[x_test[x_test.month == month].index].values
        
        # store the monthly mean of the test set
        predictions[storeid][month] = {
            'predicted': np.sum(month_prediction),
            'actual': np.sum(month_actual)
        }

store 1000 -> 0.8062
store 1001 -> 0.8053
store 1002 -> 0.9569
store 1003 -> 0.9369
store 1004 -> 0.9492
store 1005 -> 0.9115
store 1006 -> 0.9159
store 1007 -> 0.8923
store 1008 -> 0.8381
store 1009 -> 0.9293
store 1010 -> 0.9010
store 1011 -> 0.9132
store 1012 -> 0.9630
store 1013 -> 0.9619
store 1014 -> 0.9215
store 1015 -> 0.9395
store 1016 -> 0.8904
store 1017 -> 0.9468
store 1018 -> 0.9245
store 1019 -> 0.9567
store 1020 -> 0.9069
store 1021 -> 0.9547
store 1022 -> 0.9243
store 1023 -> 0.9508
store 1024 -> 0.9405
store 1025 -> 0.8528
store 1026 -> 0.9473
store 1027 -> 0.9299
store 1028 -> 0.8854
store 1029 -> 0.9551
store 1030 -> 0.9162
store 1031 -> 0.8641
store 1032 -> 0.9510
store 1033 -> 0.9754
store 1034 -> 0.8985
store 1035 -> 0.9604
store 1036 -> 0.9225
store 1037 -> 0.8780
store 1038 -> 0.9069
store 1039 -> 0.9475
store 1040 -> 0.9529
store 1041 -> 0.9312
store 1042 -> 0.9321
store 1043 -> 0.8470
store 1044 -> 0.9552
store 1045 -> 0.9386
store 1046 -> 0.9165
store 1047 ->



store 1280 -> 0.8175
store 1281 -> 0.8983
store 1282 -> 0.9156
store 1283 -> 0.9238
store 1284 -> 0.9118
store 1285 -> 0.8481
store 1286 -> 0.9652
store 1287 -> 0.9432
store 1288 -> 0.9273
store 1289 -> 0.9094
store 1290 -> 0.8500
store 1291 -> 0.8695
store 1292 -> 0.9223
store 1293 -> 0.9176
store 1294 -> 0.8510
store 1295 -> 0.9062
store 1296 -> 0.8827
store 1297 -> 0.9578
store 1298 -> 0.9401
store 1299 -> 0.8979
store 1300 -> 0.9570
store 1301 -> 0.9109
store 1302 -> 0.9542
store 1303 -> 0.7846
store 1304 -> 0.9558
store 1305 -> 0.8739
store 1306 -> 0.8810
store 1307 -> 0.5976
store 1308 -> 0.9665
store 1309 -> 0.9242
store 1310 -> 0.8517
store 1311 -> 0.9079
store 1312 -> 0.8652
store 1313 -> 0.9423
store 1314 -> 0.9156
store 1315 -> 0.8732
store 1316 -> 0.9328
store 1317 -> 0.9111
store 1318 -> 0.9156
store 1319 -> 0.9199
store 1320 -> 0.9653
store 1321 -> 0.9392
store 1322 -> 0.9261
store 1323 -> 0.9239
store 1324 -> 0.8949
store 1325 -> 0.9547
store 1326 -> 0.8847
store 1327 ->



store 1346 -> 0.8427
store 1347 -> 0.9427
store 1348 -> 0.9025
store 1349 -> 0.8273
store 1350 -> 0.9436
store 1351 -> 0.8538
store 1352 -> 0.7958
store 1353 -> 0.9410
store 1354 -> 0.9264
store 1355 -> 0.9694
store 1356 -> 0.9321
store 1357 -> 0.8060
store 1358 -> 0.9591
store 1359 -> 0.8862
store 1360 -> 0.9138
store 1361 -> 0.9710
store 1362 -> 0.9202
store 1363 -> 0.9506
store 1364 -> 0.8525
store 1365 -> 0.9564
store 1366 -> 0.9510
store 1367 -> 0.8273
store 1368 -> 0.9107
store 1369 -> 0.9270
store 1370 -> 0.8165
store 1371 -> 0.9531
store 1372 -> 0.9404
store 1373 -> 0.9114
store 1374 -> 0.9004
store 1375 -> 0.9368
store 1376 -> 0.9571
store 1377 -> 0.9302
store 1378 -> 0.9017
store 1379 -> 0.9201
store 1380 -> 0.9089
store 1381 -> 0.9635
store 1382 -> 0.9256
store 1383 -> 0.9478
store 1384 -> 0.9218
store 1385 -> 0.9112
store 1386 -> 0.9706
store 1387 -> 0.7782
store 1388 -> 0.9305
store 1389 -> 0.9397
store 1390 -> 0.9312
store 1391 -> 0.9342
store 1392 -> 0.9132
store 1393 ->



store 1711 -> 0.9042
store 1712 -> 0.9165
store 1713 -> 0.9054
store 1714 -> 0.8887
store 1715 -> 0.9393
store 1716 -> 0.9016
store 1717 -> 0.9182
store 1718 -> 0.8803
store 1719 -> 0.8940
store 1720 -> 0.7151
store 1721 -> 0.9488
store 1722 -> 0.8648
store 1723 -> 0.9069
store 1724 -> 0.9321
store 1725 -> 0.9523
store 1726 -> 0.9398
store 1727 -> 0.9274
store 1728 -> 0.8587
store 1729 -> 0.8106
store 1730 -> 0.9351
store 1731 -> 0.9466
store 1732 -> 0.9208
store 1733 -> 0.8595
store 1734 -> 0.9072
store 1735 -> 0.9261
store 1736 -> 0.9388
store 1737 -> 0.9057
store 1738 -> 0.9482
store 1739 -> 0.6765
store 1740 -> 0.9242
store 1741 -> 0.9304
store 1742 -> 0.9394
store 1743 -> 0.9222
store 1744 -> 0.9322
store 1745 -> 0.9523
store 1746 -> 0.8906
store 1747 -> 0.9411
store 1748 -> 0.9232


### Compute Lanzi error

In [82]:
# set of regions
R = sorted(all_train_orig.Region.unique().astype(int))
# set of predicted months
months = [key for key, value in predictions[1000].items()]
# set of stores by region
dict_store_byRegion = all_train_orig[['Region', 'StoreID']].drop_duplicates()\
.set_index('StoreID').groupby('Region').groups


def region_error(region, predictions):    
    num = 0
    den = 0
    for store in dict_store_byRegion[str(region)]:
        for month in months:
            predicted = predictions[store][month]['predicted']
            actual = predictions[store][month]['actual']
            
            num += abs(actual - predicted)
            den += actual
    
    return num/den
    
# total_error input:
#
# region_errors = [0.3, 0.5, ... ]

def total_error(region_errors):
#     print(region_errors)
    return sum(region_errors)/len(region_errors)

def lanzi_error(predictions):
    region_errors = []
    for r in R:
        region_errors.append(region_error(r, predictions))
    
    return total_error(region_errors)

In [83]:
print('Lanzi error: {}'.format(lanzi_error(predictions)))

Lanzi error: 0.26228652696055943
