# Ensemble

## Load data

In [194]:
%matplotlib inline
import datetime
import math
import os
import pickle
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import Series, DataFrame

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso,Ridge
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler, LabelEncoder
from xgboost.sklearn import XGBRegressor


pd.set_option('display.max_columns', None) # no truncate columns

In [195]:
# Environment settings
# Directory (relative to the notebook's working directory) holding the pickled data.
data_path_out = 'Data/output/'
    
# Deserialize previously saved data from "data-visualization"
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'train_pp.obj' if it was produced by a trusted step of this project.
with open(data_path_out + 'train_pp.obj', 'rb') as file:
    all_train = pickle.load(file)


In [196]:
# Keep an untouched snapshot (it still carries 'Region') before stripping that
# column from the working frame.
all_train_orig = all_train.copy()
all_train = all_train.drop(labels='Region', axis='columns')

### Drop variables that are no longer needed

In [197]:
# Remove both columns in a single pass.
all_train = all_train.drop(labels=['NumberOfCustomers', 'Date'], axis='columns')

## Averaging model

In [198]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor whose prediction is the plain mean of its members.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted prototypes. They are never fitted themselves; ``fit`` works
        on clones stored in ``models_``.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every prototype, fit each clone on ``(X, y)`` and return self."""
        # Clone first so the user-supplied prototypes stay untouched.
        self.models_ = list(map(clone, self.models))

        for estimator in self.models_:
            estimator.fit(X, y)

        return self

    def predict(self, X):
        """Return, for each row of ``X``, the mean of the fitted clones' predictions."""
        # One column per fitted clone, one row per sample.
        per_model = np.column_stack([
            estimator.predict(X) for estimator in self.models_
        ])
        return np.mean(per_model, axis=1)

## Stacked Model

In [199]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked ensemble: base models feed out-of-fold predictions to a meta-model.

    Parameters
    ----------
    base_models : iterable of estimators
        Unfitted prototypes; one clone per fold is fitted for each of them.
    meta_model : estimator
        Unfitted prototype; its clone is trained on the out-of-fold
        predictions of the base models.
    n_folds : int, default 5
        Number of KFold splits used to build the out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    @staticmethod
    def _unwrap_pandas(data):
        """Return the underlying array of a pandas object; pass anything else through."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return data

    def fit(self, X, y):
        """Fit per-fold clones of the base models, then the meta-model; return self."""
        # KFold yields positional indices. Indexing a DataFrame with `[]`
        # selects columns and indexing a Series is label-based, so pandas
        # inputs are unwrapped to arrays before the positional slicing below.
        X = self._unwrap_pandas(X)
        y = self._unwrap_pandas(y)

        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        # Now train the cloned meta-model using the out-of-fold predictions as new features
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on the fold-averaged base-model predictions.

        For every base model, the predictions of its per-fold clones are
        averaged; these averages (one column per base model) are the
        meta-features handed to the meta-model.
        """
        # Feed the clones the same kind of input they were fitted on.
        X = self._unwrap_pandas(X)
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [200]:
# define the model
    
# model = Lasso(alpha=50)
# model2 = Ridge(alpha=1)
# model3 =XGBRegressor(max_depth=4,
#                             gamma=0.05, 
#                             learning_rate=0.05, 
#                                  n_estimators=500,
#                                  subsample=0.3, silent=1,
#                                  random_state =7, nthread = -1)

# model4 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
#                                        max_depth=2,loss='lad',random_state =5)


# model = AveragingModels(models = (model1,model2,model3,model4))

## Ensemble model

## Cross-validation store by store

In [201]:
# results = {}
# for storeid in all_train.StoreID.unique():
#     train = all_train[all_train.StoreID == storeid]
#     y_train = train.NumberOfSales
#     x_train = train.drop('NumberOfSales',axis = 1)
#     kfold = KFold(n_splits=10,shuffle = True, random_state=7)
#     results[storeid] = cross_val_score(model, x_train, y_train, scoring='r2', cv=kfold)
#     print('Cross-validation for {} -> score: {:.4f} with +/- {:.4f}'\
#           .format(storeid,results[storeid].mean(),results[storeid].std()))

In [202]:
# df_results = pd.DataFrame.from_dict(results).T
# df_results_mean = df_results.mean(axis=1)
# df_results_mean[df_results_mean < 0.8]

In [203]:
# 'Differential' is removed from the working frame before the per-store models are trained.
all_train = all_train.drop(labels="Differential", axis='columns')

## Lanzi Error

### Fit model and make predictions

In [204]:
# train by month
def split_dataset_bymonth(test_year, test_months, train_set):
    """Split ``train_set`` into train/test parts by calendar month.

    Rows whose ``year`` equals ``test_year`` and whose ``month`` is in
    ``test_months`` form the test set; every other row forms the train set.

    Parameters
    ----------
    test_year : value compared against the ``year`` column.
    test_months : list-like of values matched against the ``month`` column.
    train_set : DataFrame with ``year``, ``month`` and ``NumberOfSales`` columns.

    Returns
    -------
    tuple ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` frames are
    the rows without ``NumberOfSales`` and the ``y_*`` series are that column.
    """
    # Use the frame that was passed in; the previous body read a global
    # named `train`, silently ignoring the `train_set` argument.
    test_mask = (train_set.year == test_year) & train_set.month.isin(test_months)

    # define the train set
    train_dataset = train_set[~test_mask]
    x_train = train_dataset.drop('NumberOfSales', axis=1)
    y_train = train_dataset.NumberOfSales

    # define the test set
    test_dataset = train_set[test_mask]
    x_test = test_dataset.drop('NumberOfSales', axis=1)
    y_test = test_dataset.NumberOfSales

    return (x_train, y_train, x_test, y_test)

In [206]:
# Per-store evaluation: for every store, fit an averaging ensemble on all data
# except March-April 2017 and score it on those two held-out months.
scores = {}                       # storeid -> R^2 on the held-out months
scores_mae = {}                   # storeid -> MAE on the held-out months
predictions = defaultdict(dict)   # storeid -> {month: {'predicted', 'actual'}, 'r2', 'mae'}
store_pred = {}                   # storeid -> R^2 (same values as `scores`)
shopping_center_ids = [1129,1267,1280,1307,1330,1339,1357,1387,1676]
region2_stores = all_train[all_train['Region_2'] ==1]['StoreID'].unique()
ids = all_train.StoreID.unique()

for storeid in ids:

    # Fresh, unfitted estimators for every store.
    model1 = Lasso(alpha=20)
    model2 = XGBRegressor(n_estimators=100)
    model3 = GradientBoostingRegressor(n_estimators=50, max_depth=5, learning_rate=0.07,
                                       loss='huber', random_state=5)
    # Seeded so that repeated runs of this cell give the same scores.
    model4 = ExtraTreesRegressor(n_estimators=50, random_state=5)

    model = AveragingModels(models=(model1, model2, model3, model4))

    # split the dataset
    train = all_train[all_train.StoreID == storeid]

    x_train, y_train, x_test, y_test = \
        split_dataset_bymonth(2017, [3, 4], train)

    # train the model with the training set
    model.fit(x_train, y_train)

    # Predict the held-out rows once and reuse the result for every metric
    # and for the monthly totals below.
    test_prediction = model.predict(x_test)

    # scoring
    scores[storeid] = r2_score(y_test, test_prediction)
    scores_mae[storeid] = mean_absolute_error(y_test, test_prediction)

    print('store r2 {} -> {:.4f}'.format(storeid, scores[storeid]))
    print('store mae {} -> {:.2f}'.format(storeid, scores_mae[storeid]))

    store_pred[storeid] = scores[storeid]

    test_months = x_test.month.values
    actual_sales = y_test.values
    for month in x_test.month.unique():
        # daily predictions (truncated to integers) and actuals for this month
        in_month = test_months == month
        month_prediction = test_prediction[in_month].astype("int")
        month_actual = actual_sales[in_month]

        # store the monthly totals of the test set
        predictions[storeid][month] = {
            'predicted': np.sum(month_prediction),
            'actual': np.sum(month_actual)
        }

    # Store-level metrics live next to the month entries, so code iterating
    # over predictions[storeid] must skip the 'r2' and 'mae' keys.
    predictions[storeid]['r2'] = scores[storeid]
    predictions[storeid]['mae'] = scores_mae[storeid]


store r2 1000 -> 0.8179
store mae 1000 -> 902.23
store r2 1001 -> 0.8272
store mae 1001 -> 379.26
store r2 1002 -> 0.9624
store mae 1002 -> 327.66
store r2 1003 -> 0.9483
store mae 1003 -> 395.26
store r2 1004 -> 0.9505
store mae 1004 -> 254.66
store r2 1005 -> 0.9138
store mae 1005 -> 361.64
store r2 1006 -> 0.9256
store mae 1006 -> 546.09
store r2 1007 -> 0.8992
store mae 1007 -> 344.53
store r2 1008 -> 0.8723
store mae 1008 -> 704.88
store r2 1009 -> 0.9440
store mae 1009 -> 445.39
store r2 1010 -> 0.9107
store mae 1010 -> 512.39
store r2 1011 -> 0.9134
store mae 1011 -> 221.69
store r2 1012 -> 0.9685
store mae 1012 -> 371.39
store r2 1013 -> 0.9717
store mae 1013 -> 298.91
store r2 1014 -> 0.9229
store mae 1014 -> 330.70
store r2 1015 -> 0.9503
store mae 1015 -> 421.91
store r2 1016 -> 0.8952
store mae 1016 -> 382.10
store r2 1017 -> 0.9559
store mae 1017 -> 199.67
store r2 1018 -> 0.9345
store mae 1018 -> 294.73
store r2 1019 -> 0.9666
store mae 1019 -> 310.05
store r2 1020 -> 0.9

store r2 1168 -> 0.9410
store mae 1168 -> 454.96
store r2 1169 -> 0.9154
store mae 1169 -> 502.13
store r2 1170 -> 0.9680
store mae 1170 -> 360.58
store r2 1171 -> 0.8907
store mae 1171 -> 184.33
store r2 1172 -> 0.9276
store mae 1172 -> 439.26
store r2 1173 -> 0.9455
store mae 1173 -> 305.47
store r2 1174 -> 0.9359
store mae 1174 -> 390.45
store r2 1175 -> 0.9088
store mae 1175 -> 386.69
store r2 1176 -> 0.9631
store mae 1176 -> 399.99
store r2 1177 -> 0.9505
store mae 1177 -> 403.36
store r2 1178 -> 0.9377
store mae 1178 -> 385.84
store r2 1179 -> 0.9373
store mae 1179 -> 277.51
store r2 1180 -> 0.9703
store mae 1180 -> 229.20
store r2 1181 -> 0.9734
store mae 1181 -> 357.88
store r2 1182 -> 0.9178
store mae 1182 -> 203.80
store r2 1183 -> 0.8898
store mae 1183 -> 227.66
store r2 1184 -> 0.9378
store mae 1184 -> 297.80
store r2 1185 -> 0.9541
store mae 1185 -> 364.58
store r2 1186 -> 0.9517
store mae 1186 -> 412.31
store r2 1187 -> 0.8922
store mae 1187 -> 269.96
store r2 1188 -> 0.9

KeyboardInterrupt: 

### Compute Lanzi error

In [None]:
# set of regions
R = sorted(all_train_orig.Region.unique().astype(int))
# set of predicted months, read from store 1000.
# Each store's entry also holds scalar 'r2' and 'mae' values; only the keys
# that map to a {'predicted', 'actual'} dict are months, so the scalars are
# filtered out here (otherwise region_error would try to index into a float).
months = [key for key, value in predictions[1000].items() if isinstance(value, dict)]
# set of stores by region
dict_store_byRegion = all_train_orig[['Region', 'StoreID']].drop_duplicates()\
.set_index('StoreID').groupby('Region').groups

def region_error(region, predictions):
    """Return the relative absolute error of one region.

    The result is sum(|actual - predicted|) / sum(actual), taken over every
    store listed for ``str(region)`` in ``dict_store_byRegion`` and every
    month in ``months``.
    """
    monthly_totals = []
    for store in dict_store_byRegion[str(region)]:
        for month in months:
            entry = predictions[store][month]
            monthly_totals.append((entry['predicted'], entry['actual']))

    abs_deviation = sum(abs(actual - predicted) for predicted, actual in monthly_totals)
    total_actual = sum(actual for _, actual in monthly_totals)

    return abs_deviation/total_actual
    
def total_error(region_errors):
    """Return the arithmetic mean of the per-region errors.

    ``region_errors`` is a sequence with one number per region,
    e.g. ``[0.3, 0.5, ...]``.
    """
    n_regions = len(region_errors)
    return sum(region_errors) / n_regions

def lanzi_error(predictions):
    """Return the mean of ``region_error`` over every region in ``R``."""
    per_region = [region_error(region_id, predictions) for region_id in R]
    return total_error(per_region)

In [None]:
# Compute the overall error once, then report it.
lanzi_score = lanzi_error(predictions)
print('Lanzi error: {}'.format(lanzi_score))