# Ensemble

## Load data

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib 
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
from collections import defaultdict
import datetime
import pickle
import math
import os
from xgboost import plot_importance

from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import Lasso,Ridge
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


pd.set_option('display.max_columns', None) # no truncate columns

In [18]:
# Environment settings
data_path_out = 'Data/output/'
    
# Deserialize previously saved data from "data-visualization"
with open(data_path_out + 'train_pp.obj', 'rb') as file:
    all_train = pickle.load(file)
    
all_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523021 entries, 0 to 523020
Data columns (total 89 columns):
CloudCover                                 523021 non-null float32
Date                                       523021 non-null datetime64[ns]
HasPromotions                              523021 non-null bool
IsHoliday                                  523021 non-null bool
IsOpen                                     523021 non-null bool
Max_Dew_PointC                             523021 non-null int8
Max_Humidity                               500021 non-null float64
Max_Sea_Level_PressurehPa                  523021 non-null uint16
Max_TemperatureC                           523021 non-null int8
Max_VisibilityKm                           500021 non-null float32
Max_Wind_SpeedKm_h                         523021 non-null uint8
Mean_Dew_PointC                            523021 non-null int8
Mean_Humidity                              500021 non-null float64
Mean_Sea_Level_PressurehPa      

### Drop now useless variables

In [13]:
all_train = all_train.drop(labels = ['NumberOfCustomers'],axis=1)
all_train = all_train.drop('Date',axis=1)

## Averaging model

In [14]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

## Ensemble model

In [15]:
# define the model
    
model1 = Lasso(alpha=50)
model2 = Ridge(alpha=1)
model3 =XGBRegressor(max_depth=4,
                            gamma=0.05, 
                            learning_rate=0.05, 
                                 n_estimators=500,
                                 subsample=0.3, silent=1,
                                 random_state =7, nthread = -1)

model4 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                       max_depth=2,loss='lad',random_state =5)


model = AveragingModels(models = (model1,model2,model3,model4))

## Cross-validation store by store

In [17]:
results = {}
for storeid in all_train.StoreID.unique():
    train = all_train[all_train.StoreID == storeid]
    y_train = train.NumberOfSales
    x_train = train.drop('NumberOfSales',axis = 1)
    kfold = KFold(n_splits=10,shuffle = True, random_state=7)
    results[storeid] = cross_val_score(model, x_train, y_train, scoring='r2', cv=kfold)
    print('Cross-validation for {} -> score: {:.4f} with +/- {:.4f}'\
          .format(storeid,results[storeid].mean(),results[storeid].std()))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [7]:
df_results = pd.DataFrame.from_dict(results).T
df_results_mean = df_results.mean(axis=1)
df_results_mean[df_results_mean < 0.8]

1001    0.799816
1194    0.752998
1267    0.784283
1330    0.515442
1339    0.735149
1546    0.394948
1729    0.583855
dtype: float64

## Lanzi Error

### Fit model and make predictions

In [9]:
# train by month
def split_dataset_bymonth(test_year, test_months, train_set):
    test_mask = (train.year == test_year) & train.month.isin(test_months)
    
    # define the train set
    train_dataset = train[~test_mask]
    x_train = train_dataset.drop('NumberOfSales', axis=1)
    y_train = train_dataset.NumberOfSales
    
    # define the test set
    test_dataset = train[test_mask]
    x_test = test_dataset.drop('NumberOfSales', axis=1)
    y_test = test_dataset.NumberOfSales
    
    return (x_train, y_train, x_test, y_test)

In [10]:
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

scores = {}
predictions = defaultdict(dict)
store_pred = {}
shopping_center_ids = [1129,1267,1280,1307,1330,1339,1357,1387,1676]
ids = train_df.StoreID.unique()
for storeid in ids:


    # split the dataset
    train = train_df[train_df.StoreID == storeid]
    
    x_train, y_train, x_test, y_test =\
    split_dataset_bymonth(2018, [1,2], train)
    
    # train the model with the training set
    model.fit(x_train, y_train)
    
    
    # scoring
    scores[storeid] = r2_score(y_test, model.predict(x_test))
    print('store {} -> {:.4f}'.format(storeid, scores[storeid]))
    store_pred[storeid] = scores[storeid]
    # predict the test set with the trained model
    for month in x_test.month.unique():
        # get daily predictions for each month in the test set
        month_prediction =model.predict(x_test[x_test.month == month])
        month_actual = y_test.loc[x_test[x_test.month == month].index].values
        
        # store the monthly mean of the test set
        predictions[storeid][month] = {
            'predicted': np.mean(month_prediction),
            'actual': np.mean(month_actual)
        }

store 1000 -> 0.8652
store 1001 -> 0.5923
store 1002 -> 0.9507
store 1003 -> 0.9390
store 1004 -> 0.9075
store 1005 -> 0.9011
store 1006 -> 0.8960
store 1007 -> 0.9314
store 1008 -> 0.0465
store 1009 -> 0.9304
store 1010 -> 0.9031
store 1011 -> 0.8322
store 1012 -> 0.9433
store 1013 -> 0.9634
store 1014 -> 0.8061
store 1015 -> 0.9046
store 1016 -> 0.8560
store 1017 -> 0.9498
store 1018 -> 0.9424
store 1019 -> 0.9357
store 1020 -> 0.9071
store 1021 -> 0.9493
store 1022 -> 0.9673
store 1023 -> 0.8875
store 1024 -> 0.9308
store 1025 -> 0.8855
store 1026 -> 0.9428
store 1027 -> 0.9427
store 1028 -> 0.8730
store 1029 -> 0.8482
store 1030 -> 0.9187
store 1031 -> 0.8863
store 1032 -> 0.9274
store 1033 -> 0.9632
store 1034 -> 0.9064
store 1035 -> 0.9214
store 1036 -> 0.9308
store 1037 -> 0.1532
store 1038 -> 0.9290
store 1039 -> 0.9628
store 1040 -> 0.9447
store 1041 -> 0.9422
store 1042 -> 0.8619
store 1043 -> 0.9190
store 1044 -> 0.9181
store 1045 -> 0.9299
store 1046 -> 0.9277
store 1047 ->

store 1391 -> 0.8892
store 1392 -> 0.9555
store 1393 -> 0.9565
store 1394 -> 0.9358
store 1395 -> 0.9199
store 1396 -> 0.9490
store 1397 -> 0.9378
store 1398 -> 0.9502
store 1399 -> 0.8978
store 1400 -> 0.9251
store 1401 -> 0.9212
store 1402 -> 0.9641
store 1403 -> 0.9032
store 1404 -> 0.9701
store 1405 -> 0.9529
store 1406 -> 0.8046
store 1407 -> 0.6616
store 1408 -> 0.9048
store 1409 -> 0.8467
store 1410 -> 0.8412
store 1411 -> 0.9016
store 1412 -> 0.9763
store 1413 -> 0.8845
store 1414 -> 0.9504
store 1415 -> 0.9237
store 1416 -> 0.9228
store 1417 -> 0.8940
store 1418 -> 0.9475
store 1419 -> 0.9124
store 1420 -> 0.9639
store 1421 -> 0.9188
store 1422 -> 0.6838
store 1423 -> 0.9452
store 1424 -> 0.8640
store 1425 -> 0.9571
store 1426 -> 0.9411
store 1427 -> 0.9336
store 1428 -> 0.9453
store 1429 -> 0.7693
store 1430 -> 0.8384
store 1431 -> 0.9456
store 1432 -> 0.8995
store 1433 -> 0.8631
store 1434 -> 0.9424
store 1435 -> 0.9115
store 1436 -> 0.9146
store 1437 -> 0.9508
store 1438 ->

### Compute Lanzi error

In [11]:
# set of regions
R = sorted(all_train_orig.Region.unique().astype(int))
# set of predicted months
months = [key for key, value in predictions[1000].items()]
# set of stores by region
dict_store_byRegion = all_train_orig[['Region', 'StoreID']].drop_duplicates()\
.set_index('StoreID').groupby('Region').groups


def region_error(region, predictions):    
    num = 0
    den = 0
    for store in dict_store_byRegion[str(region)]:
        for month in months:
            predicted = predictions[store][month]['predicted']
            actual = predictions[store][month]['actual']
            
            num += abs(actual - predicted)
            den += actual
    
    return num/den
    
# total_error input:
#
# region_errors = [0.3, 0.5, ... ]

def total_error(region_errors):
    return sum(region_errors)/len(region_errors)

def lanzi_error(predictions):
    region_errors = []
    for r in R:
        region_errors.append(region_error(r, predictions))
    
    return total_error(region_errors)

AttributeError: 'DataFrame' object has no attribute 'Region'

In [None]:
print('Lanzi error: {}'.format(lanzi_error(predictions)))