# Random Forest

In [443]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings
from datetime import datetime
from datetime import timedelta
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
import  plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from sklearn.ensemble import RandomForestRegressor
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import os
import colorlover  as cl
from tqdm import tqdm
from scipy.special import boxcox1p
from scipy.stats import skew



# environment settings
data_path_out = 'Data/output/'

#hack to avoid showing deprecationg warnings
warnings.filterwarnings("ignore") 
init_notebook_mode(connected=True)


# Deserialize previously saved data from "preprocessing"
with open(data_path_out+'train_pp.obj', 'rb') as train_pp:
    train_df = pickle.load(train_pp)

train_df_orig = train_df.copy()
#Dummies
train_df = pd.get_dummies(train_df)
train_df.shape

(523021, 73)

## Check that all closed stores don't have sales

In [444]:
train_df[(train_df['IsOpen'] == False) & (train_df['NumberOfSales']>0)]

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,...,Events_Rain-Hail,Events_Rain-Hail-Thunderstorm,Events_Rain-Snow,Events_Rain-Snow-Hail,Events_Rain-Snow-Hail-Thunderstorm,Events_Rain-Snow-Thunderstorm,Events_Rain-Thunderstorm,Events_Snow,Events_Snow-Hail,Events_Thunderstorm


## New time features

In [445]:
train_df['day'] = train_df.Date.dt.day
train_df['month'] = train_df.Date.dt.month
train_df['year'] = train_df.Date.dt.year
train_df['WeekOfYear'] = train_df.Date.dt.week
train_df['DayOfWeek'] = train_df.Date.dt.dayofweek
train_df['DaysInMonth'] = train_df.Date.dt.daysinmonth
train_df['DayOfYear'] = train_df.Date.dt.dayofyear

## New sales features

In [446]:

def num_of_features(regional_week_year,week,year,column_of_interest):
    num_holidays=0

    if week ==1:
        week_before=[52,year-1]
        week_after=[2,year]
    if week == 52:
        week_after=[1,year+1]
        week_before=[51,year]
    if (week>1) & (week<52):
        week_before=[week-1,year]
        week_after=[week+1,year]
    
    before_df = regional_week_year[(regional_week_year['WeekOfYear']==week_before[0])&
                                  (regional_week_year['year']==week_before[1])]
    
    after_df = regional_week_year[(regional_week_year['WeekOfYear']==week_after[0])&
                                  (regional_week_year['year']==week_after[1])]
    this_df = regional_week_year[(regional_week_year['WeekOfYear']==week)&
                                 (regional_week_year['year']==year)]
    
    num_ago = before_df[before_df[column_of_interest]][column_of_interest].sum()
    num_next = after_df[after_df[column_of_interest]][column_of_interest].sum()
    num_this = this_df[this_df[column_of_interest]][column_of_interest].sum()
    
    return num_ago,num_next,num_this

def create_new_features(before,current,after,column_of_interest):
    train_df[before] = 0
    train_df[current]=0
    train_df[after] = 0

    region_list= ["Region_"+str(d) for d in range(0,11)]
    for region in tqdm(region_list):
        curr_region = train_df[train_df[region]==1]

        #get all valid dates of that region
        regional_week_year=curr_region[['WeekOfYear','year',column_of_interest]]

        #get all store ids of that region
        regional_stores=len(curr_region['StoreID'].unique())

        week_year_list =regional_week_year[['WeekOfYear','year']].drop_duplicates().values.tolist()

        for date in week_year_list:
            num_ago,num_next,num_this = num_of_features(regional_week_year,
                                                        date[0],
                                                        date[1],
                                                        column_of_interest)

            train_df.at[((train_df['WeekOfYear']==date[0])&
                         (train_df['year']==date[1])),before]=int(num_ago/regional_stores)
            train_df.at[((train_df['WeekOfYear']==date[0])&
                         (train_df['year']==date[1])),after]=int(num_next/regional_stores)
            train_df.at[((train_df['WeekOfYear']==date[0])&
                         (train_df['year']==date[1])),current]=int(num_this/regional_stores)

In [447]:
create_new_features('HolidaysWeekBefore','HolidaysWeekCurrent','HolidaysWeekAfter','IsHoliday')
create_new_features('PromoWeekBefore','PromoWeekCurrent','PromoWeekAfter','HasPromotions')

100%|██████████| 11/11 [01:33<00:00,  8.49s/it]
100%|██████████| 11/11 [01:41<00:00,  9.18s/it]


## Drop NumberOfCustomers

In [448]:
train_df =train_df.drop('NumberOfCustomers',axis=1)

## Drop closed stores

In [449]:
# mask_closed = (train_df['IsOpen']==False)
# train_df=train_df[~mask_closed]
# train_df.shape

## Date dictionary

In [450]:
all_dates = pd.DatetimeIndex(train_df['Date'])
month_dict = { d.strftime("%B%Y") :datetime.strptime(d.strftime('%m-%Y'),'%m-%Y') for d in all_dates}

## PCA -> useless

In [428]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

train_df2 = train_df.drop('Date',axis=1)
sc= StandardScaler()
train_scaled = sc.fit_transform(train_df2.values)
cov_mat = np.cov(train_scaled.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
tot = sum(eigen_vals)
var_exp = [(i/tot) for i in sorted(eigen_vals,reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print(cum_var_exp)

[0.08676394 0.14013837 0.18642801 0.22880135 0.26774312 0.30373827
 0.33464839 0.36136781 0.38706481 0.41161405 0.4337767  0.45417583
 0.47373189 0.49222562 0.50952897 0.52668562 0.54307328 0.55841678
 0.57346096 0.58838674 0.60228555 0.61556069 0.62861582 0.64146395
 0.65409994 0.66623234 0.67831901 0.69031365 0.70226763 0.71416616
 0.72602676 0.73787622 0.74967435 0.76144226 0.77319758 0.7849209
 0.79662476 0.80817145 0.81965038 0.83097105 0.84223815 0.85344159
 0.86442826 0.87444916 0.88408404 0.89342405 0.90218681 0.91091824
 0.91934789 0.92707515 0.93439986 0.94140194 0.94783679 0.9542359
 0.9603776  0.96578768 0.97057429 0.97508781 0.97933529 0.98340357
 0.98708497 0.9902571  0.99301703 0.99467599 0.9960731  0.99726338
 0.99809355 0.99869529 0.99911185 0.99950283 0.99969718 0.99982258
 0.99990028 0.99997304 0.99999987 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.        ]


## 1. Method Training and predicting one-hold out 

In [16]:
from sklearn.decomposition import PCA

def monthly_training(training_interval=['03-2017','02-2018'],
                     verbose=False,
                     regressor=RandomForestRegressor(n_estimators=50,n_jobs=-1)):
    
    init_date=datetime.strptime(training_interval[0],'%m-%Y')
    end_date =datetime.strptime(training_interval[1],'%m-%Y')
    
    month_scores = []
    y_test_list = []
    predicted_list= []
    predicted_df_list = []
    training_months = []
    training_df_list = []
    for month_name, date in month_dict.items():

        if (date >= init_date) & (date<= end_date) :
                if verbose== True:
                    print("Doing month",month_name)
                    
                training_months.append(month_name)
                mask = ((train_df['month']==date.month) & (train_df['year']==date.year))
                test = train_df[mask]
                train = train_df[~mask]
                
                y_train = train.NumberOfSales
                X_train = train.drop('NumberOfSales',axis = 1)

                y_test = test.NumberOfSales
                X_test = test.drop('NumberOfSales',axis = 1)

                X_train = X_train.drop('Date',axis=1)
                X_test = X_test.drop('Date',axis=1)
                
                #slightly better with this scaler
                scaler = RobustScaler()
                X_train_scaled=scaler.fit(X_train).transform(X_train)
                X_test_scaled=scaler.transform(X_test)
                reg =  regressor.fit(X_train_scaled,y_train)
                
                train_copy  = train_df.drop(['NumberOfSales'], axis=1)
                train_copy = train_copy.drop(['Date'],axis=1)
                feat_labels = train_copy.columns[0:]
                importances = reg.feature_importances_
                indices = np.argsort(importances)[::-1]
                for f in range(X_train_scaled.shape[1]):
                    print("%2d %-*s %f" %(f+1,30,feat_labels[indices[f]],importances[indices[f]]))
                
                current_score = reg.score(X_test_scaled,y_test)
                current_prediction = reg.predict(X_test_scaled)
                month_scores.append(current_score)
                predicted_list.append(current_prediction)
                y_test_list.append(y_test)
                
                if verbose == True:
                    print("Month {} has shape {}\n\t".format(month_name,test.shape))
                    print("-Score {}".format(current_score))
                
                
                real_df = train_df.copy()
                month_test_df = train_df[mask]
                month_train_df= train_df[~mask]

                month_test_df= month_test_df.drop('NumberOfSales',axis=1)
                prediction_=pd.Series(current_prediction)
                month_test_df['NumberOfSales']=prediction_.values.astype(int)
                predicted_df = pd.concat([month_test_df,month_train_df]).reset_index()
                predicted_df= predicted_df[list(train_df.columns.values)]
                
                predicted_df_list.append(predicted_df)
                training_df_list.append(train)
                
    return {
        'Scores' : month_scores,
        'Real' : y_test_list,
        'Predictions' : predicted_list,
        'Training_dates': training_months,
        'Training_df' : training_df_list,
        'Predicted_df' : predicted_df_list
    }


In [None]:
reg = RandomForestRegressor(n_estimators=50,n_jobs=-1)
prediction_result = monthly_training(verbose=True,
                                     training_interval=['05-2017','05-2017'],
                                     regressor = reg)

## 2  Method : Cross validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from collections import defaultdict
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import Lasso,Ridge,LinearRegression,RidgeCV,LassoCV

results = {}
sc= RobustScaler()
for storeid in train_df.StoreID.unique():
    train = train_df[train_df.StoreID == storeid]
    y_train = train.NumberOfSales
    X_train = train.drop('NumberOfSales',axis = 1)
    X_train = X_train.drop('Date',axis=1)
    X_train_scaled = sc.fit_transform(X_train)
    
    model = RandomForestRegressor(n_estimators=50,max_depth=5,n_jobs=-1)
    
    kfold = KFold(n_splits=10,shuffle = True, random_state=7)
    results[storeid] = cross_val_score(model, X_train_scaled, y_train, scoring='r2', cv=kfold)
    print('Cross-validation for {} -> score: {:.4f} with +/- {:.4f}'\
          .format(storeid,results[storeid].mean(),results[storeid].std()))

## Lanzi Error

In [453]:
# train by month
def split_dataset_bymonth(test_year, test_months, train_set):
    test_mask = (train.year == test_year) & train.month.isin(test_months)
    
    # define the train set
    train_dataset = train[~test_mask]
    x_train = train_dataset.drop('NumberOfSales', axis=1)
    y_train = train_dataset.NumberOfSales
    
    # define the test set
    test_dataset = train[test_mask]
    x_test = test_dataset.drop('NumberOfSales', axis=1)
    y_test = test_dataset.NumberOfSales
    
    return (x_train, y_train, x_test, y_test)

In [454]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [455]:
train_df = train_df.drop('Date',axis=1)

In [544]:
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

scores = {}
predictions = defaultdict(dict)
store_pred = {}
shopping_center_ids = [1129,1267,1280,1307,1330,1339,1357,1387,1676]
ids = train_df.StoreID.unique()
for storeid in ids:
    # define the model
    
    if storeid not in shopping_center_ids:
        model1 = Lasso(alpha=50)
        model2 = Ridge(alpha=1)
        model3 =XGBRegressor(max_depth=4,
                            gamma=0.05, 
                            learning_rate=0.05, 
                                 n_estimators=500,
                                 subsample=0.3, silent=1,
                                 random_state =7, nthread = -1)

        model4 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                       max_depth=2,loss='lad',random_state =5)


        model = AveragingModels(models = (model1,model2,model3,model4))


#     #model for shopping centers only -> makes lanzi error bigger althoug r^2 is lower..TOO BAD!
#     if storeid in shopping_center_ids:
#         model_shop1 = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
#                                     max_depth=2,loss='lad',random_state =5)
#         model_shop2 =XGBRegressor()
#         model= AveragingModels(models = (model_shop1,model_shop2))


    # split the dataset
    train = train_df[train_df.StoreID == storeid]
    
    x_train, y_train, x_test, y_test =\
    split_dataset_bymonth(2018, [1,2], train)
    
    # train the model with the training set
    model.fit(x_train, y_train)
    
    
    # scoring
    scores[storeid] = r2_score(y_test, model.predict(x_test))
    print('store {} -> {:.4f}'.format(storeid, scores[storeid]))
    store_pred[storeid] = scores[storeid]
    # predict the test set with the trained model
    for month in x_test.month.unique():
        # get daily predictions for each month in the test set
        month_prediction =model.predict(x_test[x_test.month == month])
        month_actual = y_test.loc[x_test[x_test.month == month].index].values
        
        # store the monthly mean of the test set
        predictions[storeid][month] = {
            'predicted': np.mean(month_prediction),
            'actual': np.mean(month_actual)
        }

store 1000 -> 0.9020
store 1001 -> 0.7405
store 1002 -> 0.9348
store 1003 -> 0.9174
store 1004 -> 0.9050
store 1005 -> 0.9204
store 1006 -> 0.8870
store 1007 -> 0.9157
store 1008 -> 0.5708
store 1009 -> 0.9349
store 1010 -> 0.8883
store 1011 -> 0.8643
store 1012 -> 0.9402
store 1013 -> 0.9475
store 1014 -> 0.8649
store 1015 -> 0.9036
store 1016 -> 0.8389
store 1017 -> 0.9380
store 1018 -> 0.9259
store 1019 -> 0.9331
store 1020 -> 0.8690
store 1021 -> 0.9261
store 1022 -> 0.9322
store 1023 -> 0.9180
store 1024 -> 0.9268
store 1025 -> 0.8697
store 1026 -> 0.9210
store 1027 -> 0.9317
store 1028 -> 0.8641
store 1029 -> 0.8908
store 1030 -> 0.8703
store 1031 -> 0.8742
store 1032 -> 0.9269
store 1033 -> 0.9529
store 1034 -> 0.8888
store 1035 -> 0.9219
store 1036 -> 0.9263
store 1037 -> 0.6556
store 1038 -> 0.9244
store 1039 -> 0.9560
store 1040 -> 0.9390
store 1041 -> 0.8876
store 1042 -> 0.8685
store 1043 -> 0.8908
store 1044 -> 0.9332
store 1045 -> 0.9171
store 1046 -> 0.9146
store 1047 ->

store 1391 -> 0.8759
store 1392 -> 0.9339
store 1393 -> 0.9529
store 1394 -> 0.9211
store 1395 -> 0.9156
store 1396 -> 0.9346
store 1397 -> 0.9141
store 1398 -> 0.9208
store 1399 -> 0.8961
store 1400 -> 0.9204
store 1401 -> 0.8609
store 1402 -> 0.9593
store 1403 -> 0.8918
store 1404 -> 0.9232
store 1405 -> 0.9271
store 1406 -> 0.8110
store 1407 -> 0.6429
store 1408 -> 0.8897
store 1409 -> 0.8642
store 1410 -> 0.8024
store 1411 -> 0.8769
store 1412 -> 0.9696
store 1413 -> 0.8571
store 1414 -> 0.9040
store 1415 -> 0.9190
store 1416 -> 0.8973
store 1417 -> 0.8811
store 1418 -> 0.9310
store 1419 -> 0.9063
store 1420 -> 0.9409
store 1421 -> 0.8955
store 1422 -> 0.4528
store 1423 -> 0.9509
store 1424 -> 0.8907
store 1425 -> 0.9656
store 1426 -> 0.9144
store 1427 -> 0.9118
store 1428 -> 0.9243
store 1429 -> 0.8121
store 1430 -> 0.8628
store 1431 -> 0.9251
store 1432 -> 0.8663
store 1433 -> 0.8643
store 1434 -> 0.9354
store 1435 -> 0.9167
store 1436 -> 0.8752
store 1437 -> 0.9393
store 1438 ->

In [545]:
#import operator
sorted_store_pred = sorted(store_pred.items(), key=operator.itemgetter(1))
# sorted_store_pred

In [546]:
# set of regions
R = sorted(train_df_orig.Region.unique().astype(int))
# set of predicted months
months = [key for key, value in predictions[1000].items()]
# set of stores by region
dict_store_byRegion = train_df_orig[['Region', 'StoreID']].drop_duplicates()\
.set_index('StoreID').groupby('Region').groups

# region_error inputs:
#
# int region = a number from 0 to 11
# dict predictions = {
#     int storeID: {
#         int month: {
#             str 'predicted': float,
#             str 'actual': float
#         }
#         ...
#     }
#     ...
# }
def region_error(region, predictions):    
    num = 0
    den = 0
    for store in dict_store_byRegion[str(region)]:
        for month in months:
            predicted = predictions[store][month]['predicted']
            actual = predictions[store][month]['actual']
            
            num += abs(actual - predicted)
            den += actual
    
    return num/den
    
# total_error input:

# region_errors = [0.3, 0.5, ... ]
def total_error(region_errors):
    return sum(region_errors)/len(region_errors)

def lanzi_error(predictions):
    region_errors = []
    for r in R:
        region_errors.append(region_error(r, predictions))
    
    return total_error(region_errors)

In [547]:
print('Lanzi error: {}'.format(lanzi_error(predictions)))

Lanzi error: 0.041999104189949205


## Plotting R^2 Error for each month

In [None]:
# os.system("say -v "+'Alice'+"Lanzi Merda")
# trace1 = go.Bar(
#             x=prediction_result['Training_dates'],
#             y=prediction_result['Scores'],
#             name='R^2 per region'
#     )

# trace2 = go.Scatter(x=prediction_result['Training_dates'],
#                     y=[np.asarray(prediction_result['Scores']).mean()]*len(prediction_result['Scores']),
#                     line = dict(color=('rgb(0, 0, 0)'),
#                     width=2, dash='dash',shape='hv'),
#                     name = 'Mean'
#                    )

# layout = go.Layout(
#     title= 'R^2',
#     yaxis=dict(
#         range=[0,1]
#     )
# )
# data=[trace1,trace2]
# fig = go.Figure(data=data, layout=layout)
# iplot(fig, filename='basic-bar')

## Predicted vs Real for march17

In [None]:
# regions=list(['Region_'+str(i) for i in range(0,11)])
# data_real=[]
# data_pred=[]
# for region in regions:
#     region_df_real = prediction_result['Training_df'][0][prediction_result['Training_df'][0][region]==1]
#     region_df_pred = prediction_result['Predicted_df'][0][prediction_result['Predicted_df'][0][region]==1]
#     sales_real = region_df_real['NumberOfSales'].sum()
#     sales_pred= region_df_pred['NumberOfSales'].sum()
#     data_real.append(sales_real)
#     data_pred.append(sales_pred)

# trace_r=go.Bar(x=np.arange(0,12),
#                 y=data_real,
#                 name='Real sales'
#                 )

# trace_p=go.Bar(x=np.arange(0,12),
#                  y=data_pred,
#                  name='Preicted Sales')

# data_tot = [trace_r, trace_p]
# layout = go.Layout(
#     barmode='group',
#     title='Real vs Predicted sales per region in March17',
#     yaxis=dict(title='Sales'),
#     xaxis=dict(title='Region')
# )

# fig = go.Figure(data=data_tot, layout=layout)
# iplot(fig, filename='Population info')