# Random Forest

In [70]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings
from datetime import datetime
from datetime import timedelta
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
import  plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from sklearn.ensemble import RandomForestRegressor
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import os
import colorlover  as cl
from tqdm import tqdm
from scipy.special import boxcox1p
from scipy.stats import skew


# environment settings
data_path_out = 'Data/output/'

#hack to avoid showing deprecationg warnings
warnings.filterwarnings("ignore") 
init_notebook_mode(connected=True)


# Deserialize previously saved data from "preprocessing"
with open(data_path_out+'train_pp.obj', 'rb') as train_pp:
    train_df = pickle.load(train_pp)

#Dummies
train_df = pd.get_dummies(train_df)
train_df.shape

(523021, 73)

## Check that all closed stores don't have sales

In [71]:
train_df[(train_df['IsOpen'] == False) & (train_df['NumberOfSales']>0)]

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,...,Events_Rain-Hail,Events_Rain-Hail-Thunderstorm,Events_Rain-Snow,Events_Rain-Snow-Hail,Events_Rain-Snow-Hail-Thunderstorm,Events_Rain-Snow-Thunderstorm,Events_Rain-Thunderstorm,Events_Snow,Events_Snow-Hail,Events_Thunderstorm


## New time features

In [72]:
train_df['day'] = train_df.Date.dt.day
train_df['month'] = train_df.Date.dt.month
train_df['year'] = train_df.Date.dt.year
train_df['WeekOfYear'] = train_df.Date.dt.week
train_df['DaysInMonth'] = train_df.Date.dt.daysinmonth
train_df['DayOfYear'] = train_df.Date.dt.dayofyear

## New sales features

In [73]:

def num_of_features(regional_week_year,week,year,column_of_interest):
    num_holidays=0

    if week ==1:
        week_before=[52,year-1]
        week_after=[2,year]
    if week == 52:
        week_after=[1,year+1]
        week_before=[51,year]
    if (week>1) & (week<52):
        week_before=[week-1,year]
        week_after=[week+1,year]
    
    before_df = regional_week_year[(regional_week_year['WeekOfYear']==week_before[0])&
                                  (regional_week_year['year']==week_before[1])]
    
    after_df = regional_week_year[(regional_week_year['WeekOfYear']==week_after[0])&
                                  (regional_week_year['year']==week_after[1])]
    this_df = regional_week_year[(regional_week_year['WeekOfYear']==week)&
                                 (regional_week_year['year']==year)]
    
    num_ago = before_df[before_df[column_of_interest]][column_of_interest].sum()
    num_next = after_df[after_df[column_of_interest]][column_of_interest].sum()
    num_this = this_df[this_df[column_of_interest]][column_of_interest].sum()
    
    return num_ago,num_next,num_this

def create_new_features(before,current,after,column_of_interest):
    train_df[before] = 0
    train_df[current]=0
    train_df[after] = 0

    region_list= ["Region_"+str(d) for d in range(0,11)]
    for region in tqdm(region_list):
        curr_region = train_df[train_df[region]==1]

        #get all valid dates of that region
        regional_week_year=curr_region[['WeekOfYear','year',column_of_interest]]

        #get all store ids of that region
        regional_stores=len(curr_region['StoreID'].unique())

        week_year_list =regional_week_year[['WeekOfYear','year']].drop_duplicates().values.tolist()

        for date in week_year_list:
            num_ago,num_next,num_this = num_of_features(regional_week_year,
                                                        date[0],
                                                        date[1],
                                                        column_of_interest)

            train_df.at[((train_df['WeekOfYear']==date[0])&
                         (train_df['year']==date[1])),before]=int(num_ago/regional_stores)
            train_df.at[((train_df['WeekOfYear']==date[0])&
                         (train_df['year']==date[1])),after]=int(num_next/regional_stores)
            train_df.at[((train_df['WeekOfYear']==date[0])&
                         (train_df['year']==date[1])),current]=int(num_this/regional_stores)

In [74]:
create_new_features('HolidaysWeekBefore','HolidaysWeekCurrent','HolidaysWeekAfter','IsHoliday')
create_new_features('PromoWeekBefore','PromoWeekCurrent','PromoWeekAfter','HasPromotions')

100%|██████████| 11/11 [01:32<00:00,  8.38s/it]
100%|██████████| 11/11 [01:31<00:00,  8.29s/it]


## Drop NumberOfCustomers

In [75]:
train_df =train_df.drop('NumberOfCustomers',axis=1)

## Drop closed stores

In [76]:
# mask_closed = (train_df['IsOpen']==False)
# train_df=train_df[~mask_closed]
# train_df.shape

## Date dictionary

In [77]:
all_dates = pd.DatetimeIndex(train_df['Date'])
month_dict = { d.strftime("%B%Y") :datetime.strptime(d.strftime('%m-%Y'),'%m-%Y') for d in all_dates}

## PCA -> useless

In [78]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

train_df2 = train_df.drop('Date',axis=1)
sc= StandardScaler()
train_scaled = sc.fit_transform(train_df2.values)
cov_mat = np.cov(train_scaled.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
tot = sum(eigen_vals)
var_exp = [(i/tot) for i in sorted(eigen_vals,reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print(cum_var_exp)

[0.0885402  0.14312627 0.19039511 0.23353448 0.27249089 0.3092255
 0.34054463 0.36782221 0.39163594 0.41513407 0.43701514 0.45759482
 0.47674043 0.49459287 0.51230881 0.52901529 0.54466006 0.5600574
 0.5750247  0.58932052 0.60286445 0.61623207 0.62946464 0.64258977
 0.6553859  0.66780464 0.6801545  0.69239363 0.70459161 0.71675621
 0.72888809 0.7409871  0.75306151 0.76511292 0.77713714 0.78912656
 0.80109447 0.81289145 0.82454647 0.83612541 0.84762407 0.85891663
 0.86910631 0.87902717 0.88835199 0.89729907 0.90609942 0.91444323
 0.92250559 0.93025305 0.93763962 0.94446129 0.95101731 0.95748109
 0.96349962 0.96903134 0.97384288 0.97827558 0.98252482 0.98629139
 0.98967567 0.99278265 0.99448472 0.9959177  0.99718424 0.99804833
 0.99866456 0.99909162 0.9994921  0.99969113 0.99981823 0.99989784
 0.99997238 0.99999988 1.         1.         1.         1.
 1.         1.         1.         1.         1.        ]


## 1. Method Training and predicting one-hold out 

In [95]:
from sklearn.decomposition import PCA

def monthly_training(training_interval=['03-2017','02-2018'],
                     verbose=False,
                     regressor=RandomForestRegressor(n_estimators=50,n_jobs=-1)):
    
    init_date=datetime.strptime(training_interval[0],'%m-%Y')
    end_date =datetime.strptime(training_interval[1],'%m-%Y')
    
    month_scores = []
    y_test_list = []
    predicted_list= []
    predicted_df_list = []
    training_months = []
    training_df_list = []
    for month_name, date in month_dict.items():

        if (date >= init_date) & (date<= end_date) :
                if verbose== True:
                    print("Doing month",month_name)
                    
                training_months.append(month_name)
                mask = ((train_df['month']==date.month) & (train_df['year']==date.year))
                test = train_df[mask]
                train = train_df[~mask]
                
                y_train = train.NumberOfSales
                X_train = train.drop('NumberOfSales',axis = 1)

                y_test = test.NumberOfSales
                X_test = test.drop('NumberOfSales',axis = 1)

                X_train = X_train.drop('Date',axis=1)
                X_test = X_test.drop('Date',axis=1)
                
                #slightly better with this scaler
                scaler = RobustScaler()
                X_train_scaled=scaler.fit(X_train).transform(X_train)
                X_test_scaled=scaler.transform(X_test)
                reg =  regressor.fit(X_train_scaled,y_train)
                
                train_copy  = train_df.drop(['NumberOfSales'], axis=1)
                train_copy = train_copy.drop(['Date'],axis=1)
                feat_labels = train_copy.columns[0:]
                importances = reg.feature_importances_
                indices = np.argsort(importances)[::-1]
                for f in range(X_train_scaled.shape[1]):
                    print("%2d %-*s %f" %(f+1,30,feat_labels[indices[f]],importances[indices[f]]))
                
                current_score = reg.score(X_test_scaled,y_test)
                current_prediction = reg.predict(X_test_scaled)
                month_scores.append(current_score)
                predicted_list.append(current_prediction)
                y_test_list.append(y_test)
                
                if verbose == True:
                    print("Month {} has shape {}\n\t".format(month_name,test.shape))
                    print("-Score {}".format(current_score))
                
                
                real_df = train_df.copy()
                month_test_df = train_df[mask]
                month_train_df= train_df[~mask]

                month_test_df= month_test_df.drop('NumberOfSales',axis=1)
                prediction_=pd.Series(current_prediction)
                month_test_df['NumberOfSales']=prediction_.values.astype(int)
                predicted_df = pd.concat([month_test_df,month_train_df]).reset_index()
                predicted_df= predicted_df[list(train_df.columns.values)]
                
                predicted_df_list.append(predicted_df)
                training_df_list.append(train)
                
    return {
        'Scores' : month_scores,
        'Real' : y_test_list,
        'Predictions' : predicted_list,
        'Training_dates': training_months,
        'Training_df' : training_df_list,
        'Predicted_df' : predicted_df_list
    }


In [98]:
reg = RandomForestRegressor(n_estimators=50,n_jobs=-1)
prediction_result = monthly_training(verbose=True,
                                     training_interval=['05-2017','05-2017'],
                                     regressor = reg)

Doing month May2017
 1 IsOpen                         0.453292
 2 NearestCompetitor              0.124347
 3 StoreID                        0.112367
 4 HasPromotions                  0.071877
 5 DayOfYear                      0.017707
 6 Region_PD                      0.017307
 7 Region_AreaKM2                 0.015544
 8 day                            0.014602
 9 AssortmentType_General         0.012508
10 Region_GDP                     0.011498
11 Region_PopulationK             0.009812
12 month                          0.009215
13 StoreType_Hyper Market         0.008035
14 StoreType_Standard Market      0.007709
15 WindDirDegrees                 0.007016
16 StoreType_Shopping Center      0.006158
17 StoreType_Super Market         0.005962
18 Min_Humidity                   0.004555
19 AssortmentType_With Non-Food Department 0.004430
20 HolidaysWeekCurrent            0.004266
21 PromoWeekCurrent               0.004231
22 Max_TemperatureC               0.003735
23 Max_Sea_Level_Pressure

## 2  Method : Cross validation

In [116]:
from xgboost.sklearn import XGBRegressor
from mlxtend.regressor import StackingRegressor,StackingCVRegressor
from sklearn.svm import SVR

results = {}
sc= RobustScaler()
for storeid in train_df.StoreID.unique():
    train = train_df[train_df.StoreID == storeid]
    y_train = train.NumberOfSales
    X_train = train.drop('NumberOfSales',axis = 1)
    X_train = X_train.drop('Date',axis=1)
    X_train_scaled = sc.fit_transform(X_train)
    
    model = RandomForestRegressor(n_estimators=500,n_jobs=-1)
    model2 =XGBRegressor(silent=False,n_jobs=-1)
    stregr = StackingRegressor(regressors=[model,model2], 
                           meta_regressor=)


    kfold = KFold(n_splits=10,shuffle = True, random_state=7)
    results[storeid] = cross_val_score(stregr, X_train_scaled, y_train, scoring='r2', cv=kfold)
    print('Cross-validation for {} -> score: {:.4f} with +/- {:.4f}'\
          .format(storeid,results[storeid].mean(),results[storeid].std()))

Cross-validation for 1000 -> score: -0.0358 with +/- 0.0355
Cross-validation for 1001 -> score: -0.0332 with +/- 0.0275
Cross-validation for 1002 -> score: -0.0491 with +/- 0.0368
Cross-validation for 1003 -> score: -0.0380 with +/- 0.0358


KeyboardInterrupt: 

## Feature importance

In [86]:
rf = RandomForestRegressor(n_estimators=50,n_jobs=-1).fit(X_scaled,y)

train_copy  = train_df.drop(['NumberOfSales'], axis=1)
feat_labels = train_copy.columns[0:]
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_scaled.shape[1]):
    print("%2d %-*s %f" %(f+1,30,feat_labels[indices[f]],importances[indices[f]]))


 1 Region_PD                      0.999999
 2 Max_Sea_Level_PressurehPa      0.000000
 3 HolidaysWeekAfter              0.000000
 4 Min_Dew_PointC                 0.000000
 5 Max_Humidity                   0.000000
 6 Date                           0.000000
 7 DayOfYear                      0.000000
 8 Mean_Sea_Level_PressurehPa     0.000000
 9 WeekOfYear                     0.000000
10 Max_TemperatureC               0.000000
11 Region_PopulationK             0.000000
12 Min_Humidity                   0.000000
13 Mean_Dew_PointC                0.000000
14 month                          0.000000
15 Mean_TemperatureC              0.000000
16 Max_Dew_PointC                 0.000000
17 Max_VisibilityKm               0.000000
18 Precipitationmm                0.000000
19 Mean_Humidity                  0.000000
20 Events_Rain                    0.000000
21 Min_Sea_Level_PressurehPa      0.000000
22 NearestCompetitor              0.000000
23 Mean_VisibilityKm              0.000000
24 Max_Wind

## Plotting R^2 Error for each month

In [97]:
# os.system("say -v "+'Alice'+" Ho fatto")
trace1 = go.Bar(
            x=prediction_result['Training_dates'],
            y=prediction_result['Scores'],
            name='R^2 per region'
    )

trace2 = go.Scatter(x=prediction_result['Training_dates'],
                    y=[np.asarray(prediction_result['Scores']).mean()]*len(prediction_result['Scores']),
                    line = dict(color=('rgb(0, 0, 0)'),
                    width=2, dash='dash',shape='hv'),
                    name = 'Mean'
                   )

layout = go.Layout(
    title= 'R^2',
    yaxis=dict(
        range=[0,1]
    )
)
data=[trace1,trace2]
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')

## Predicted vs Real for march17

In [62]:
regions=list(['Region_'+str(i) for i in range(0,11)])
data_real=[]
data_pred=[]
for region in regions:
    region_df_real = prediction_result['Training_df'][0][prediction_result['Training_df'][0][region]==1]
    region_df_pred = prediction_result['Predicted_df'][0][prediction_result['Predicted_df'][0][region]==1]
    sales_real = region_df_real['NumberOfSales'].sum()
    sales_pred= region_df_pred['NumberOfSales'].sum()
    data_real.append(sales_real)
    data_pred.append(sales_pred)

trace_r=go.Bar(x=np.arange(0,12),
                y=data_real,
                name='Real sales'
                )

trace_p=go.Bar(x=np.arange(0,12),
                 y=data_pred,
                 name='Preicted Sales')

data_tot = [trace_r, trace_p]
layout = go.Layout(
    barmode='group',
    title='Real vs Predicted sales per region in March17',
    yaxis=dict(title='Sales'),
    xaxis=dict(title='Region')
)

fig = go.Figure(data=data_tot, layout=layout)
iplot(fig, filename='Population info')