In [73]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('../src')
from helperFunctions import *

%matplotlib inline

# Load input data

In [74]:
train = pd.read_csv("../data/train.csv")
store = pd.read_csv("../data/store.csv")

merged_train_store = pd.merge(store, train, on='Store', how='left')

# Date
merged_train_store.Date = pd.to_datetime(merged_train_store.Date)

nonzero_sales['Month'] = nonzero_sales['Date'].dt.month
nonzero_sales['DayOfMonth'] = nonzero_sales['Date'].dt.day
nonzero_sales['Year'] = nonzero_sales['Date'].dt.year
nonzero_sales['DayOfWeek'] = nonzero_sales['Date'].dt.dayofweek
nonzero_sales['WeekOfYear'] = nonzero_sales['Date'].dt.weekofyear

  interactivity=interactivity, compiler=compiler, result=result)


# Drop rows with zero sales data

In [75]:
zero_sales = merged_train_store.Sales == 0
nonzero_sales = merged_train_store.loc[~zero_sales]

print(f'Drop {sum(zero_sales)} rows, keep {nonzero_sales.shape[0]}')

Drop 102652 rows, keep 515821


# Add 'Sales_per_customer' column

In [76]:
group = nonzero_sales.groupby(by='Store').agg({'Sales': 'mean', 'Customers': 'mean'})
group['Sales_per_customer'] = group['Sales'] / group['Customers']
nonzero_sales['Sales_per_customer'] = nonzero_sales['Store'].map(group['Sales_per_customer'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# Outlier handling

In [26]:
# Filter out columns where the competitionsinceyear is below 1990
mask_1990 = nonzero_sales.CompetitionOpenSinceYear < 1990
nonzero_sales = nonzero_sales[~mask_1990]

# Missing value imputation

In [77]:
missing_report(nonzero_sales, pd)

Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1361,0.26,float64
CompetitionOpenSinceMonth,163889,31.77,float64
CompetitionOpenSinceYear,163889,31.77,float64
Promo2,0,0.0,int64
Promo2SinceWeek,254114,49.26,float64
Promo2SinceYear,254114,49.26,float64
PromoInterval,254114,49.26,object


### 'Promo', 'Promo2'
Drop null values

In [78]:
missing_promo = nonzero_sales.Promo.isnull() | nonzero_sales.Promo.isnull()
nonzero_sales = nonzero_sales.loc[~missing_promo, :]

print(f'Drop {sum(missing_promo)} rows, keep {nonzero_sales.shape[0]}')
missing_report(nonzero_sales, pd)

Drop 15504 rows, keep 500317


Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1308,0.26,float64
CompetitionOpenSinceMonth,158950,31.77,float64
CompetitionOpenSinceYear,158950,31.77,float64
Promo2,0,0.0,int64
Promo2SinceWeek,246526,49.27,float64
Promo2SinceYear,246526,49.27,float64
PromoInterval,246526,49.27,object


### 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'
Apply mean imputation for missing values

In [79]:
nonzero_sales = mean_imputation(nonzero_sales,['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'])

print(f'Drop {0} rows, keep {nonzero_sales.shape[0]}')
missing_report(nonzero_sales, pd)

Drop 0 rows, keep 500317


Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1308,0.26,float64
CompetitionOpenSinceMonth,0,0.0,float64
CompetitionOpenSinceYear,0,0.0,float64
Promo2,0,0.0,int64
Promo2SinceWeek,246526,49.27,float64
Promo2SinceYear,246526,49.27,float64
PromoInterval,246526,49.27,object


### 'Promo2SinceWeek', 'Promo2SinceYear',  'PromoInterval'
Apply mean imputation for stores participating and constant imputation with zero for stores not participating

In [80]:
Promo2 = nonzero_sales.Promo2 == 0
nonzero_sales.loc[Promo2, :] = const_imputation(nonzero_sales.loc[Promo2, :],
                                                ['Promo2SinceWeek', 'Promo2SinceYear'],
                                                values=0)

nonzero_sales.loc[~Promo2, :] = mean_imputation(nonzero_sales.loc[~Promo2, :],
                                                ['Promo2SinceWeek', 'Promo2SinceYear'],
                                                enforce_int=True)

nonzero_sales = const_imputation(nonzero_sales,['PromoInterval'],values='unavailable')

print(f'Drop {0} rows, keep {nonzero_sales.shape[0]}')
missing_report(nonzero_sales, pd)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Drop 0 rows, keep 500317


Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1308,0.26,float64
CompetitionOpenSinceMonth,0,0.0,float64
CompetitionOpenSinceYear,0,0.0,float64
Promo2,0,0.0,int64
Promo2SinceWeek,0,0.0,float64
Promo2SinceYear,0,0.0,float64
PromoInterval,0,0.0,object


### 'Sales', 'Customers', 'Open'
Drop rows with missing 'Sales' in case 'Open' or 'Customers' is zero

In [81]:
not_open = nonzero_sales.Open==0
no_customer = nonzero_sales.Customers==0
missing_sales = nonzero_sales.Sales.isnull()

nonzero_sales = nonzero_sales.loc[~((not_open | no_customer) & missing_sales), :]

print(f'Drop {sum(((not_open | no_customer) & missing_sales))} rows, keep {nonzero_sales.shape[0]}')
missing_report(nonzero_sales, pd)

Drop 3006 rows, keep 497311


Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1299,0.26,float64
CompetitionOpenSinceMonth,0,0.0,float64
CompetitionOpenSinceYear,0,0.0,float64
Promo2,0,0.0,int64
Promo2SinceWeek,0,0.0,float64
Promo2SinceYear,0,0.0,float64
PromoInterval,0,0.0,object


Impute 'Sales' from 'Customers' and 'Sales_per_customer'

In [82]:
missing_sales = nonzero_sales.Sales.isnull()

nonzero_sales.loc[missing_sales, 'Sales'] = nonzero_sales.loc[missing_sales, 'Customers'] * nonzero_sales.loc[missing_sales, 'Sales_per_customer']

missing_sales = nonzero_sales.Sales.isnull()
nonzero_sales = nonzero_sales.loc[~missing_sales, :]

print(f'Drop {sum(missing_sales)} rows, keep {nonzero_sales.shape[0]}')
missing_report(nonzero_sales, pd)

Drop 451 rows, keep 496860


Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1299,0.26,float64
CompetitionOpenSinceMonth,0,0.0,float64
CompetitionOpenSinceYear,0,0.0,float64
Promo2,0,0.0,int64
Promo2SinceWeek,0,0.0,float64
Promo2SinceYear,0,0.0,float64
PromoInterval,0,0.0,object


### 'StateHoliday'
Look up if 'Date' is a national holiday and impute missing values with 'a' and '0' accordingly

In [112]:
import holidays

years = nonzero_sales.Date.dt.year.unique()
national_holidays = [day for day in holidays.Germany(years=years)]

missing_holiday = nonzero_sales.StateHoliday.isnull()
holiday_date = nonzero_sales.Date.isin(national_holidays)


nonzero_sales.loc[missing_holiday & holiday_date, :] = 'a'
nonzero_sales.loc[missing_holiday & ~holiday_date, :] = '0'


print(f'Drop {0} rows, keep {nonzero_sales.shape[0]}')
missing_report(nonzero_sales, pd)

Drop 0 rows, keep 496860


Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,object
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1259,0.25,object
CompetitionOpenSinceMonth,0,0.0,object
CompetitionOpenSinceYear,0,0.0,object
Promo2,0,0.0,object
Promo2SinceWeek,0,0.0,object
Promo2SinceYear,0,0.0,object
PromoInterval,0,0.0,object


# Drop columns 

In [113]:
nonzero_sales.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Date,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Sales_per_customer
1,1,c,a,1270,9,2008,0,0,0,unavailable,2013-01-02 00:00:00,3,5530,668,,0,0,1,8.35515
2,1,c,a,1270,9,2008,0,0,0,unavailable,2013-01-03 00:00:00,4,4327,578,1.0,0,0,1,8.35515
3,1,c,a,1270,9,2008,0,0,0,unavailable,2013-01-04 00:00:00,5,4486,619,1.0,0,0,1,8.35515
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0
6,1,c,a,1270,9,2008,0,0,0,unavailable,2013-01-07 00:00:00,1,7176,785,1.0,1,0,1,8.35515


# Encoding

In [None]:
# Store
nonzero_sales, fenq_values = freq_encoding(nonzero_sales, 'Store')
nonzero_sales, menc_values = mean_encoding(nonzero_sales, 'Store')
nonzero_sales

In [None]:
# StoreType: OneHote
nonzero_sales = pd.get_dummies(nonzero_sales, columns=['StoreType'], drop_first=True)

In [None]:
# Assortment
nonzero_sales = ordinal_encoding(nonzero_sales, 'Assortment', {'a':1, 'b':2, 'c':3})

In [None]:
missing_report(nonzero_sales)

In [None]:
# Date
nonzero_sales['Month'] = nonzero_sales['Date'].dt.month
nonzero_sales['DayOfMonth'] = nonzero_sales['Date'].dt.day
nonzero_sales['Year'] = nonzero_sales['Date'].dt.year
nonzero_sales['DayOfWeek'] = nonzero_sales['Date'].dt.dayofweek
nonzero_sales['WeekOfYear'] = nonzero_sales['Date'].dt.weekofyear

In [None]:
nonzero_sales.isnull().sum()

In [None]:
nonzero_sales[]

In [None]:
# CompetitionOpenSince[Month / Year]
nonzero_sales['CompetitionOpenSincePeriod'] = (12 * (nonzero_sales['Year'] -
                                                     nonzero_sales['CompetitionOpenSinceYear']
                                                    )
                                              ) + (nonzero_sales['Month'] - 
                                                   nonzero_sales['CompetitionOpenSinceMonth']
                                                  )

In [None]:
# Promo2Since[Week / Year]
nonzero_sales['Promo2SincePeriod'] = (52 * (nonzero_sales['Year'] - 
                                            nonzero_sales['Promo2SinceYear']
                                           ) + (nonzero_sales['WeekOfYear'] - 
                                                nonzero_sales['Promo2SinceWeek']
                                               )
                                     )

In [None]:
# PromoInterval
nonzero_sales = pd.get_dummies(nonzero_sales, 
                               columns=['PromoInterval'], 
                               drop_first=True)

In [None]:
# StateHoliday
state_holiday_enc = {'0':0, 0.0:0, 'a':1, 'b':1, 'c':1}
nonzero_sales['StateHoliday_benc'] = nonzero_sales.StateHoliday.map(state_holiday_enc)

# Drop the encoded columns

In [None]:
drop_encoded_cols = ['Store', 'Assortment', 'Date', 
                     'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 
                     'Promo2SinceWeek', 'Promo2SinceYear', 
                     'StateHoliday']

In [None]:
final_df = nonzero_sales.drop(drop_encoded_cols, axis=1)

# Train test split

In [None]:
X = final_df.drop('Sales', axis=1)
y = final_df[['Sales']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42)

# Model

In [None]:
rf = RandomForestRegressor(n_estimators=100, 
                           min_samples_leaf=3, 
                           max_depth=5, 
                           random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [None]:
y_pred = rf.predict(X_test)

rf_base_metric = metric(y_pred, y_test.values)

print("Metric for baseline prediction = {}".format(rf_base_metric))