# Import packages and own modules

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import holidays

import sys
sys.path.append('../src')
from helperFunctions import *

%matplotlib inline

# 1. Load input data
## 1.1 Load and merge

In [2]:
train = pd.read_csv("../data/train.csv")
store = pd.read_csv("../data/store.csv")
merged_train_store = pd.merge(store, train, on='Store', how='left')

  interactivity=interactivity, compiler=compiler, result=result)


## 1.2 Train test split

In [3]:
data_train, data_test = train_test_split(merged_train_store, test_size=0.2, random_state=42)

## 1.3 Convert 'Date' column and generate generic date features

In [4]:
data_train.loc[:, 'Date'] = pd.to_datetime(data_train.loc[:, 'Date'])
data_train.loc[:, 'Month'] = data_train['Date'].dt.month
data_train.loc[:, 'DayOfMonth'] = data_train['Date'].dt.day
data_train.loc[:, 'Year'] = data_train['Date'].dt.year
data_train.loc[:, 'DayOfWeek'] = data_train['Date'].dt.dayofweek
data_train.loc[:, 'WeekOfYear'] = data_train['Date'].dt.weekofyear

data_test.loc[:, 'Date'] = pd.to_datetime(data_test.loc[:, 'Date'])
data_test.loc[:, 'Month'] = data_test['Date'].dt.month
data_test.loc[:,'DayOfMonth'] = data_test['Date'].dt.day
data_test.loc[:,'Year'] = data_test['Date'].dt.year
data_test.loc[:,'DayOfWeek'] = data_test['Date'].dt.dayofweek
data_test.loc[:,'WeekOfYear'] = data_test['Date'].dt.weekofyear

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
  
  del sys.path[0]


## 1.4 Drop rows based on 'Sales' column 
#### Drop rows with value zero (training set only)

In [5]:
zero_sales = data_train.Sales == 0
data_train = data_train.loc[~zero_sales]

print(f'Drop {sum(zero_sales)} rows, keep {data_train.shape[0]}')

Drop 81986 rows, keep 412792


#### Drop rows with values zero or null

In [6]:
zero_sales = data_test.Sales == 0
missing_sales = data_test.Sales.isnull()
data_test = data_test.loc[~(zero_sales | missing_sales)]

print(f'Drop {sum(zero_sales)} rows, keep {data_test.shape[0]}')

Drop 20666 rows, keep 99397


## 1.5 Compute 'Sales_per_customer' column
#### Compute sales per customer for each store from training set data

In [7]:
group = data_train.groupby(by='Store').agg({'Sales': 'mean', 'Customers': 'mean'})
group['Sales_per_customer'] = group['Sales'] / group['Customers']

#### Map sales per customer to training and test set

In [8]:
data_train['Sales_per_customer'] = data_train['Store'].map(group['Sales_per_customer'])
data_test['Sales_per_customer'] = data_test['Store'].map(group['Sales_per_customer'])

# 2. Outlier handling
## 2.1 'CompetitionOpenSinceYear'
#### Drop values with year before 1990

In [9]:
data_train.CompetitionOpenSinceYear.value_counts()

2013.0    30968
2012.0    30297
2014.0    25910
2005.0    22882
2010.0    20546
2011.0    20284
2009.0    19880
2008.0    19874
2007.0    17655
2006.0    17362
2015.0    13883
2002.0    10086
2004.0     8111
2003.0     6869
2001.0     5849
2000.0     3685
1999.0     3019
1990.0     1869
1995.0      739
1994.0      725
1961.0      386
1998.0      358
1900.0      350
Name: CompetitionOpenSinceYear, dtype: int64

In [10]:
before_1990 = data_train.CompetitionOpenSinceYear < 1990
data_train = data_train.loc[~before_1990, :]
print(f'Drop {sum(before_1990)} rows, keep {data_train.shape[0]}')

before_1990 = data_test.CompetitionOpenSinceYear < 1990
data_test = data_test.loc[~before_1990 < 1990, :]

Drop 736 rows, keep 412056


# 3. Missing value imputation
#### Overview before imputing missing values

In [11]:
missing_report(data_train, pd)

Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,1099,0.27,float64
CompetitionOpenSinceMonth,131205,31.84,float64
CompetitionOpenSinceYear,131205,31.84,float64
Promo2,0,0.0,int64
Promo2SinceWeek,203216,49.32,float64
Promo2SinceYear,203216,49.32,float64
PromoInterval,203216,49.32,object


## 3.1 'Promo', 'Promo2'
#### Drop all null values

In [12]:
missing_promo = data_train.Promo.isnull() | data_train.Promo2.isnull()
data_train = data_train.loc[~missing_promo, :]

print(f'Drop {sum(missing_promo)} rows, keep {data_train.shape[0]}')

missing_promo = data_test.Promo.isnull() | data_test.Promo2.isnull()
data_test = data_test.loc[~missing_promo, :]

Drop 12380 rows, keep 399676


## 3.2 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'
#### Apply mean imputation for missing values

In [13]:
data_train = mean_imputation(data_train,['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'])
data_test = mean_imputation(data_test,['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'])

print(f'Drop {0} rows, keep {data_train.shape[0]}')

Drop 0 rows, keep 399676


## 3.3 'Promo2SinceWeek', 'Promo2SinceYear',  'PromoInterval'
#### Apply mean imputation for stores participating and constant imputation with zero for stores not participating

In [14]:
data_train = mean_imputation(data_train, ['Promo2SinceWeek', 'Promo2SinceYear'], enforce_int=True)
Promo2 = data_train.Promo2 == 1
data_train.loc[~Promo2, ['Promo2SinceWeek', 'Promo2SinceYear']] = 0
data_train = const_imputation(data_train,['PromoInterval'],values='unavailable')

data_test = mean_imputation(data_test, ['Promo2SinceWeek', 'Promo2SinceYear'], enforce_int=True)
Promo2 = data_test.Promo2 == 1
data_test.loc[~Promo2, ['Promo2SinceWeek', 'Promo2SinceYear']] = 0
data_test = const_imputation(data_test,['PromoInterval'],values='unavailable')

print(f'Drop {0} rows, keep {data_train.shape[0]}')

Drop 0 rows, keep 399676


## 3.4 'Sales', 'Customers', 'Open'
#### Drop rows with missing 'Sales' in case 'Open' or 'Customers' is zero (training set only)

In [15]:
not_open = data_train.Open==0
no_customer = data_train.Customers==0
missing_sales = data_train.Sales.isnull()

data_train = data_train.loc[~((not_open | no_customer) & missing_sales), :]

print(f'Drop {sum(((not_open | no_customer) & missing_sales))} rows, keep {data_train.shape[0]}')

Drop 2410 rows, keep 397266


#### Impute 'Sales' from 'Customers' and 'Sales_per_customer' (training set only)

In [16]:
missing_sales = data_train.Sales.isnull()

data_train.loc[missing_sales, 'Sales'] = data_train.loc[missing_sales, 'Customers'] * data_train.loc[missing_sales, 'Sales_per_customer']

missing_sales = data_train.Sales.isnull()
data_train = data_train.loc[~missing_sales, :]

print(f'Drop {sum(missing_sales)} rows, keep {data_train.shape[0]}')

data_train = const_imputation(data_train,['Open'],values=1)

Drop 364 rows, keep 396902


#### Drop rows with missing or zero 'Open' (test set only)

In [17]:
not_open = data_test.Open==0
missing_open = data_test.Open.isna()

data_test = data_test.loc[~(not_open | missing_open), :]

print(f'Drop {sum(not_open | missing_open)} rows, keep {data_test.shape[0]}')

Drop 3010 rows, keep 93373


## 3.5 'StateHoliday'
#### Look up if 'Date' is a national holiday and impute missing values with 'a' and '0' accordingly

In [18]:
data_train = holiday_imputation(data_train, holidays)
data_test = holiday_imputation(data_test, holidays)

## 3.6 'CompetitionDistance'
#### Drop rows with missing values

In [19]:
missing_distance = data_train.CompetitionDistance.isnull()
data_train = data_train.loc[~missing_distance, :]

print(f'Drop {sum(missing_distance)} rows, keep {data_train.shape[0]}')

missing_distance = data_test.CompetitionDistance.isnull()
data_test = data_test.loc[~missing_distance, :]

Drop 1055 rows, keep 395847


## 3.6 Drop columns 'Customers', 'Open' and 'SchoolHoliday'

In [20]:
data_train = data_train.drop(columns = ['Customers', 'Open', 'SchoolHoliday'])
data_test = data_test.drop(columns = ['Customers', 'Open', 'SchoolHoliday'])
missing_report(data_train, pd)

Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,0,0.0,float64
CompetitionOpenSinceMonth,0,0.0,float64
CompetitionOpenSinceYear,0,0.0,float64
Promo2,0,0.0,int64
Promo2SinceWeek,0,0.0,float64
Promo2SinceYear,0,0.0,float64
PromoInterval,0,0.0,object


In [21]:
missing_report(data_test, pd)

Unnamed: 0,Null (total),Null (percent),Type
Store,0,0.0,int64
StoreType,0,0.0,object
Assortment,0,0.0,object
CompetitionDistance,0,0.0,float64
CompetitionOpenSinceMonth,0,0.0,float64
CompetitionOpenSinceYear,0,0.0,float64
Promo2,0,0.0,int64
Promo2SinceWeek,0,0.0,float64
Promo2SinceYear,0,0.0,float64
PromoInterval,0,0.0,object


# 4. Encoding of categorical variables
## 4.1 'Store'
#### Mean and frequency encoding

In [22]:
data_train, mean_values = mean_encoding(data_train, 'Store')
data_train, fenc_values = freq_encoding(data_train, 'Store')

data_test.loc[:, 'Store_menc'] = data_test['Store'].map(mean_values)
data_test.loc[:, 'Store_fenc'] = data_test['Store'].map(fenc_values)

## 4.2 'StoreType'
#### One hot encoding
# WILL 'drop_first' DROP THE SAME COLUMN IN BOTH DATAFRAMES????? 

In [23]:
#data_train = pd.get_dummies(data_train, columns=['StoreType'], drop_first=True)
#data_test = pd.get_dummies(data_test, columns=['StoreType'], drop_first=True)
data_train = pd.get_dummies(data_train, columns=['StoreType'], drop_first=False)
data_test = pd.get_dummies(data_test, columns=['StoreType'], drop_first=False)

## 4.3 'Assortment'
#### Ordinal encoding according to assortment levels: a = basic, b = extra, c = extended

In [24]:
data_train = ordinal_encoding(data_train, 'Assortment', {'a':1, 'b':2, 'c':3})
data_test = ordinal_encoding(data_test, 'Assortment', {'a':1, 'b':2, 'c':3})

## 4.4 'StateHoliday'
#### Binary encoding

In [25]:
data_train = binary_encoding(data_train, 'StateHoliday', positive_list=['a', 'b', 'c'])
data_test = binary_encoding(data_test, 'StateHoliday', positive_list=['a', 'b', 'c'])

## 4.5 Drop encoded columns and null values from test set

In [26]:
data_train = data_train.drop(columns=['Store', 'Assortment', 'StateHoliday'])
data_test = data_test.drop(columns=['Store', 'Assortment', 'StateHoliday'])
data_test.dropna()

In [28]:
missing_report(data_test, pd)

Unnamed: 0,Null (total),Null (percent),Type
CompetitionDistance,0,0.0,float64
CompetitionOpenSinceMonth,0,0.0,float64
CompetitionOpenSinceYear,0,0.0,float64
Promo2,0,0.0,int64
Promo2SinceWeek,0,0.0,float64
Promo2SinceYear,0,0.0,float64
PromoInterval,0,0.0,object
Date,0,0.0,datetime64[ns]
DayOfWeek,0,0.0,int64
Sales,0,0.0,float64


# 5. Feature engineering
## 5.1 'CompetitionSincePeriod'

In [None]:
# CompetitionOpenSince[Month / Year]
nonzero_sales['CompetitionOpenSincePeriod'] = (12 * (nonzero_sales['Year'] -
                                                     nonzero_sales['CompetitionOpenSinceYear']
                                                    )
                                              ) + (nonzero_sales['Month'] - 
                                                   nonzero_sales['CompetitionOpenSinceMonth']
                                                  )

## 5.2 'Promo2SincePeriod'

In [None]:
# Promo2Since[Week / Year]
nonzero_sales['Promo2SincePeriod'] = (52 * (nonzero_sales['Year'] - 
                                            nonzero_sales['Promo2SinceYear']
                                           ) + (nonzero_sales['WeekOfYear'] - 
                                                nonzero_sales['Promo2SinceWeek']
                                               )
                                     )

## 5.3 'PromoInterval'

In [None]:
# PromoInterval
nonzero_sales = pd.get_dummies(nonzero_sales, 
                               columns=['PromoInterval'], 
                               drop_first=True)

## 5.4 Drop the encoded columns

In [None]:
drop_encoded_cols = ['Date', 
                     'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 
                     'Promo2SinceWeek', 'Promo2SinceYear']

In [None]:
final_df = nonzero_sales.drop(drop_encoded_cols, axis=1)

# Train test split

In [None]:
X = final_df.drop('Sales', axis=1)
y = final_df[['Sales']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42)

# Model

In [None]:
rf = RandomForestRegressor(n_estimators=100, 
                           min_samples_leaf=3, 
                           max_depth=5, 
                           random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [None]:
y_pred = rf.predict(X_test)

rf_base_metric = metric(y_pred, y_test.values)

print("Metric for baseline prediction = {}".format(rf_base_metric))