# IMPORTS

## IMPORTS

In [326]:
import pandas as pd
import warnings
from inflection import underscore
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from boruta import BorutaPy

## CONFIGURATION

In [292]:
# Keep library warnings out of the cell outputs.
warnings.filterwarnings('ignore')

# Display settings: floats rendered with 5 decimals, up to 100 rows/columns shown.
display_options = {
    'display.float_format': lambda x: '%.5f' % x,
    'display.max_columns': 100,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## HELPER FUNCTIONS

## LOADING DATA

In [261]:
# Read the store metadata and the daily sales history.
# low_memory=False parses each file in a single pass so column dtypes are inferred consistently.
df_store, df_train = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('./datasets/store.csv', './datasets/train.csv')
)

In [262]:
# Attach the daily sales rows to each store's metadata (left join on the shared 'Store' key).
df_raw = pd.merge(df_store, df_train, how='left', on='Store')

#  DATA DESCRIPTION

## RENAME COLUMNS

In [263]:
# Convert the CamelCase CSV headers to snake_case.
old_cols = df_raw.columns
new_cols = list(map(underscore, old_cols))
df_raw.columns = new_cols

## DATA DIMENSIONS

In [264]:
# Report the size of the merged frame.
n_rows, n_cols = df_raw.shape
print(f'The dataset has {n_rows} lines and {n_cols} columns')

The dataset has 1017209 lines and 18 columns


## CHECK TYPES

In [265]:
# Show the dtype pandas inferred for every column of the merged frame.
df_raw.dtypes

store                             int64
store_type                       object
assortment                       object
competition_distance            float64
competition_open_since_month    float64
competition_open_since_year     float64
promo2                            int64
promo2_since_week               float64
promo2_since_year               float64
promo_interval                   object
day_of_week                       int64
date                             object
sales                             int64
customers                         int64
open                              int64
promo                             int64
state_holiday                    object
school_holiday                    int64
dtype: object

## CHECK NA VALUES

In [266]:
# Count the missing values in each column.
df_raw.isnull().sum()

store                                0
store_type                           0
assortment                           0
competition_distance              2642
competition_open_since_month    323348
competition_open_since_year     323348
promo2                               0
promo2_since_week               508031
promo2_since_year               508031
promo_interval                  508031
day_of_week                          0
date                                 0
sales                                0
customers                            0
open                                 0
promo                                0
state_holiday                        0
school_holiday                       0
dtype: int64

## FILL NA VALUES

In [267]:
# Parse the date strings into timestamps; later cells read .month/.year/.week from this column.
df_raw = df_raw.assign(date=lambda frame: pd.to_datetime(frame['date']))

In [268]:
# Impute the missing store metadata. Every fill value is computed for the whole column
# at once (instead of one Python call per row), which gives the same values far faster.
sale_date = df_raw['date']

# competition_distance: missing -> 200000, a sentinel the original note describes as
# larger than the current maximum distance (i.e. "no nearby competitor").
df_raw['competition_distance'] = df_raw['competition_distance'].fillna(200000)

# competition_open_since_month: missing -> month of the row's own sale date.
df_raw['competition_open_since_month'] = df_raw['competition_open_since_month'].fillna(sale_date.dt.month)

# competition_open_since_year: missing -> year of the row's own sale date.
df_raw['competition_open_since_year'] = df_raw['competition_open_since_year'].fillna(sale_date.dt.year)

# promo2_since_week: missing -> ISO week of the row's own sale date.
# isocalendar() yields a nullable UInt32 column; cast it so the target stays float64.
df_raw['promo2_since_week'] = df_raw['promo2_since_week'].fillna(
    sale_date.dt.isocalendar().week.astype('float64')
)

# promo2_since_year: missing -> year of the row's own sale date.
df_raw['promo2_since_year'] = df_raw['promo2_since_year'].fillna(sale_date.dt.year)

# promo_interval: missing -> 0 (the feature-engineering step treats 0 as "no promo interval").
# Assign the result back: an in-place fillna on the selected column is a chained
# operation and does not update df_raw when pandas Copy-on-Write is active.
df_raw['promo_interval'] = df_raw['promo_interval'].fillna(0)

## CHANGE DTYPES

In [269]:
# The imputed "since" columns were loaded as float64; cast all four to int64 in one call.
since_columns = [
    'competition_open_since_month',
    'competition_open_since_year',
    'promo2_since_week',
    'promo2_since_year',
]
df_raw = df_raw.astype({column: 'int64' for column in since_columns})

## DESCRIPTIVE STATISTICS

In [270]:
# Separate numeric columns from the remaining non-numeric, non-datetime ones.
numeric_dtypes = ['int64', 'float64']
num_attributes = df_raw.select_dtypes(include=numeric_dtypes)
cat_attributes = df_raw.select_dtypes(exclude=numeric_dtypes + ['datetime64[ns]'])

In [271]:
# Shape statistics per numeric column, each as a one-column frame ready to be concatenated.
skew = num_attributes.skew().to_frame('skew')
kurtosis = num_attributes.kurtosis().to_frame('kurtosis')

In [272]:
# One row per column: the describe() summary followed by skew and kurtosis.
summary_stats = df_raw.describe().T
df_description = pd.concat([summary_stats, skew, kurtosis], axis=1)
df_description

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,skew,kurtosis
store,1017209.0,558.43,321.91,1.0,280.0,558.0,838.0,1115.0,-0.0,-1.2
competition_distance,1017209.0,5935.44,12547.65,20.0,710.0,2330.0,6910.0,200000.0,10.24,147.79
competition_open_since_month,1017209.0,6.79,3.31,1.0,4.0,7.0,10.0,12.0,-0.04,-1.23
competition_open_since_year,1017209.0,2010.32,5.52,1900.0,2008.0,2012.0,2014.0,2015.0,-7.24,124.07
promo2,1017209.0,0.5,0.5,0.0,0.0,1.0,1.0,1.0,-0.0,-2.0
promo2_since_week,1017209.0,23.62,14.31,1.0,12.0,22.0,37.0,52.0,0.18,-1.18
promo2_since_year,1017209.0,2012.79,1.66,2009.0,2012.0,2013.0,2014.0,2015.0,-0.78,-0.21
day_of_week,1017209.0,4.0,2.0,1.0,2.0,4.0,6.0,7.0,0.0,-1.25
sales,1017209.0,5773.82,3849.93,0.0,3727.0,5744.0,7856.0,41551.0,0.64,1.78
customers,1017209.0,633.15,464.41,0.0,405.0,609.0,837.0,7388.0,1.6,7.09


# FEATURE ENGINEERING

## HYPOTHESES

> <font color=green>__1. Lojas com maior sortimento vendem mais<br>
2. Lojas com competidores mais próximos vendem menos<br>
3. Lojas com competidores há mais tempo vendem mais<br>
4. Lojas vendem menos aos finais de semana<br>
5. Lojas que participam da promo2 vendem mais<br>
6. Lojas vendem mais no natal<br>
7. Lojas vendem menos nos feriados escolares<br>
8. Lojas em promoção há mais tempo vendem mais<br>
9. Lojas deveriam vender mais ao longo dos anos<br>
10. Lojas deveriam vender mais no segundo semestre do ano.<br>
11. Lojas deveriam vender mais depois do dia 10 de cada mês.__</font>

## FEATURE ENGINEERING

In [273]:
# Derive the model features on df_raw. Everything here reads only from df_raw so the
# cell runs on a fresh kernel, top to bottom.

# Assortment: expand the single-letter codes ('a' -> basic, 'b' -> extended, anything else -> extra).
# NOTE(review): the public Rossmann data dictionary appears to list b = extra and
# c = extended -- confirm the label names before interpreting them.
assortment_labels = {'a': 'basic', 'b': 'extended'}
df_raw['assortment'] = df_raw['assortment'].map(lambda code: assortment_labels.get(code, 'extra'))

# State holiday: 'a'/'b'/'c' are named holidays, every other value is a regular day.
holiday_labels = {'a': 'public_holiday', 'b': 'easter_holiday', 'c': 'christmas'}
df_raw['state_holiday'] = df_raw['state_holiday'].map(lambda code: holiday_labels.get(code, 'regular_day'))

# Calendar parts of the sale date.
sale_date = df_raw['date']
df_raw['month'] = sale_date.dt.month
df_raw['year'] = sale_date.dt.year
df_raw['day'] = sale_date.dt.day
# Week of year is taken from df_raw as well: df1 is only created later, in data preparation.
df_raw['week_of_year'] = sale_date.dt.isocalendar().week

# is_promo: 1 when the sale month's abbreviation appears in the store's promo_interval
# string, 0 otherwise (including rows whose promo_interval was filled with 0).
month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12:'Dec'}
df_raw['month_map'] = sale_date.dt.month.map(month_map)
df_raw['is_promo'] = [
    1 if interval != 0 and str(month) in str(interval) else 0
    for month, interval in zip(df_raw['month_map'], df_raw['promo_interval'])
]

# competition_since: first day of the month in which the competitor opened,
# assembled column-wise from the year/month parts.
df_raw['competition_since'] = pd.to_datetime(pd.DataFrame({
    'year': df_raw['competition_open_since_year'],
    'month': df_raw['competition_open_since_month'],
    'day': 1,
}))

# Months with competitor: elapsed time in 30-day units, floored to whole units.
df_raw['competition_time_month'] = ((sale_date - df_raw['competition_since']) / 30).dt.days.astype('int64')

# promo_since: Monday ('-1' with %w) of the "<year>-<week>" key parsed with %W, shifted one
# week back. Each distinct key is parsed once and then mapped onto the rows.
# NOTE(review): the one-week shift is kept exactly as in the original -- confirm the intent.
promo_key = df_raw['promo2_since_year'].astype(str) + '-' + df_raw['promo2_since_week'].astype(str)
promo_start_by_key = {
    key: datetime.strptime(key + '-1', '%Y-%W-%w') - timedelta(days = 7)
    for key in promo_key.unique()
}
df_raw['promo_since'] = promo_key.map(promo_start_by_key)

# Weeks in promo2: elapsed time in 7-day units, floored to whole units.
df_raw['promo_time_week'] = ((sale_date - df_raw['promo_since']) / 7).dt.days.astype('int64')

# FEATURE FILTERING

## SELECTING ROWS

The `customers` column will be excluded, since the number of customers is not known at prediction time.
<br>
Rows in which the `open` column equals 0 (and rows with zero sales) will be deleted, since there are no sales when the store is closed.

In [274]:
# Keep only the days on which the store was open and actually sold something.
has_sales = df_raw['sales'] != 0
is_open = df_raw['open'] != 0
df_raw = df_raw[has_sales & is_open]

## SELECTING COLUMNS

In [275]:
# Remove the columns that are unavailable at prediction time or were only helpers for derived features.
unused_columns = ['open', 'customers', 'promo_interval', 'month_map']
df_raw = df_raw.drop(columns=unused_columns)

# EXPLORATORY DATA ANALYSIS

## UNIVARIATE ANALYSIS

### RESPONSE VARIABLE

### NUMERICAL VARIABLE

### CATEGORICAL VARIABLE

## BIVARIATE ANALYSIS

## MULTIVARIATE ANALYSIS

# DATA PREPARATION

## NORMALIZATION

## RESCALING

In [294]:
# Work on an independent copy so the scaling/encoding below leaves df_raw untouched.
df1 = df_raw.copy(deep=True)

In [295]:
# Min-max rescale each column independently; the single scaler object is re-fitted per
# column, so afterwards it holds the parameters of the last column only.
scaler = MinMaxScaler()

for column in ('year', 'promo_time_week'):
    df1[column] = scaler.fit_transform(df1[[column]])

In [296]:
# Robust (median/IQR-based) rescaling for the two heavy-tailed competition features.
# The scaler is created and used under one name (`r_scaler`); the original bound it to
# `r_scaler` but called `rscaler`, which raises NameError on a fresh kernel.
r_scaler = RobustScaler()

# The scaler is re-fitted for each column, so each feature gets its own centre and scale.
df1['competition_distance'] = r_scaler.fit_transform(df1[['competition_distance']])

df1['competition_time_month'] = r_scaler.fit_transform(df1[['competition_time_month']])

## ENCODING

In [297]:
# assortment: ordinal encoding of the three labels created during feature engineering.
assortment_dict = {'basic': 0, 'extended': 1, 'extra': 2}
df1['assortment'] = df1['assortment'].map(assortment_dict)

# state holiday: one-hot encoding into holiday_* columns.
# The dtype is pinned to uint8 so the dummies are numeric 0/1 on every pandas version
# (newer releases default to bool, which turns a later `.values` call into an object array).
df1 = pd.get_dummies(df1, prefix='holiday', columns=['state_holiday'], dtype='uint8')

# store type: integer label encoding.
# LabelEncoder works on a 1-D array, so pass the column itself rather than a one-column
# frame (the 2-D input only worked through an implicit, warning-raising conversion).
le = LabelEncoder()
df1['store_type'] = le.fit_transform(df1['store_type'])

## TRANSFORMATION

In [298]:
# Cyclical encoding: place each calendar feature on the unit circle as a (sin, cos) pair.
# The mapping gives the cycle length used for each feature.
# NOTE(review): with a cycle of 30, day 31 lands on the same point as day 1 -- confirm this is acceptable.
cycle_lengths = {
    'month': 12,
    'day': 30,
    'week_of_year': 52,
    'day_of_week': 7,
}

for feature, period in cycle_lengths.items():
    radians_per_unit = 2 * np.pi / period
    df1[f'{feature}_sin'] = df1[feature].apply(lambda value: np.sin(value * radians_per_unit))
    df1[f'{feature}_cos'] = df1[feature].apply(lambda value: np.cos(value * radians_per_unit))

# FEATURE SELECTION

### SPLIT DATAFRAME INTO TRAINING AND TEST

In [302]:
# Remove the raw calendar columns (now represented by their sin/cos pairs) and the two
# intermediate datetime columns.
cols_drop = ['day_of_week', 'week_of_year', 'day', 'month', 'promo_since', 'competition_since']
df1 = df1.drop(columns=cols_drop)

### BORUTA AS FEATURE SELECTOR

In [305]:
# Preview a single row to check which columns the prepared frame now contains.
df1.head(1)

Unnamed: 0,store,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,date,sales,promo,school_holiday,year,is_promo,competition_time_month,promo_time_week,holiday_christmas,holiday_easter_holiday,holiday_public_holiday,holiday_regular_day,month_sin,month_cos,day_sin,day_cos,week_of_year_sin,week_of_year_cos,day_of_week_sin,day_of_week_cos
0,1,2,0,-0.17097,9,2008,0,31,2015,2015-07-31,5263,1,1,1.0,0,0.91892,0.28702,0,0,0,1,-0.5,-0.86603,0.20791,0.97815,-0.56806,-0.82298,-0.97493,-0.22252


In [318]:
# Time-based hold-out: the last six weeks of data form the test set, everything up to
# and including the cut-off date is used for training.
split_date = df1['date'].max() - timedelta(weeks=6)

X_train = df1[df1['date'] <= split_date]
X_test = df1[df1['date'] > split_date]

# The target is still a column of the X_* frames here; it is dropped before fitting.
Y_train = X_train['sales']
Y_test = X_test['sales']

In [323]:
# Show the date range covered by each partition.
train_dates = X_train['date']
test_dates = X_test['date']
print(f"Train Min: {train_dates.min()}, Train Max: {train_dates.max()}")
print(f" Test Min: {test_dates.min()}, Test Max: {test_dates.max()}")

Train Min: 2013-01-01 00:00:00, Train Max: 2015-06-19 00:00:00
 Test Min: 2015-06-20 00:00:00, Test Max: 2015-07-31 00:00:00


In [328]:
# Boruta works on plain arrays: the feature matrix excludes the date and the target.
feature_frame = X_train.drop(columns=['date', 'sales'])
X_train_n = feature_frame.to_numpy()
Y_train_n = Y_train.to_numpy().ravel()

# Base estimator whose feature importances Boruta compares against shuffled copies.
rf = RandomForestRegressor(n_jobs=1)

# fit() returns the fitted selector itself, which is kept under the name `boruta`.
selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta = selector.fit(X_train_n, Y_train_n)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	17
Tentative: 	0
Rejected: 	10


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	17
Tentative: 	0
Rejected: 	10


# MACHINE LEARNING ( TRAINING MODEL )

# FINE TUNING

# ERROR ANALYSIS