# Importing Libraries and DB

In [1]:
# for dataset manipulation and cleaning
import pandas as pd

# seting, fiting, training, testing models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost
import pickle

#visual analization of variables and correlations
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
# db first impression

sales = pd.read_csv('Sales.csv')
sales.head()

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,Sales
0,0,625,3,2013-11-06,641,1,1,0,0,7293
1,1,293,2,2013-07-16,877,1,1,0,1,7060
2,2,39,4,2014-01-23,561,1,1,0,0,4565
3,3,676,4,2013-09-26,1584,1,1,0,0,6380
4,4,709,3,2014-01-22,1477,1,1,0,0,11647


# Data Cleaning and Formatting

In [3]:
# check df types
sales.dtypes

True_index              int64
Store_ID                int64
Day_of_week             int64
Date                   object
Nb_customers_on_day     int64
Open                    int64
Promotion               int64
State_holiday          object
School_holiday          int64
Sales                   int64
dtype: object

In [4]:
# creating separate columns to the 'date' column and then dropping it

sales[['Year', 'Month', 'Day']] = sales['Date'].str.split('-', expand=True)

sales.drop(['Date'], axis = 1, inplace = True)

In [5]:
# also, column 'State_holiday' has 3 different holidays than may impact the sales somehow, depending, for example
# seasons, promotions, introductions of new products. won't do dummies for weekdays or sepecific month days because
# even though that may have some impact, that i agree, i don't believe that will make much of difference comapring
# to state holidays. of course, doing it this would safeguard that possibility but i opted not

dummy_variables = pd.get_dummies(sales['State_holiday'], prefix='State_holiday')
dummy_variables = dummy_variables.astype(int)
sales = pd.concat([sales, dummy_variables], axis=1)
sales.drop(['State_holiday'],axis = 1, inplace = True)
sales.head()

Unnamed: 0,True_index,Store_ID,Day_of_week,Nb_customers_on_day,Open,Promotion,School_holiday,Sales,Year,Month,Day,State_holiday_0,State_holiday_a,State_holiday_b,State_holiday_c
0,0,625,3,641,1,1,0,7293,2013,11,6,1,0,0,0
1,1,293,2,877,1,1,1,7060,2013,7,16,1,0,0,0
2,2,39,4,561,1,1,0,4565,2014,1,23,1,0,0,0
3,3,676,4,1584,1,1,0,6380,2013,9,26,1,0,0,0
4,4,709,3,1477,1,1,0,11647,2014,1,22,1,0,0,0


In [6]:
#rechecking
sales.dtypes

True_index              int64
Store_ID                int64
Day_of_week             int64
Nb_customers_on_day     int64
Open                    int64
Promotion               int64
School_holiday          int64
Sales                   int64
Year                   object
Month                  object
Day                    object
State_holiday_0         int64
State_holiday_a         int64
State_holiday_b         int64
State_holiday_c         int64
dtype: object

In [7]:
# true index doesn't not correspond to the actual value of the row and
# doesnt have any relevant value to use later we'll drop it

sales.drop(['True_index'], axis = 1, inplace = True)

sales.head()

Unnamed: 0,Store_ID,Day_of_week,Nb_customers_on_day,Open,Promotion,School_holiday,Sales,Year,Month,Day,State_holiday_0,State_holiday_a,State_holiday_b,State_holiday_c
0,625,3,641,1,1,0,7293,2013,11,6,1,0,0,0
1,293,2,877,1,1,1,7060,2013,7,16,1,0,0,0
2,39,4,561,1,1,0,4565,2014,1,23,1,0,0,0
3,676,4,1584,1,1,0,6380,2013,9,26,1,0,0,0
4,709,3,1477,1,1,0,11647,2014,1,22,1,0,0,0


In [8]:
#check the nulls
sales.isnull().sum()

Store_ID               0
Day_of_week            0
Nb_customers_on_day    0
Open                   0
Promotion              0
School_holiday         0
Sales                  0
Year                   0
Month                  0
Day                    0
State_holiday_0        0
State_holiday_a        0
State_holiday_b        0
State_holiday_c        0
dtype: int64

In [9]:
# since the number of customer per day is a continuous variable, we'll normalize it and drop the original column
normalizer = MinMaxScaler()

sales["Nb_customers_on_day_norm"] = normalizer.fit_transform(sales[["Nb_customers_on_day"]])
sales.drop(["Nb_customers_on_day"], axis=1, inplace = True)
sales.head()

Unnamed: 0,Store_ID,Day_of_week,Open,Promotion,School_holiday,Sales,Year,Month,Day,State_holiday_0,State_holiday_a,State_holiday_b,State_holiday_c,Nb_customers_on_day_norm
0,625,3,1,1,0,7293,2013,11,6,1,0,0,0,0.117442
1,293,2,1,1,1,7060,2013,7,16,1,0,0,0,0.160682
2,39,4,1,1,0,4565,2014,1,23,1,0,0,0,0.102785
3,676,4,1,1,0,6380,2013,9,26,1,0,0,0,0.290216
4,709,3,1,1,0,11647,2014,1,22,1,0,0,0,0.270612


In [15]:
# ill try a little thing: drop the year, day, month column to understand if the model improves. there are some days of the month
# where say might be higher but in the majority that doesnt occur, so probably it's better to drop it

features = sales.drop(['Sales'], axis = 1)
target = sales['Sales']

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

# Modelling and Testing Models

In [10]:
# ill try a little thing: drop the year, day, month column to understand if the model improves
# if so, drop the three
features = sales.drop(['Sales'], axis = 1)
target = sales['Sales']

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

In [11]:
## lets take a look at correlations to see if there any type of feature engineering we can do and refine our selection

In [13]:
# month, year and day look that they year too correlated and may not help the prediction model, so, i'll drop it
# and set a new test, test sets
sales.drop(['Year', 'Day', 'Month'], axis = 1, inplace = True)

features = sales.drop(['Sales'], axis = 1)
target = sales['Sales']

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state = 0)

In [24]:
forest = RandomForestRegressor(n_estimators = 175,
                               max_depth = 50,
                               random_state = 1)

forest.fit(x_train,y_train)

print("r**2: ", forest.score(x_test,y_test))

pred = forest.predict(x_test)
print("rmse: ", np.sqrt(mean_squared_error(pred, y_test)))

forest.score(x_train,y_train)

r**2:  0.9664848276622275


<IPython.core.display.Javascript object>

rmse:  705.585318881693


0.995078347757956

In [18]:
## lets try with a new model

xgbreg = xgboost.XGBRegressor()
xgbreg.fit(x_train, y_train)

print("r**2: ", xgbreg.score(x_test,y_test))
print("rmse: ", np.sqrt(mean_squared_error(pred, y_test)))

xgbreg.score(x_train,y_train)

r**2:  0.9289033613657375


<IPython.core.display.Javascript object>

rmse:  705.9158947336015


0.9316045065634283

In [19]:
# even though there's more variance in the first, it's bias is lower given the score being higher in the test score
# so, we decided for the forest

# Hyperparameters & Fine Tuning

In [20]:
# lets find the hyperparameters for the selected model: random forest

In [22]:
from sklearn.model_selection import GridSearchCV

grid = {'max_depth' : [50, 75, 100],
       'n_estimators' : [100,130,175]}

grid_search = GridSearchCV(estimator = forest, param_grid = grid, cv = 5)
grid_search.fit(x_train, y_train)

In [23]:
grid_search.best_params_

{'max_depth': 50, 'n_estimators': 175}

## Model Save

In [27]:
with open('salespred.pkl', 'wb') as file:
    pickle.dump(forest, file)

<IPython.core.display.Javascript object>