In [210]:
### import libraries
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import warnings
import pickle


# Data Loading

In [211]:
# show the dataset structure
df=pd.read_csv("./dataset/TRAIN.csv")
df.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [212]:
df.tail()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
188335,T1188336,149,S2,L3,R2,2019-05-31,1,Yes,51,37272.0
188336,T1188337,153,S4,L2,R1,2019-05-31,1,No,90,54572.64
188337,T1188338,154,S1,L3,R2,2019-05-31,1,No,56,31624.56
188338,T1188339,155,S3,L1,R2,2019-05-31,1,Yes,70,49162.41
188339,T1188340,152,S2,L1,R1,2019-05-31,1,No,47,37977.0


In [213]:
# show the dataset columns
df.columns

Index(['ID', 'Store_id', 'Store_Type', 'Location_Type', 'Region_Code', 'Date',
       'Holiday', 'Discount', '#Order', 'Sales'],
      dtype='object')

In [214]:
# reformat the column names
df.columns=df.columns.str.lstrip("#").str.lower()
df.head(2)

Unnamed: 0,id,store_id,store_type,location_type,region_code,date,holiday,discount,order,sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12


In [215]:
# remove the id column
df.drop("id",axis=1,inplace=True)
df.head(2)

Unnamed: 0,store_id,store_type,location_type,region_code,date,holiday,discount,order,sales
0,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12


In [216]:
col=["store_type","location_type","region_code","holiday","discount"]

for i in col:
    print(df[i].value_counts())
    print("*"*30)

S1    88752
S4    45924
S2    28896
S3    24768
Name: store_type, dtype: int64
******************************
L1    85140
L2    48504
L3    29928
L5    13932
L4    10836
Name: location_type, dtype: int64
******************************
R1    63984
R2    54180
R3    44376
R4    25800
Name: region_code, dtype: int64
******************************
0    163520
1     24820
Name: holiday, dtype: int64
******************************
No     104051
Yes     84289
Name: discount, dtype: int64
******************************


# Data Preprocessing

In [217]:
df.isna().sum()

store_id         0
store_type       0
location_type    0
region_code      0
date             0
holiday          0
discount         0
order            0
sales            0
dtype: int64

# Encoding

In [218]:
# transfer non numeric columns to numeric columns
dms=pd.get_dummies(df[["discount","region_code","location_type","store_type"]])
dms

Unnamed: 0,discount_No,discount_Yes,region_code_R1,region_code_R2,region_code_R3,region_code_R4,location_type_L1,location_type_L2,location_type_L3,location_type_L4,location_type_L5,store_type_S1,store_type_S2,store_type_S3,store_type_S4
0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0
1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1
2,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0
3,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188335,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0
188336,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1
188337,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0
188338,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0


In [219]:
df.drop(["discount","region_code","location_type","store_type"],
        axis=1,inplace=True)
df.head()

Unnamed: 0,store_id,date,holiday,order,sales
0,1,2018-01-01,1,9,7011.84
1,253,2018-01-01,1,60,51789.12
2,252,2018-01-01,1,42,36868.2
3,251,2018-01-01,1,23,19715.16
4,250,2018-01-01,1,62,45614.52


In [220]:
df=pd.concat([df,dms],axis=1)
df.head()

Unnamed: 0,store_id,date,holiday,order,sales,discount_No,discount_Yes,region_code_R1,region_code_R2,region_code_R3,region_code_R4,location_type_L1,location_type_L2,location_type_L3,location_type_L4,location_type_L5,store_type_S1,store_type_S2,store_type_S3,store_type_S4
0,1,2018-01-01,1,9,7011.84,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0
1,253,2018-01-01,1,60,51789.12,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1
2,252,2018-01-01,1,42,36868.2,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0
3,251,2018-01-01,1,23,19715.16,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0
4,250,2018-01-01,1,62,45614.52,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0


In [221]:
df.drop("discount_Yes",axis=1,inplace=True)
df.drop("date",axis=1,inplace=True)
df.drop("sales",axis=1,inplace=True)
df.head()

Unnamed: 0,store_id,holiday,order,discount_No,region_code_R1,region_code_R2,region_code_R3,region_code_R4,location_type_L1,location_type_L2,location_type_L3,location_type_L4,location_type_L5,store_type_S1,store_type_S2,store_type_S3,store_type_S4
0,1,1,9,0,1,0,0,0,0,0,1,0,0,1,0,0,0
1,253,1,60,0,1,0,0,0,0,1,0,0,0,0,0,0,1
2,252,1,42,0,1,0,0,0,0,1,0,0,0,0,0,1,0
3,251,1,23,0,1,0,0,0,0,0,1,0,0,0,1,0,0
4,250,1,62,0,0,0,0,1,0,0,1,0,0,0,1,0,0


# Modelling

In [222]:
X=df.drop("order",axis=1)
y=df["order"]

In [223]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [224]:
### Random Forest Regressor Fit
rf=RandomForestRegressor().fit(X_train,y_train)

In [225]:
accuary=rf.score(X_train,y_train)
print("Train Accuracy: ",accuary)

Train Accuracy:  0.7336066745404444


# Conclusion
We can see the Accuracy is over 70%. Let dump the model to use it later.

In [226]:
pickle.dump(rf,open('model.pkl','wb'))