In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
df = pd.read_csv('../datasets/sales_data.csv')

In [3]:
df.head()

Unnamed: 0,OrderNumber,OrderDate,WarehouseCode,Sales Channel,StoreID,SalesTeamID,CustomerID,ProductID,Order_Quantity,Discount_Applied,Unit_Price,Unit_Cost
0,SO - 000101,5/31/2018,WARE-UHY1004,1,259,6,15,12,5,0.075,1963.1,1001.18
1,SO - 000102,5/31/2018,WARE-NMK1003,2,196,14,20,27,3,0.075,3939.6,3348.66
2,SO - 000103,5/31/2018,WARE-UHY1004,3,213,21,16,16,1,0.05,1775.5,781.22
3,SO - 000104,5/31/2018,WARE-NMK1003,4,107,28,48,23,8,0.075,2324.9,1464.69
4,SO - 000105,5/31/2018,WARE-NMK1003,3,111,22,49,26,8,0.1,1822.4,1476.14


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7991 entries, 0 to 7990
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   OrderNumber       7991 non-null   object 
 1   OrderDate         7991 non-null   object 
 2   WarehouseCode     7991 non-null   object 
 3   Sales Channel     7991 non-null   int64  
 4   StoreID           7991 non-null   int64  
 5   SalesTeamID       7991 non-null   int64  
 6   CustomerID        7991 non-null   int64  
 7   ProductID         7991 non-null   int64  
 8   Order_Quantity    7991 non-null   int64  
 9   Discount_Applied  7991 non-null   float64
 10  Unit_Price        7991 non-null   float64
 11  Unit_Cost         7991 non-null   float64
dtypes: float64(3), int64(6), object(3)
memory usage: 749.3+ KB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sales Channel,7991.0,1.982856,1.015687,1.0,1.0,2.0,3.0,4.0
StoreID,7991.0,183.850081,105.903946,1.0,91.0,183.0,276.0,367.0
SalesTeamID,7991.0,14.384307,7.986086,1.0,8.0,14.0,21.0,28.0
CustomerID,7991.0,25.457014,14.414883,1.0,13.0,25.0,38.0,50.0
ProductID,7991.0,23.771743,13.526545,1.0,12.0,24.0,36.0,47.0
Order_Quantity,7991.0,4.525341,2.312631,1.0,3.0,5.0,7.0,8.0
Discount_Applied,7991.0,0.114394,0.08557,0.05,0.05,0.075,0.15,0.4
Unit_Price,7991.0,2284.536504,1673.096364,167.5,1031.8,1849.2,3611.3,6566.0
Unit_Cost,7991.0,1431.911513,1112.413063,68.68,606.12,1080.58,2040.25,5498.56


In [6]:
df.isnull().values.any()

False

In [7]:
df.columns

Index(['OrderNumber', 'OrderDate', 'WarehouseCode', 'Sales Channel', 'StoreID',
       'SalesTeamID', 'CustomerID', 'ProductID', 'Order_Quantity',
       'Discount_Applied', 'Unit_Price', 'Unit_Cost'],
      dtype='object')

In [8]:
train_cols = ['Sales Channel', 'StoreID',
       'SalesTeamID', 'CustomerID', 'ProductID',
       'Discount_Applied', 'Unit_Price', 'Unit_Cost']

train_cols_few = ['Sales Channel', 'StoreID',
       'SalesTeamID', 'CustomerID', 'ProductID',
       'Discount_Applied', 'Unit_Price', 'Unit_Cost']

x = df[train_cols_few]
y = df['Order_Quantity']

In [9]:
x

Unnamed: 0,Sales Channel,StoreID,SalesTeamID,CustomerID,ProductID,Discount_Applied,Unit_Price,Unit_Cost
0,1,259,6,15,12,0.075,1963.1,1001.18
1,2,196,14,20,27,0.075,3939.6,3348.66
2,3,213,21,16,16,0.050,1775.5,781.22
3,4,107,28,48,23,0.075,2324.9,1464.69
4,3,111,22,49,26,0.100,1822.4,1476.14
...,...,...,...,...,...,...,...,...
7986,1,339,9,41,29,0.075,234.5,121.94
7987,2,202,14,29,3,0.050,3202.6,1921.56
7988,2,241,14,32,35,0.200,3825.7,2792.76
7989,2,112,20,42,36,0.100,1072.0,804.00


In [10]:
y

0       5
1       3
2       1
3       8
4       8
       ..
7986    1
7987    6
7988    5
7989    8
7990    5
Name: Order_Quantity, Length: 7991, dtype: int64

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [12]:
# define the model
model = GradientBoostingRegressor()

In [13]:
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [14]:
# evaluate the model
n_scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

In [15]:
# report performance
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: -2.023 (0.029)
