In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [2]:
df = pd.read_csv('../datasets/sales_data.csv')

In [3]:
df.head()

Unnamed: 0,OrderNumber,OrderDate,WarehouseCode,Sales Channel,StoreID,SalesTeamID,CustomerID,ProductID,Order_Quantity,Discount_Applied,Unit_Price,Unit_Cost
0,SO - 000101,5/31/2018,WARE-UHY1004,1,259,6,15,12,5,0.075,1963.1,1001.18
1,SO - 000102,5/31/2018,WARE-NMK1003,2,196,14,20,27,3,0.075,3939.6,3348.66
2,SO - 000103,5/31/2018,WARE-UHY1004,3,213,21,16,16,1,0.05,1775.5,781.22
3,SO - 000104,5/31/2018,WARE-NMK1003,4,107,28,48,23,8,0.075,2324.9,1464.69
4,SO - 000105,5/31/2018,WARE-NMK1003,3,111,22,49,26,8,0.1,1822.4,1476.14


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7991 entries, 0 to 7990
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   OrderNumber       7991 non-null   object 
 1   OrderDate         7991 non-null   object 
 2   WarehouseCode     7991 non-null   object 
 3   Sales Channel     7991 non-null   int64  
 4   StoreID           7991 non-null   int64  
 5   SalesTeamID       7991 non-null   int64  
 6   CustomerID        7991 non-null   int64  
 7   ProductID         7991 non-null   int64  
 8   Order_Quantity    7991 non-null   int64  
 9   Discount_Applied  7991 non-null   float64
 10  Unit_Price        7991 non-null   float64
 11  Unit_Cost         7991 non-null   float64
dtypes: float64(3), int64(6), object(3)
memory usage: 749.3+ KB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sales Channel,7991.0,1.982856,1.015687,1.0,1.0,2.0,3.0,4.0
StoreID,7991.0,183.850081,105.903946,1.0,91.0,183.0,276.0,367.0
SalesTeamID,7991.0,14.384307,7.986086,1.0,8.0,14.0,21.0,28.0
CustomerID,7991.0,25.457014,14.414883,1.0,13.0,25.0,38.0,50.0
ProductID,7991.0,23.771743,13.526545,1.0,12.0,24.0,36.0,47.0
Order_Quantity,7991.0,4.525341,2.312631,1.0,3.0,5.0,7.0,8.0
Discount_Applied,7991.0,0.114394,0.08557,0.05,0.05,0.075,0.15,0.4
Unit_Price,7991.0,2284.536504,1673.096364,167.5,1031.8,1849.2,3611.3,6566.0
Unit_Cost,7991.0,1431.911513,1112.413063,68.68,606.12,1080.58,2040.25,5498.56


In [6]:
df.isnull().values.any()

False

In [7]:
df.columns

Index(['OrderNumber', 'OrderDate', 'WarehouseCode', 'Sales Channel', 'StoreID',
       'SalesTeamID', 'CustomerID', 'ProductID', 'Order_Quantity',
       'Discount_Applied', 'Unit_Price', 'Unit_Cost'],
      dtype='object')

In [8]:
train_cols = ['Sales Channel', 'StoreID',
       'SalesTeamID', 'CustomerID', 'ProductID',
       'Discount_Applied', 'Unit_Price', 'Unit_Cost']

train_cols_few = ['Sales Channel', 'StoreID',
       'SalesTeamID', 'CustomerID', 'ProductID',
       'Discount_Applied', 'Unit_Price', 'Unit_Cost']

x = df[train_cols_few]
y = df['Order_Quantity']

In [23]:
x.head()

Unnamed: 0,Sales Channel,StoreID,SalesTeamID,CustomerID,ProductID,Discount_Applied,Unit_Price,Unit_Cost
0,1,259,6,15,12,0.075,1963.1,1001.18
1,2,196,14,20,27,0.075,3939.6,3348.66
2,3,213,21,16,16,0.05,1775.5,781.22
3,4,107,28,48,23,0.075,2324.9,1464.69
4,3,111,22,49,26,0.1,1822.4,1476.14


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [10]:
x_train_scale = StandardScaler().fit_transform(x_train)
x_test_scale = StandardScaler().fit_transform(x_test)

In [11]:
space = {
    'max_depth': hp.quniform('max_depth', 3, 12, 1), 
    'gamma': hp.uniform('gamma', 1, 9), 
    'reg_alpha': hp.quniform('reg_alpha', 40, 100, 1), 
    'reg_lambda': hp.uniform('reg-lambda', 0, 1), 
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10,  1),
    'n_estimators': 30,
    'seed': 0,
    'use_label_encoder' : False
}

In [12]:
def objective(space): 
    clf = XGBRegressor(
        n_estimators = space['n_estimators'], 
        max_depth = int(space['max_depth']),
        gamma = space['gamma'],
        reg_alpha = int(space['reg_alpha']),
        reg_lambda = int(space['reg_lambda']),
        min_child_weight = int(space['min_child_weight']),
        colsample_bytree = int(space['colsample_bytree']),
        use_label_encoder= space['use_label_encoder']
    )
    
    evaluation = [(x_train_scale, y_train), (x_test_scale, y_test)]
    
    clf.fit(x_train_scale, y_train, eval_set=evaluation, eval_metric='auc', early_stopping_rounds=10, verbose=False)
    
    pred = clf.predict(x_test_scale)
    
    accuracy = accuracy_score(y_test, pred>0.5)
    
    print('Score: ', accuracy)
    
    return {'loss' : -accuracy, 'status' : STATUS_OK}

In [13]:
trials = Trials()

best_hyperparameters = fmin(
    fn = objective,
    space = space,
    algo = tpe.suggest,
    max_evals = 10,
    trials = trials
)

Score:                                                
0.1305254378648874                                    
Score:                                                                           
0.1305254378648874                                                               
Score:                                                                           
0.1305254378648874                                                               
Score:                                                                           
0.1305254378648874                                                               
Score:                                                                           
0.1305254378648874                                                               
Score:                                                                           
0.1305254378648874                                                               
Score:                                                                

In [14]:
print(best_hyperparameters)

{'colsample_bytree': 0.8067542162741835, 'gamma': 2.027788825399706, 'max_depth': 5.0, 'min_child_weight': 10.0, 'reg-lambda': 0.4476512935712229, 'reg_alpha': 65.0}


In [17]:
xgb_md = XGBRegressor(
    colsample_bytree = 0.97,
    gamma = 4.19,
    max_depth = 4, 
    min_child_weight = 8,
    reg_lambda = 0.12,
    reg_alpha = 58,
    use_label_encoder=False,
    #objective='reg:logistic'
)

In [18]:
xgb_md.fit(x_train_scale, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.97, gamma=4.19, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=4,
             min_child_weight=8, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=58, reg_lambda=0.12, scale_pos_weight=1, subsample=1,
             tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=None)

In [19]:
xg_preds = xgb_md.predict(x_test_scale)

In [20]:
xgb_rmse = np.sqrt(mean_squared_error(y_test, xg_preds))

In [21]:
xgb_rmse

2.34613263477218

In [29]:
single_instance  = [1, 259, 6, 15, 12, 0.075, 1963.1, 1001.18]

yhat = xgb_md.predict([single_instance])
# summarize prediction
print('Prediction: %d' % yhat)