In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import warnings
warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rcParams

rcParams['figure.figsize'] = 9, 4
rcParams['lines.linewidth'] = 3
rcParams['xtick.labelsize'] = 'x-large'
rcParams['ytick.labelsize'] = 'x-large'

In [3]:
df_new = pd.read_csv('df_new.csv')
df_new.head()

Unnamed: 0.1,Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,...,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Flight,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Product_importance_high,Product_importance_low,Product_importance_medium,Cost_of_the_Product_std,Weight_in_gms_std
0,0,1,D,Flight,4,2,177,3,low,0,...,1,0,1,0,0,0,1,0,-0.687866,-1.468816
1,1,2,F,Flight,4,5,216,2,low,1,...,0,1,1,0,0,0,1,0,0.122044,-0.334795
2,2,3,A,Flight,2,2,183,4,low,1,...,0,0,1,0,0,0,1,0,-0.563264,-0.159954
3,3,4,B,Flight,3,3,176,4,medium,1,...,0,0,1,0,0,0,0,1,-0.708633,-1.50305
4,4,5,C,Flight,2,2,184,3,medium,0,...,0,0,1,0,0,0,0,1,-0.542497,-0.704039


In [5]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9996 entries, 0 to 9995
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 9996 non-null   int64  
 1   ID                         9996 non-null   int64  
 2   Warehouse_block            9996 non-null   object 
 3   Mode_of_Shipment           9996 non-null   object 
 4   Customer_care_calls        9996 non-null   int64  
 5   Customer_rating            9996 non-null   int64  
 6   Cost_of_the_Product        9996 non-null   int64  
 7   Prior_purchases            9996 non-null   int64  
 8   Product_importance         9996 non-null   object 
 9   Gender                     9996 non-null   int64  
 10  Discount_offered           9996 non-null   int64  
 11  Weight_in_gms              9996 non-null   int64  
 12  Reached.on.Time_Y.N        9996 non-null   int64  
 13  Log_Discount_offered       9996 non-null   float

# 1. Modeling

### A. Split Data Train & Test 

In [12]:
X = df_new.drop(columns=['Unnamed: 0','ID','Warehouse_block', 'Product_importance','Mode_of_Shipment'])
y = df_new[['Reached.on.Time_Y.N']]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9996 entries, 0 to 9995
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer_care_calls        9996 non-null   int64  
 1   Customer_rating            9996 non-null   int64  
 2   Cost_of_the_Product        9996 non-null   int64  
 3   Prior_purchases            9996 non-null   int64  
 4   Gender                     9996 non-null   int64  
 5   Discount_offered           9996 non-null   int64  
 6   Weight_in_gms              9996 non-null   int64  
 7   Reached.on.Time_Y.N        9996 non-null   int64  
 8   Log_Discount_offered       9996 non-null   float64
 9   Warehouse_block_A          9996 non-null   int64  
 10  Warehouse_block_B          9996 non-null   int64  
 11  Warehouse_block_C          9996 non-null   int64  
 12  Warehouse_block_D          9996 non-null   int64  
 13  Warehouse_block_F          9996 non-null   int64

In [20]:
# Preprocessing Sample (Standardize)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

numerical_features = X.columns.to_list()
for n in numerical_features:
  scaler = ss.fit(X_train[[n]])
  X_train[n] = scaler.transform(X_train[[n]])
  X_test[n] = scaler.transform(X_test[[n]])

### B. Modeling

In [15]:
# Train linear regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

In [16]:
# Evaluate Linear Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
print('RMSE (test): ' + str(mean_squared_error(y_test, y_pred, squared=False)))
print('MAPE (test): ' + str(mean_absolute_percentage_error(y_test, y_pred)))
print('r2 (test): ' + str(r2_score(y_test, y_pred)))

RMSE (test): 1.2198259718524251e-15
MAPE (test): 1.2397465821940652
r2 (test): 1.0


In [17]:
# Linear Regression Coefficient
regressor.coef_

array([[ 3.05237626e-17, -7.63278329e-17, -1.56815024e-02,
         4.57966998e-16,  4.60785923e-17,  6.59194921e-16,
         2.45567475e-03,  4.91008242e-01,  5.44703171e-16,
         2.21752922e-02,  2.21218946e-02,  2.18347850e-02,
         2.23412498e-02,  2.81393053e-02, -1.25299180e-01,
        -1.24846707e-01, -1.59155982e-01, -1.63093942e-01,
        -2.93413861e-01, -2.90205615e-01,  1.56815024e-02,
        -2.45567475e-03]])

In [18]:
# Check overfit / underfit
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import cross_validate

def eval_regression(model):
  y_pred = model.predict(X_test)
  y_pred_train = model.predict(X_train)

  print('RMSE (test): ' + str(mean_squared_error(y_test, y_pred, squared=False)))
  print('RMSE (train): ' + str(mean_squared_error(y_train, y_pred_train, squared=False)))

  print('MAPE (test): ' + str(mean_absolute_percentage_error(y_test, y_pred)))
  print('MAPE (train): ' + str(mean_absolute_percentage_error(y_train, y_pred_train)))

  print('r2 (test): ' + str(r2_score(y_test, y_pred)))
  print('r2 (train): ' + str(r2_score(y_train, y_pred_train)))

  # k-fold cross validation
  score = cross_validate(model, X, y, cv=5, scoring='r2', return_train_score=True)
  print('r2 (cross-val test): ' + str(score['test_score'].mean()))
  print('r2 (cross-val train): ' + str(score['train_score'].mean()))

eval_regression(regressor)

RMSE (test): 1.2198259718524251e-15
RMSE (train): 1.2169456998311465e-15
MAPE (test): 1.2397465821940652
MAPE (train): 1.2783335715306565
r2 (test): 1.0
r2 (train): 1.0
r2 (cross-val test): 0.8
r2 (cross-val train): 1.0


In [19]:
# Regularization: Ridge Regression
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)
eval_regression(ridge)

RMSE (test): 7.780986669849581e-05
RMSE (train): 7.802291691917154e-05
MAPE (test): 151189051186.57352
MAPE (train): 155148904254.9754
r2 (test): 0.999999974675564
r2 (train): 0.9999999747496849
r2 (cross-val test): 0.7999996344887567
r2 (cross-val train): 0.9999996555781856


In [21]:
# Hyperparameter Tuning - Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

alpha = [0.001, 0.01]
solver = ['auto', 'cholesky', 'lsqr']
hyperparameters = dict(alpha=alpha, solver=solver)

from sklearn.linear_model import Ridge
ridge = Ridge()
rs = RandomizedSearchCV(ridge, hyperparameters, scoring='r2', cv=5, random_state=42)
rs.fit(X_train, y_train)
eval_regression(rs)

RMSE (test): 7.782523534550936e-08
RMSE (train): 7.803821340772751e-08
MAPE (test): 151215922.35095033
MAPE (train): 155176262.46176934
r2 (test): 0.9999999999999747
r2 (train): 0.9999999999999747
r2 (cross-val test): 0.799999999999634
r2 (cross-val train): 0.999999999999655


In [22]:
#Regularization - Lasso
from sklearn.linear_model import Lasso
lasso_model = Lasso()
lasso_model.fit(X_train, y_train)
eval_regression(lasso_model)

RMSE (test): 0.48905544670071543
RMSE (train): 0.49100824158919426
MAPE (test): 1058632065024837.9
MAPE (train): 1085768750787308.0
r2 (test): -0.0004298435434317849
r2 (train): 0.0
r2 (cross-val test): -0.016992732300568237
r2 (cross-val train): 0.1505874385762564


In [24]:
#Hyperparameter Tuning - Lasso
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

alpha = [0.02, 0.024, 0.025, 0.026, 0.03] # alpha or lambda
selection = ['cyclic', 'random']
hyperparameters = dict(alpha=alpha)

from sklearn.linear_model import Lasso
lasso_model = Lasso()
clf = RandomizedSearchCV(lasso_model, hyperparameters, cv=5, random_state=42, scoring='r2')

#Fitting Model
clf.fit(X_train, y_train)
eval_regression(clf)

RMSE (test): 0.019920457755161696
RMSE (train): 0.0199999999999989
MAPE (test): 43120745248528.43
MAPE (train): 44226090677136.23
r2 (test): 0.9983401491460037
r2 (train): 0.998340862315625
r2 (cross-val test): 0.7910990916599155
r2 (cross-val train): 0.9915072699974894


In [25]:
# Regularization - Elastic Net
from sklearn.linear_model import ElasticNet
elasticnet_model = ElasticNet()
elasticnet_model.fit(X_train, y_train)
eval_regression(elasticnet_model)

RMSE (test): 0.48905544670071543
RMSE (train): 0.49100824158919426
MAPE (test): 1058632065024837.9
MAPE (train): 1085768750787308.0
r2 (test): -0.0004298435434317849
r2 (train): 0.0
r2 (cross-val test): -0.00038714821620009496
r2 (cross-val train): 0.16649778672778764


In [26]:
# Hyperparameter Tuning - Elastic Net
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np

alpha = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]
l1_ratio = np.arange(0, 1, 0.01)
hyperparameters = dict(alpha=alpha, l1_ratio=l1_ratio, normalize=[True,False])

from sklearn.linear_model import ElasticNet
elasticnet_model = ElasticNet()
clf = RandomizedSearchCV(elasticnet_model, hyperparameters, cv=5, random_state=42, scoring='r2')
clf.fit(X_train, y_train)
eval_regression(clf)

RMSE (test): 4.524199729808756e-05
RMSE (train): 4.4070669629882024e-05
MAPE (test): 44435890009.11839
MAPE (train): 46102865020.9009
r2 (test): 0.9999999914384189
r2 (train): 0.9999999919439577
r2 (cross-val test): 0.7999999747272465
r2 (cross-val train): 0.9999999682968126


#### Non Linear Algorithm

In [27]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
eval_regression(dt)

RMSE (test): 0.0
RMSE (train): 0.0
MAPE (test): 0.0
MAPE (train): 0.0
r2 (test): 1.0
r2 (train): 1.0
r2 (cross-val test): 1.0
r2 (cross-val train): 1.0


In [28]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
eval_regression(rf)

RMSE (test): 0.0
RMSE (train): 0.0
MAPE (test): 0.0
MAPE (train): 0.0
r2 (test): 1.0
r2 (train): 1.0
r2 (cross-val test): 1.0
r2 (cross-val train): 1.0


In [None]:
#Support Vector Regressor
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, y_train)
eval_regression(svr)

RMSE (test): 0.057234658712868536
RMSE (train): 0.05276555412155305
MAPE (test): 80119705810423.94
MAPE (train): 78557259135056.86
r2 (test): 0.9862978526546308
r2 (train): 0.9884515567936053
