### Model Investigation

- Linear Regressor (baseline)
- KNeighborsRegressor
- DecisionTreeRegressor
- RandomForestRegressor
- SDGRegressor

In [1]:
# Imports

from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from code_files.data_preperation import prepare_for_train
from code_files.train import train
import pandas as pd
import numpy as np

In [6]:
# Load Dataset
df_amazon = pd.read_csv("dataset/eda_amazon_sales_report.csv")
df_amazon.info()

df_amazon.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117123 entries, 0 to 117122
Data columns (total 24 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   Unnamed: 0                           117123 non-null  int64  
 1   Size                                 117123 non-null  int64  
 2   Qty                                  117123 non-null  int64  
 3   Amount                               117123 non-null  float64
 4   promotion-ids                        117123 non-null  int64  
 5   B2B                                  117123 non-null  int64  
 6   Status_Cancelled                     117123 non-null  bool   
 7   Status_Shipped                       117123 non-null  bool   
 8   Status_Shipped - Delivered to Buyer  117123 non-null  bool   
 9   Fulfilment_Amazon                    117123 non-null  bool   
 10  Fulfilment_Merchant                  117123 non-null  bool   
 11  ship-service-

Index(['Unnamed: 0', 'Size', 'Qty', 'Amount', 'promotion-ids', 'B2B',
       'Status_Cancelled', 'Status_Shipped',
       'Status_Shipped - Delivered to Buyer', 'Fulfilment_Amazon',
       'Fulfilment_Merchant', 'ship-service-level_Expedited',
       'ship-service-level_Standard', 'Category_Blouse', 'Category_Bottom',
       'Category_Dupatta', 'Category_Ethnic Dress', 'Category_Saree',
       'Category_Set', 'Category_Top', 'Category_Western Dress',
       'Category_kurta', 'Month', 'Day'],
      dtype='object')

In [3]:
# CONSTANTS
MODELS = [
    LinearRegression(),
    Ridge(alpha=1.0, solver='auto', max_iter=1000, tol=1e-4),
    SGDRegressor(max_iter=500, tol=1e-4, learning_rate='invscaling', eta0=0.01),
    KNeighborsRegressor(n_neighbors=5, algorithm='auto', weights='distance'),
    DecisionTreeRegressor(max_depth=20, min_samples_split=5, min_samples_leaf=3),
    RandomForestRegressor(n_estimators=100, max_depth=20, min_samples_split=5, min_samples_leaf=3)
]

In [5]:
from sklearn.model_selection import train_test_split
from code_files.data_preperation import prepare_for_train

dftrain, dfdev = train_test_split(df_amazon, test_size=0.1, random_state=42)
Xtrain, ytrain, Xdev, ydev = prepare_for_train(dftrain, dfdev)

results = []
for model in MODELS:
    print(f"Model: {model.__class__.__name__}")
    _, metrics = train(model, Xtrain, ytrain, Xdev, ydev)
    metrics.insert(0, model.__class__.__name__)
    results.append(metrics)

df_results = pd.DataFrame(results, columns = ["model", "mae", "rmse", "r2"])
df_results.to_csv("dataset/results/five-model-results.csv", index=False)
df_results



Model: LinearRegression
Model: Ridge
Model: SGDRegressor
Model: KNeighborsRegressor
Model: DecisionTreeRegressor
Model: RandomForestRegressor


Unnamed: 0,model,mae,rmse,r2
0,LinearRegression,214.817551,279.762721,279.762721
1,Ridge,214.817589,279.762692,279.762692
2,SGDRegressor,214.800095,279.966994,279.966994
3,KNeighborsRegressor,233.769973,304.018604,304.018604
4,DecisionTreeRegressor,227.217507,297.178812,297.178812
5,RandomForestRegressor,214.57327,280.303364,280.303364
