In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
df = pd.read_csv('supermarket_sales - Sheet1.csv')

In [3]:
df.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Invoice ID               1000 non-null   object 
 1   Branch                   1000 non-null   object 
 2   City                     1000 non-null   object 
 3   Customer type            1000 non-null   object 
 4   Gender                   1000 non-null   object 
 5   Product line             1000 non-null   object 
 6   Unit price               1000 non-null   float64
 7   Quantity                 1000 non-null   int64  
 8   Tax 5%                   1000 non-null   float64
 9   Total                    1000 non-null   float64
 10  Date                     1000 non-null   object 
 11  Time                     1000 non-null   object 
 12  Payment                  1000 non-null   object 
 13  cogs                     1000 non-null   float64
 14  gross margin percentage  

In [5]:
df.isnull().sum()

Invoice ID                 0
Branch                     0
City                       0
Customer type              0
Gender                     0
Product line               0
Unit price                 0
Quantity                   0
Tax 5%                     0
Total                      0
Date                       0
Time                       0
Payment                    0
cogs                       0
gross margin percentage    0
gross income               0
Rating                     0
dtype: int64

In [6]:
# The function preprocess_inputs takes a DataFrame, removes the 'Invoice ID' column, and separates the 'Rating' column as the target variable ('y') 
# and the remaining data as the feature set ('X'). It returns these two parts.

def preprocess_inputs(data):
    data = data.copy()
    
    # Drop ID column
    data = data.drop('Invoice ID', axis=1)
    
    # Split df into X and y
    y = data['Rating']
    x = data.drop('Rating', axis=1)
    
    return x, y

In [7]:
# The code processes input data, creating feature set (X) and target (y). It then splits them into 70% training and 30% testing sets, 
# ensuring shuffling for randomness and a reproducible split using random state (1).

x, y = preprocess_inputs(df)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.7, shuffle=True, random_state=1)

In [8]:
X_train

Unnamed: 0,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income
731,A,Yangon,Normal,Male,Health and beauty,56.00,3,8.4000,176.4000,2/28/2019,19:33,Ewallet,168.00,4.761905,8.4000
716,A,Yangon,Member,Female,Fashion accessories,71.46,7,25.0110,525.2310,3/28/2019,16:06,Ewallet,500.22,4.761905,25.0110
640,B,Mandalay,Member,Female,Food and beverages,98.79,3,14.8185,311.1885,2/23/2019,20:00,Ewallet,296.37,4.761905,14.8185
804,B,Mandalay,Member,Female,Electronic accessories,75.59,9,34.0155,714.3255,2/23/2019,11:12,Cash,680.31,4.761905,34.0155
737,C,Naypyitaw,Normal,Male,Electronic accessories,58.76,10,29.3800,616.9800,1/29/2019,14:26,Ewallet,587.60,4.761905,29.3800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,B,Mandalay,Normal,Male,Sports and travel,13.69,6,4.1070,86.2470,2/13/2019,13:59,Cash,82.14,4.761905,4.1070
72,B,Mandalay,Member,Female,Food and beverages,48.52,3,7.2780,152.8380,3/5/2019,18:17,Ewallet,145.56,4.761905,7.2780
908,A,Yangon,Member,Female,Food and beverages,79.54,2,7.9540,167.0340,3/27/2019,16:30,Ewallet,159.08,4.761905,7.9540
235,A,Yangon,Normal,Female,Sports and travel,93.14,2,9.3140,195.5940,1/20/2019,18:09,Ewallet,186.28,4.761905,9.3140


In [9]:
X_train.shape

(700, 15)

In [12]:
# Categorize our features

binary_features = [
    'Customer type',
    'Gender'
]

date_features = [
    'Date'
]

time_features = [
    'Time'
]

nominal_features = [
    'Branch',
    'City',
    'Product line',
    'Payment',
]

In [13]:
# The DateEncoder class is designed to preprocess date columns in a DataFrame. The fit method is a placeholder,
# and the transform method converts date columns into separate year, month, and day columns. 
#The original date column is then dropped. This process facilitates more structured time-based analysis.
class DateEncoder:
    
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        for column in X.columns:
            X[column] = pd.to_datetime(X[column])
            X[column + '_year'] = X[column].apply(lambda x: x.year)
            X[column + '_month'] = X[column].apply(lambda x: x.month)
            X[column + '_day'] = X[column].apply(lambda x: x.day)
            X = X.drop(column, axis=1)
        return X

In [16]:
# The TimeEncoder class handles time columns in a DataFrame. The fit method is a placeholder, 
# and the transform method converts time columns into separate hour and minute columns after converting to datetime. 
# The original time column is then dropped, enhancing analysis with time-based attributes.

class TimeEncoder:
    
    def fit(self, X, y):
        return self
        
    def transform(self, X):
        for column in X.columns:
            X[column] = pd.to_datetime(X[column])
            X[column + '_hour'] = X[column].apply(lambda x: x.hour)
            X[column + '_minute'] = X[column].apply(lambda x: x.minute)
            X = X.drop(column, axis=1)
        return X

In [17]:
# Construct transformer pipelines for each feature type

binary_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder())
])

date_transformer = Pipeline(steps=[
    ('date', DateEncoder())
])

time_transformer = Pipeline(steps=[
    ('time', TimeEncoder())
])

nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])

In [18]:
# Combine transformers with ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('binary', binary_transformer, binary_features),
    ('date', date_transformer, date_features),
    ('time', time_transformer, time_features),
    ('nomnal', nominal_transformer, nominal_features),
])

In [19]:
# Training

In [20]:
# Define models

models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}

In [21]:
# Make a scaler

scaler = StandardScaler()

for name, model in models.items():
    # Construct the final pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', scaler),
        ('regressor', model)
    ])
    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


In [22]:
# Result 

In [23]:
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', scaler),
        ('regressor', model)
    ])
    print(name + " R^2 Score: {:.5f}".format(pipeline.score(X_test, y_test)))

                     Linear Regression R^2 Score: -0.01211
 Linear Regression (L2 Regularization) R^2 Score: -0.01449
 Linear Regression (L1 Regularization) R^2 Score: -0.00059
                   K-Nearest Neighbors R^2 Score: -0.13735
                        Neural Network R^2 Score: -0.08724
Support Vector Machine (Linear Kernel) R^2 Score: -0.04599
   Support Vector Machine (RBF Kernel) R^2 Score: -0.08280
                         Decision Tree R^2 Score: -0.93852
                         Random Forest R^2 Score: -0.04606
                     Gradient Boosting R^2 Score: -0.13559
                               XGBoost R^2 Score: -0.24937
                              LightGBM R^2 Score: -0.17964
                              CatBoost R^2 Score: -0.15741
