In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 

import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Read the semicolon-delimited sales file and preview its first rows.
data = pd.read_csv('Business_sales.csv', delimiter=';')
data.head()

Unnamed: 0,Product ID,Product Position,Promotion,Product Category,Seasonal,Sales Volume,brand,url,name,description,price,currency,terms,section,season,material,origin
0,185102,Aisle,Yes,clothing,Yes,1243,Zara,https://www.zara.com/us/en/basic-puffer-jacket...,BASIC PUFFER JACKET,Puffer jacket made of tear-resistant ripstop f...,78.99,USD,jackets,MAN,Winter,Polyester,Brazil
1,188771,Aisle,Yes,clothing,No,1429,Zara,https://www.zara.com/us/en/tuxedo-jacket-p0889...,TUXEDO JACKET,Straight fit blazer. Pointed lapel collar and ...,14.99,USD,jackets,MAN,Autumn,Cotton,Turkey
2,180176,End-cap,Yes,clothing,Yes,1168,Zara,https://www.zara.com/us/en/slim-fit-suit-jacke...,SLIM FIT SUIT JACKET,Slim fit jacket. Notched lapel collar. Long sl...,71.95,USD,jackets,WOMAN,Autumn,Polyester,Morocco
3,112917,Aisle,Yes,clothing,No,1348,Zara,https://www.zara.com/us/en/stretch-suit-jacket...,STRETCH SUIT JACKET,Slim fit jacket made of viscose blend fabric. ...,30.99,USD,jackets,MAN,Spring,Polyester,China
4,192936,End-cap,Yes,clothing,Yes,1602,Zara,https://www.zara.com/us/en/double-faced-jacket...,DOUBLE FACED JACKET,Jacket made of faux leather faux shearling wit...,22.99,USD,jackets,WOMAN,Winter,Wool Blend,China


In [3]:
# Remove the columns that are not carried forward into the feature matrix.
# NOTE: this rebinds `data`, so re-running the cell on the already-reduced
# frame raises KeyError; re-run the loading cell first.
data = data.drop(columns=[
    'Product ID',
    'url',
    'name',
    'description',
    'currency',
    'Product Category',
    'brand',
])

In [4]:
# Display the frame that remains after the column drop.
data

Unnamed: 0,Product Position,Promotion,Seasonal,Sales Volume,price,terms,section,season,material,origin
0,Aisle,Yes,Yes,1243,78.99,jackets,MAN,Winter,Polyester,Brazil
1,Aisle,Yes,No,1429,14.99,jackets,MAN,Autumn,Cotton,Turkey
2,End-cap,Yes,Yes,1168,71.95,jackets,WOMAN,Autumn,Polyester,Morocco
3,Aisle,Yes,No,1348,30.99,jackets,MAN,Spring,Polyester,China
4,End-cap,Yes,Yes,1602,22.99,jackets,WOMAN,Winter,Wool Blend,China
...,...,...,...,...,...,...,...,...,...,...
20247,Front of Store,Yes,No,1754,31.95,jeans,WOMAN,Summer,Linen Blend,India
20248,Aisle,No,No,872,49.99,jackets,WOMAN,Spring,Linen,China
20249,Aisle,Yes,No,1360,20.99,shoes,WOMAN,Spring,Polyester,China
20250,Aisle,No,No,892,64.95,jackets,WOMAN,Winter,Polyester,Spain


In [5]:
# Separate the regression target ('price') from the remaining feature columns.
y = data['price']
X = data.drop(columns='price')

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Partition the feature names by dtype: object columns are handled as
# categorical, every other column as numeric.
categorical_columns = list(X.select_dtypes(include=['object']).columns)
numeric_columns = list(X.select_dtypes(exclude=['object']).columns)

In [9]:
# Numeric features: median imputation followed by a Yeo-Johnson power transform.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('transformation', PowerTransformer(method='yeo-johnson')),
])

# Categorical features: most-frequent imputation, then dense one-hot encoding
# that drops the first level of each feature and ignores unseen categories.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OneHot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])


In [10]:
# Route each column group through its own pipeline; columns that belong to
# neither group are passed through untouched.
transformer = ColumnTransformer(
    transformers=[
        ('NumericPipe', numeric_pipe, numeric_columns),
        ('CategoricalPipe', categorical_pipe, categorical_columns),
    ],
    remainder='passthrough',
)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_validate

In [14]:
# Candidate regressors, all evaluated on the same train/test split.
# The tree-based estimators are seeded so that the reported scores can be
# reproduced after a kernel restart.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Elastic Net': ElasticNet(),
    'SVR': SVR(),
    'KNeighbor': KNeighborsRegressor(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'XG Boost': XGBRegressor(random_state=42)
}

for name, model in models.items():
    print(f"\nModel: {name}")

    # Bundling preprocessing with the estimator means the transformers are
    # fitted on the training split only and merely applied to the test split.
    pipe = Pipeline([
        ('preprocess', transformer),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Hold-out regression metrics.
    test_r2 = r2_score(y_test, y_pred)
    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    test_rmse = mean_squared_error(y_test, y_pred) ** 0.5
    test_mae = mean_absolute_error(y_test, y_pred)

    print(f"R2 Score : {test_r2:.4f}")
    print(f"RMSE     : {test_rmse:.4f}")
    print(f"MAE      : {test_mae:.4f}")


Model: LinearRegression
R2 Score : 0.5408
RMSE     : 16.0954
MAE      : 12.8372

Model: Ridge
R2 Score : 0.5408
RMSE     : 16.0957
MAE      : 12.8365

Model: Lasso
R2 Score : 0.2594
RMSE     : 20.4409
MAE      : 16.0170

Model: Elastic Net
R2 Score : 0.1257
RMSE     : 22.2101
MAE      : 17.4077

Model: SVR
R2 Score : 0.4453
RMSE     : 17.6914
MAE      : 13.4213

Model: KNeighbor
R2 Score : 0.1776
RMSE     : 21.5414
MAE      : 16.5124

Model: RandomForestRegressor
R2 Score : 0.5302
RMSE     : 16.2808
MAE      : 12.6742

Model: GradientBoostingRegressor
R2 Score : 0.5574
RMSE     : 15.8026
MAE      : 12.4635

Model: HistGradientBoostingRegressor
R2 Score : 0.5798
RMSE     : 15.3968
MAE      : 12.1321

Model: DecisionTreeRegressor
R2 Score : 0.1390
RMSE     : 22.0412
MAE      : 16.7726

Model: XG Boost
R2 Score : 0.5486
RMSE     : 15.9596
MAE      : 12.5039


## Testing StandardScaler instead of PowerTransformer

In [13]:
from sklearn.preprocessing import StandardScaler

# Second preprocessing variant: identical to the first one except that the
# numeric branch standardises the features instead of power-transforming them.
numeric_pipe2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('StandardScaler', StandardScaler())
])

# The encoder settings match `categorical_pipe` (handle_unknown='ignore') so the
# scaler is the only difference between the two variants, and so predicting on
# a split that contains a category unseen during fit does not raise.
categorical_pipe2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OneHot', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'))
])

# Columns belonging to neither group are passed through unchanged.
transform = ColumnTransformer([
    ('numeric_transform', numeric_pipe2, numeric_columns),
    ('categorical_tranform', categorical_pipe2, categorical_columns)
], remainder='passthrough')