This code includes data split into train-test and building data transformation pipeline

### Splitting the data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, PowerTransformer, OrdinalEncoder

In [3]:
data = pd.read_csv('../data/Train.csv', header=0)

# creating normal distributed Item_MRP to stratify the dataset
data['Item_MRP_cat'] = pd.cut(data['Item_MRP'], bins=[0., 50, 100, 175, 225, np.inf], labels=[1, 2, 3, 4, 5])

# splitting the dataset using Item_MRP_cat for stratification
strat_train_set, strat_test_set = train_test_split(data, test_size=0.2, stratify=data['Item_MRP_cat'], random_state=42)

for set_ in (strat_test_set, strat_train_set):
    set_.drop("Item_MRP_cat", axis=1, inplace=True)

#### Transformation Pipelines

In [4]:
sales = strat_train_set.drop('Item_Outlet_Sales', axis=1).copy()
sales_label = strat_train_set['Item_Outlet_Sales'].copy()

In [5]:
def item_fat_unified(X):
    return X.replace({"LF": 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'})

def make_lower(X):
    return X.str.lower()

def item_fat_unified_name(function_transformer, feature_names_in):
    return ["unified"]


def make_lower_df(X: pd.DataFrame):
    return X.apply(make_lower)


def reshape(X):
    return X.values.reshape(-1,1)

def lower_name(function_transformer, feature_names_in):
    return ["lower"]

In [6]:
# Pipeline for fat content
fat_content_pipeline = make_pipeline(
        FunctionTransformer(item_fat_unified, feature_names_out=item_fat_unified_name),
        FunctionTransformer(make_lower, feature_names_out="one-to-one"),
        FunctionTransformer(reshape, feature_names_out="one-to-one"),
        SimpleImputer(strategy="most_frequent"),
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2)        
    )
    

# Pipeline for Item_type which is similar to other categorical values but has OneHotEncoder instead of OrdinalEncoder
item_type_pipeline = make_pipeline(
    FunctionTransformer(make_lower, feature_names_out="one-to-one"),
    FunctionTransformer(reshape, feature_names_out="one-to-one"),
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)

# Pipeline for categorical values
cat_pipeline = make_pipeline(
    FunctionTransformer(make_lower_df, feature_names_out="one-to-one"),
    # FunctionTransformer(lambda x: x.values.reshape(-1, 1), feature_names_out="one-to-one"),
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="error")   
)

# Numeric pipeline for numeric values - starting with PowerTransformer
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    PowerTransformer(method="yeo-johnson", standardize=True)
)

# ---------------------------------------------------------------------
# Preprocessing - assembling all pipelines

num_attrs = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
cat_attrs = ['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attrs),
    ("cat", cat_pipeline, cat_attrs),
    ("fat_content", fat_content_pipeline, "Item_Fat_Content"),
    ("type", item_type_pipeline, "Item_Type")
],  verbose=True)



In [7]:
set_config(display='diagram')
preprocessing

In [8]:
sales_prepared = preprocessing.fit_transform(sales)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s


In [9]:
preprocessing.get_feature_names_out()

array(['num__Item_Weight', 'num__Item_Visibility', 'num__Item_MRP',
       'cat__Outlet_Size', 'cat__Outlet_Location_Type',
       'cat__Outlet_Type', 'fat_content__unified',
       'type__Item_Type_baking goods', 'type__Item_Type_breads',
       'type__Item_Type_breakfast', 'type__Item_Type_canned',
       'type__Item_Type_dairy', 'type__Item_Type_frozen foods',
       'type__Item_Type_fruits and vegetables',
       'type__Item_Type_hard drinks',
       'type__Item_Type_health and hygiene', 'type__Item_Type_household',
       'type__Item_Type_meat', 'type__Item_Type_others',
       'type__Item_Type_seafood', 'type__Item_Type_snack foods',
       'type__Item_Type_soft drinks', 'type__Item_Type_starchy foods'],
      dtype=object)

In [10]:
sales_prepared

array([[ 0.35687262, -1.74368555,  1.67496297, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.24286038, -0.64872073, -0.62221959, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.45720225, -0.55523282,  0.06034817, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.00502956,  0.80917873, -0.30509096, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.21795348, -0.92809678, -0.68819335, ...,  0.        ,
         0.        ,  0.        ],
       [-0.71017883,  1.07349505,  0.33907172, ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
pd.DataFrame(sales_prepared, columns=preprocessing.get_feature_names_out())

Unnamed: 0,num__Item_Weight,num__Item_Visibility,num__Item_MRP,cat__Outlet_Size,cat__Outlet_Location_Type,cat__Outlet_Type,fat_content__unified,type__Item_Type_baking goods,type__Item_Type_breads,type__Item_Type_breakfast,...,type__Item_Type_fruits and vegetables,type__Item_Type_hard drinks,type__Item_Type_health and hygiene,type__Item_Type_household,type__Item_Type_meat,type__Item_Type_others,type__Item_Type_seafood,type__Item_Type_snack foods,type__Item_Type_soft drinks,type__Item_Type_starchy foods
0,0.356873,-1.743686,1.674963,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.242860,-0.648721,-0.622220,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.457202,-0.555233,0.060348,2.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.934606,-0.352522,0.447537,1.0,2.0,2.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.362545,0.863803,0.860035,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,-0.288325,-0.958216,0.200472,2.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6814,0.811950,0.950068,-0.575322,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6815,0.005030,0.809179,-0.305091,1.0,2.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6816,1.217953,-0.928097,-0.688193,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


... and now we can proceed with ML models

### ML bit

the models that Elza tried: 

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

a good read about linear models: https://scikit-learn.org/stable/modules/linear_model.html

In [12]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor

Decision tree

In [13]:
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(sales, sales_label)

tree_rms = -cross_val_score(tree_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(tree_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1517.726189
std        56.169530
min      1426.546717
25%      1484.857167
50%      1514.515757
75%      1559.699910
max      1610.456174
dtype: float64

Random forest

In [14]:
rand_forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))
rand_forest_reg.fit(sales, sales_label)
forest_rms = -cross_val_score(rand_forest_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(forest_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1142.078692
std        39.938682
min      1082.305662
25%      1119.175296
50%      1143.725370
75%      1163.281474
max      1220.931516
dtype: float64

the models that Elza tried: 

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

a good read about linear models: https://scikit-learn.org/stable/modules/linear_model.html

LinearRegression

In [15]:
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(sales, sales_label)
lin_reg_rms = -cross_val_score(lin_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(lin_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1203.350700
std        34.906000
min      1149.164665
25%      1180.563255
50%      1203.039649
75%      1221.826129
max      1272.338989
dtype: float64

Ridge

In [16]:
ridge_reg = make_pipeline(preprocessing, Ridge(alpha=.5))
ridge_reg.fit(sales, sales_label)
ridge_reg_rms = -cross_val_score(ridge_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

In [17]:
pd.Series(ridge_reg_rms).describe()

count      10.000000
mean     1203.339912
std        34.889650
min      1149.187178
25%      1180.553079
50%      1203.043140
75%      1221.810386
max      1272.286941
dtype: float64

Lasso

In [18]:
lasso_reg = make_pipeline(preprocessing, Lasso(alpha=.5))
lasso_reg.fit(sales, sales_label)
lasso_reg_rms = -cross_val_score(lasso_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

In [19]:
pd.Series(lasso_reg_rms).describe()

count      10.000000
mean     1202.771961
std        34.434503
min      1149.383814
25%      1180.312273
50%      1202.722317
75%      1220.905102
max      1270.531965
dtype: float64

In [20]:
knn_reg = make_pipeline(preprocessing, KNeighborsRegressor(n_neighbors=5))
knn_reg.fit(sales, sales_label)
knn_reg_rms = -cross_val_score(knn_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

In [21]:
pd.Series(knn_reg_rms).describe()

count      10.000000
mean     1214.430983
std        38.252010
min      1148.651356
25%      1197.833222
50%      1211.753523
75%      1224.360272
max      1283.925251
dtype: float64

AdaBoostRegressor

In [22]:
adaboost_reg = make_pipeline(preprocessing, AdaBoostRegressor(random_state=42, n_estimators=100))
adaboost_reg.fit(sales, sales_label)
adaboost_reg_rms = -cross_val_score(adaboost_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

In [23]:
pd.Series(adaboost_reg_rms).describe()

count      10.000000
mean     1293.147273
std        71.483386
min      1199.572487
25%      1232.108073
50%      1303.096123
75%      1332.035659
max      1438.538379
dtype: float64

Other popular regressors

In [24]:
# Gradient Boosting Regression
from sklearn.ensemble import GradientBoostingRegressor
# Elastic Net Regression
from sklearn.linear_model import ElasticNet
# Support Vector Machine
from sklearn.svm import SVR
# Bayesian Ridge Regression
from sklearn.linear_model import BayesianRidge
# Kernel Ridge Regression
from sklearn.kernel_ridge import KernelRidge

In [25]:
gradboost_reg = make_pipeline(preprocessing, GradientBoostingRegressor(random_state=42))
gradboost_reg.fit(sales, sales_label)
gradboost_reg_rms = -cross_val_score(gradboost_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(gradboost_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1080.064614
std        37.895981
min      1026.860161
25%      1069.239320
50%      1077.076758
75%      1087.910014
max      1162.394483
dtype: float64

In [26]:
elastic_reg = make_pipeline(preprocessing, ElasticNet(random_state=42))
elastic_reg.fit(sales, sales_label)
elastic_reg_rms = -cross_val_score(elastic_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(elastic_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1297.784645
std        29.557505
min      1251.545134
25%      1284.034826
50%      1298.038233
75%      1316.556649
max      1347.963299
dtype: float64

In [27]:
svr_lin_reg = make_pipeline(preprocessing, SVR(kernel='linear'))
svr_lin_reg.fit(sales, sales_label)
svr_lin_reg_rms = -cross_val_score(svr_lin_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(svr_lin_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.3s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1276.884658
std        32.293120
min      1234.580842
25%      1253.212102
50%      1270.673991
75%      1295.227947
max      1338.736910
dtype: float64

In [28]:
svr_reg = make_pipeline(preprocessing, SVR(C=2.0, epsilon=0.1))
svr_reg.fit(sales, sales_label)
svr_reg_rms = -cross_val_score(svr_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(svr_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.2s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1546.313651
std        53.727312
min      1449.164712
25%      1510.029137
50%      1547.246196
75%      1587.824364
max      1629.156083
dtype: float64

In [29]:
svr_poly_reg = make_pipeline(preprocessing, SVR(kernel='poly'))
svr_poly_reg.fit(sales, sales_label)
svr_poly_reg_rms = -cross_val_score(svr_poly_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(svr_poly_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1512.569816
std        38.541237
min      1433.262273
25%      1490.776070
50%      1511.592704
75%      1545.247882
max      1556.296178
dtype: float64

Bayesian and Kernel Ridge Regression

In [30]:
bayes_ridge_reg = make_pipeline(preprocessing, BayesianRidge())
bayes_ridge_reg.fit(sales, sales_label)
bayes_ridge_reg_rms = -cross_val_score(bayes_ridge_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(bayes_ridge_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.2s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1203.127978
std        34.532172
min      1149.752499
25%      1180.312305
50%      1203.137575
75%      1221.452632
max      1271.238099
dtype: float64

In [31]:
kernel_ridge_reg = make_pipeline(preprocessing, KernelRidge())
kernel_ridge_reg.fit(sales, sales_label)
kernel_ridge_reg_rms = -cross_val_score(kernel_ridge_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(kernel_ridge_reg_rms).describe()

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.4s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.2s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.1s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.2s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.1s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.2s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

count      10.000000
mean     1203.494852
std        34.298082
min      1149.313824
25%      1181.723617
50%      1203.056190
75%      1221.985308
max      1271.057645
dtype: float64

Grid Search - Random

My 3 chosen models are GradientBoostingRegressor, RandomForest & Lasso

In [32]:
rforest_pipe = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

grboost_pipe = Pipeline([
    ("preprocessing", preprocessing),
    ("grad_boost", GradientBoostingRegressor(random_state=42)),
])

lasso_pipe = Pipeline([
    ("preprocessing", preprocessing),
    ("lasso", Lasso(alpha=.5)),
])

In [33]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


for_param_distribs = {'random_forest__max_features': randint(low=2, high=20)}
                    #   'random_forest__n_estimators': randint(low=100, high=300)}

gboost_param_distribs = {'grad_boost__n_estimators': randint(low=100, high=200),
                         'grad_boost__max_depth': randint(low=3, high=10),
                         'grad_boost__max_features': randint(low=2, high=20)}

lasso_param_distribs = {'lasso__alpha': [0.1, 0.3, 0.5, 0.7, 0.9]}


forest_search = RandomizedSearchCV(
rforest_pipe, param_distributions=for_param_distribs, n_iter=10, cv=3,
scoring='neg_root_mean_squared_error', random_state=42)
forest_search.fit(sales, sales_label)

gboost_search = RandomizedSearchCV(
grboost_pipe, param_distributions=gboost_param_distribs, n_iter=10, cv=3,
scoring='neg_root_mean_squared_error', random_state=42)
gboost_search.fit(sales, sales_label)


lasso_search = RandomizedSearchCV(
lasso_pipe, param_distributions=lasso_param_distribs, n_iter=10, cv=3,
scoring='neg_root_mean_squared_error', random_state=42)
lasso_search.fit(sales, sales_label)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum



[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.1s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

In [34]:
forest_search.best_params_

{'random_forest__max_features': 8}

In [35]:
gboost_search.best_params_

{'grad_boost__max_depth': 3,
 'grad_boost__max_features': 11,
 'grad_boost__n_estimators': 191}

In [36]:
lasso_search.best_params_

{'lasso__alpha': 0.9}

In [37]:
sales_label.describe()  # trying to see the average outlet sales

count     6818.000000
mean      2188.773263
std       1703.980443
min         33.290000
25%        844.900200
50%       1805.649600
75%       3103.293800
max      13086.964800
Name: Item_Outlet_Sales, dtype: float64

In [38]:
best_forest = forest_search.best_estimator_
best_gboost = gboost_search.best_estimator_
best_lasso = lasso_search.best_estimator_

Importances of the columns

In [39]:
sorted(zip(best_forest["random_forest"].feature_importances_,
best_forest["preprocessing"].get_feature_names_out()),
reverse=True)

[(0.4480306780049909, 'num__Item_MRP'),
 (0.21363380618961508, 'cat__Outlet_Type'),
 (0.11653861533696779, 'num__Item_Visibility'),
 (0.09085129273830339, 'num__Item_Weight'),
 (0.0216688629182189, 'cat__Outlet_Location_Type'),
 (0.018057774444530588, 'cat__Outlet_Size'),
 (0.011819209837098882, 'fat_content__unified'),
 (0.008410790343111743, 'type__Item_Type_snack foods'),
 (0.008267176132046448, 'type__Item_Type_fruits and vegetables'),
 (0.006954951705866209, 'type__Item_Type_household'),
 (0.006514849150545629, 'type__Item_Type_frozen foods'),
 (0.006452432747198064, 'type__Item_Type_dairy'),
 (0.005939216298875897, 'type__Item_Type_soft drinks'),
 (0.005575523680311986, 'type__Item_Type_baking goods'),
 (0.005277947358687956, 'type__Item_Type_canned'),
 (0.0048988760643632855, 'type__Item_Type_health and hygiene'),
 (0.0045891617814611835, 'type__Item_Type_meat'),
 (0.0036241095397367253, 'type__Item_Type_starchy foods'),
 (0.0033120169740651893, 'type__Item_Type_breads'),
 (0.00

In [40]:
X_test = strat_train_set.drop('Item_Outlet_Sales', axis=1).copy()
y_test = strat_train_set['Item_Outlet_Sales'].copy()


final_predictions = best_forest.predict(X_test)
final_rmse = mean_squared_error(y_test, final_predictions, squared=False)
print(final_rmse) 

final_predictions = best_gboost.predict(X_test)
final_rmse = mean_squared_error(y_test, final_predictions, squared=False)
print(final_rmse) 

final_predictions = best_lasso.predict(X_test)
final_rmse = mean_squared_error(y_test, final_predictions, squared=False)
print(final_rmse) 



421.1731833583672
1002.7883741943212
1198.762094854335




Random forest indeed is the best model here lol

In [41]:
import joblib
joblib.dump(best_forest, "model1_rforest.pkl")
joblib.dump(best_gboost, "model2_gboost.pkl")
joblib.dump(best_lasso, "model3_lasso.pkl")

['model3_lasso.pkl']