This notebook splits the data into train and test sets and builds the data transformation pipeline.

### Splitting the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, PowerTransformer, OrdinalEncoder

In [2]:
# Load the raw training data; the first CSV row holds the column names.
data = pd.read_csv('../data/Train.csv', header=0)

# Bin the continuous Item_MRP into five price bands (labelled 1-5) so that the
# split below can be stratified on them.
data['Item_MRP_cat'] = pd.cut(data['Item_MRP'], bins=[0., 50, 100, 175, 225, np.inf], labels=[1, 2, 3, 4, 5])

# 80/20 split stratified on the price band, with a fixed seed for reproducibility.
strat_train_set, strat_test_set = train_test_split(data, test_size=0.2, stratify=data['Item_MRP_cat'], random_state=42)

# The band column was only needed for stratification. Drop it by reassignment
# instead of mutating the frames in place through a loop variable, so each name
# is bound to a freshly built frame.
strat_train_set = strat_train_set.drop(columns="Item_MRP_cat")
strat_test_set = strat_test_set.drop(columns="Item_MRP_cat")

#### Transformation Pipelines

In [3]:
# Separate the target column from the predictors of the training set.
sales_label = strat_train_set['Item_Outlet_Sales'].copy()
sales = strat_train_set.drop(columns='Item_Outlet_Sales').copy()

In [4]:
def item_fat_unified(X):
    """Map the alternative fat-content spellings onto two canonical labels.

    "LF" and "low fat" become "Low Fat", "reg" becomes "Regular"; any other
    value is passed through unchanged. ``X`` is any pandas object that
    supports ``.replace`` with a mapping.
    """
    canonical_labels = {"LF": 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'}
    return X.replace(canonical_labels)


def item_fat_unified_name(function_transformer, feature_names_in):
    """Feature-name callback that always reports the single name "unified".

    Both arguments are accepted for signature compatibility and are ignored.
    """
    output_names = ["unified"]
    return output_names


def make_lower_df(X: pd.DataFrame):
    """Return a new frame in which every column of ``X`` is lower-cased.

    Each column is processed through its ``.str`` accessor, so all columns
    are expected to hold strings.
    """
    def _lower_column(column):
        return column.str.lower()

    return X.apply(_lower_column)


def make_lower(X):
    """Lower-case a string Series through its ``.str`` accessor."""
    lowered = X.str.lower()
    return lowered


def lower_name(function_transformer, feature_names_in):
    """Feature-name callback that always reports the single name "lower".

    Both arguments are accepted for signature compatibility and are ignored.
    """
    output_names = ["lower"]
    return output_names

In [41]:
# Pipeline for fat content
def to_2d_column(x):
    """Reshape a single selected column into an (n_samples, 1) array.

    ``x`` must expose ``.values`` (e.g. a pandas Series). A named function is
    used instead of an inline lambda because a FunctionTransformer that wraps
    a lambda cannot be pickled, and the same reshape is needed by two pipelines.
    """
    return x.values.reshape(-1, 1)


# Pipeline for fat content:
# unify spellings -> lower-case -> reshape to 2-D -> impute -> ordinal codes.
fat_content_pipeline = make_pipeline(
    FunctionTransformer(item_fat_unified, feature_names_out=item_fat_unified_name),
    FunctionTransformer(make_lower, feature_names_out="one-to-one"),
    FunctionTransformer(to_2d_column, feature_names_out="one-to-one"),
    SimpleImputer(strategy="most_frequent"),
    # Categories not seen during fit are encoded as 2 instead of raising.
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2),
)

# Pipeline for Item_Type: same steps as the other categorical values, but it
# ends in a OneHotEncoder instead of an OrdinalEncoder.
item_type_pipeline = make_pipeline(
    FunctionTransformer(make_lower, feature_names_out="one-to-one"),
    FunctionTransformer(to_2d_column, feature_names_out="one-to-one"),
    SimpleImputer(strategy="most_frequent"),
    # Categories not seen during fit produce an all-zero row instead of raising.
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

# Pipeline for the remaining categorical columns. These are selected as a list
# below, so the pipeline works on a DataFrame and needs no reshape step.
cat_pipeline = make_pipeline(
    FunctionTransformer(make_lower_df, feature_names_out="one-to-one"),
    SimpleImputer(strategy="most_frequent"),
    # Unlike the two pipelines above, an unseen category here raises an error.
    OrdinalEncoder(handle_unknown="error"),
)

# Numeric pipeline: median imputation followed by a standardized Yeo-Johnson transform.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    PowerTransformer(method="yeo-johnson", standardize=True),
)

# ---------------------------------------------------------------------
# Preprocessing - assembling all pipelines

num_attrs = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
cat_attrs = ['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# "Item_Fat_Content" and "Item_Type" are passed as scalar names (not lists), so
# their pipelines receive a 1-D column; that is why they lower-case through
# `.str` and then reshape with `to_2d_column`.
# verbose=True prints one timing line per transformer on every fit.
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attrs),
    ("cat", cat_pipeline, cat_attrs),
    ("fat_content", fat_content_pipeline, "Item_Fat_Content"),
    ("type", item_type_pipeline, "Item_Type"),
], verbose=True)



In [42]:
# Switch estimator display to diagram mode, then show the assembled ColumnTransformer.
set_config(display='diagram')
preprocessing

In [36]:
# Fit all sub-pipelines on the training predictors and transform them in the same call.
sales_prepared = preprocessing.fit_transform(sales)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s


In [38]:
# List the output column names; each is prefixed with the name of the transformer that produced it.
preprocessing.get_feature_names_out()

array(['num__Item_Weight', 'num__Item_Visibility', 'num__Item_MRP',
       'cat__Outlet_Size', 'cat__Outlet_Location_Type',
       'cat__Outlet_Type', 'fat_content__unified',
       'type__Item_Type_baking goods', 'type__Item_Type_breads',
       'type__Item_Type_breakfast', 'type__Item_Type_canned',
       'type__Item_Type_dairy', 'type__Item_Type_frozen foods',
       'type__Item_Type_fruits and vegetables',
       'type__Item_Type_hard drinks',
       'type__Item_Type_health and hygiene', 'type__Item_Type_household',
       'type__Item_Type_meat', 'type__Item_Type_others',
       'type__Item_Type_seafood', 'type__Item_Type_snack foods',
       'type__Item_Type_soft drinks', 'type__Item_Type_starchy foods'],
      dtype=object)

In [39]:
# Inspect the transformed training matrix.
sales_prepared

array([[ 0.35687262, -1.74368555,  1.67496297, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.24286038, -0.64872073, -0.62221959, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.45720225, -0.55523282,  0.06034817, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.00502956,  0.80917873, -0.30509096, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.21795348, -0.92809678, -0.68819335, ...,  0.        ,
         0.        ,  0.        ],
       [-0.71017883,  1.07349505,  0.33907172, ...,  0.        ,
         0.        ,  0.        ]])

In [40]:
# View the transformed matrix with its feature names as column labels.
# NOTE: no index is passed, so the frame gets a fresh 0..n-1 index rather than the index of `sales`.
pd.DataFrame(sales_prepared, columns=preprocessing.get_feature_names_out())

Unnamed: 0,num__Item_Weight,num__Item_Visibility,num__Item_MRP,cat__Outlet_Size,cat__Outlet_Location_Type,cat__Outlet_Type,fat_content__unified,type__Item_Type_baking goods,type__Item_Type_breads,type__Item_Type_breakfast,...,type__Item_Type_fruits and vegetables,type__Item_Type_hard drinks,type__Item_Type_health and hygiene,type__Item_Type_household,type__Item_Type_meat,type__Item_Type_others,type__Item_Type_seafood,type__Item_Type_snack foods,type__Item_Type_soft drinks,type__Item_Type_starchy foods
0,0.356873,-1.743686,1.674963,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.242860,-0.648721,-0.622220,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.457202,-0.555233,0.060348,2.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.934606,-0.352522,0.447537,1.0,2.0,2.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.362545,0.863803,0.860035,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,-0.288325,-0.958216,0.200472,2.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6814,0.811950,0.950068,-0.575322,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6815,0.005030,0.809179,-0.305091,1.0,2.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6816,1.217953,-0.928097,-0.688193,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


... and now we can proceed with ML models

### ML bit

the models that Elza tried: 

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

a good read about linear models: https://scikit-learn.org/stable/modules/linear_model.html

In [43]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import svm

Decision tree

In [50]:
# Bundle preprocessing and the decision tree into a single estimator,
# then fit it on the full training set (seeded for reproducibility).
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(sales, sales_label)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s


In [51]:
# 10-fold cross-validated RMSE of the tree pipeline; the scorer returns the
# negated RMSE, hence the leading minus sign.
tree_rms = -cross_val_score(tree_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

In [52]:
# Summary statistics of the per-fold RMSE values for the decision tree.
pd.Series(tree_rms).describe()

count      10.000000
mean     1517.726189
std        56.169530
min      1426.546717
25%      1484.857167
50%      1514.515757
75%      1559.699910
max      1610.456174
dtype: float64

Random forest

In [55]:
# Same setup as the decision tree, with a seeded random forest as the final estimator.
rand_forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))
rand_forest_reg.fit(sales, sales_label)
# 10-fold cross-validated RMSE; the scorer returns the negated RMSE, hence the leading minus sign.
forest_rms = -cross_val_score(rand_forest_reg, sales, sales_label, scoring="neg_root_mean_squared_error", cv=10)

[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[ColumnTransformer] ... (3 of 4) Processing fat_content, total=   0.0s
[ColumnTransformer] .......... (4 of 4) Processing type, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 4) Processing cat, total=   0.0s
[Colum

In [56]:
# Summary statistics of the per-fold RMSE values for the random forest.
pd.Series(forest_rms).describe()

count      10.000000
mean     1142.078692
std        39.938682
min      1082.305662
25%      1119.175296
50%      1143.725370
75%      1163.281474
max      1220.931516
dtype: float64