In [2]:
# Importing libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Importing ML libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor 
from catboost import CatBoostRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# ----
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
# Load the training CSV into a DataFrame
train_csv_path = "data/train.csv"
df = pd.read_csv(train_csv_path)

In [4]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
# Remove the identifier columns 'Item_Identifier' and 'Outlet_Identifier'.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore' is
# passed, so re-running this cell after the columns are already gone is a
# no-op rather than a KeyError.
df = df.drop(columns=['Item_Identifier', 'Outlet_Identifier'], errors='ignore')

In [6]:
# Preview the first five rows again, now without the identifier columns
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [7]:
# (rows, columns) of the frame at this point in the notebook
df.shape

(8523, 10)

In [8]:
# Category frequencies of 'Item_Fat_Content' before the labels are normalised
fat_content_before = df['Item_Fat_Content']
fat_content_before.value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [9]:
# Collapse the alternative spellings in 'Item_Fat_Content' into the two
# canonical labels: 'LF' / 'low fat' -> 'Low Fat', 'reg' -> 'Regular'
fat_content_aliases = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)

In [10]:
# Category frequencies of 'Item_Fat_Content' after the labels were normalised
fat_content_after = df['Item_Fat_Content']
fat_content_after.value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [11]:
# Separating dependent and independent features
target_column = 'Item_Outlet_Sales'
# Dependent (output) feature
y = df[target_column]
# Independent (input) features: every column except the target
X = df.drop(columns=[target_column])

In [12]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Shapes of the training features and the training target
for train_part in (X_train, y_train):
    print(train_part.shape)

(6818, 9)
(6818,)


In [14]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, every other dtype as numerical. Column order is preserved.
numerical_columns = []
categorical_columns = []
for column_name, column_dtype in X_train.dtypes.items():
    if column_dtype == 'O':
        categorical_columns.append(column_name)
    else:
        numerical_columns.append(column_name)

print(f"We have {len(numerical_columns)} numerical features: \n{numerical_columns} ")
print(f"\nWe have {len(categorical_columns)} categorical features: \n{categorical_columns} ")

We have 4 numerical features: 
['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year'] 

We have 5 categorical features: 
['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'] 


In [23]:
# Column dtypes and non-null counts for the full frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                7060 non-null   float64
 1   Item_Fat_Content           8523 non-null   object 
 2   Item_Visibility            8523 non-null   float64
 3   Item_Type                  8523 non-null   object 
 4   Item_MRP                   8523 non-null   float64
 5   Outlet_Establishment_Year  8523 non-null   int64  
 6   Outlet_Size                6113 non-null   object 
 7   Outlet_Location_Type       8523 non-null   object 
 8   Outlet_Type                8523 non-null   object 
 9   Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 666.0+ KB


# Feature engineering without column transformer 

In [315]:
# Missing-value counts for 'Outlet_Size' and 'Item_Weight' in each split
for column in ('Outlet_Size', 'Item_Weight'):
    for split_name, split_frame in (('X_train', X_train), ('X_test', X_test)):
        print("{} {}  null count: ".format(split_name, column), split_frame[column].isnull().sum())

X_train Outlet_Size  null count:  1935
X_test Outlet_Size  null count:  475
X_train Item_Weight  null count:  1174
X_test Item_Weight  null count:  289


In [316]:
#############################   Step 1: Imputing missing values   #####################################
# SimpleImputer returns NumPy arrays. Each imputer is fitted on the training
# split only and then applied to both splits.

# 'Outlet_Size': replace missing entries with the constant label 'Missing'
imputer1 = SimpleImputer(strategy='constant', fill_value='Missing')
X_train_Outlet_Size = imputer1.fit_transform(X_train[['Outlet_Size']])
X_test_Outlet_Size  = imputer1.transform(X_test[['Outlet_Size']])

# 'Item_Weight': replace missing entries with the mean
# ('mean' is also SimpleImputer's default strategy)
imputer2 = SimpleImputer(strategy='mean')
X_train_Item_Weight = imputer2.fit_transform(X_train[['Item_Weight']])
X_test_Item_Weight  = imputer2.transform(X_test[['Item_Weight']])

In [317]:
# Confirm that no missing values remain in any of the imputed arrays
for imputed_array in (
    X_train_Outlet_Size,
    X_test_Outlet_Size,
    X_train_Item_Weight,
    X_test_Item_Weight,
):
    print(pd.DataFrame(imputed_array)[0].isnull().sum())

0
0
0
0


In [318]:
###################   Step 2: Encoding (one-hot and ordinal) of categorical features  ##############################
# Nominal columns, one-hot encoded:  ['Item_Type', 'Outlet_Location_Type', 'Outlet_Type']
# Ordinal columns, ordinal encoded:  ['Item_Fat_Content', 'Outlet_Size']
onehot_columns = ['Item_Type', 'Outlet_Location_Type', 'Outlet_Type']

# One-hot encoding.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4, so passing it fails on current releases. Leaving the
# encoder at its default (sparse) output and densifying with .toarray() yields
# the same dense array on every scikit-learn version.
# handle_unknown='ignore': categories unseen during fit encode as all zeros.
onehot = OneHotEncoder(handle_unknown='ignore')
X_train_onehot = onehot.fit_transform(X_train[onehot_columns]).toarray()
X_test_onehot  = onehot.transform(X_test[onehot_columns]).toarray()

# Ordinal encoding with an explicit category order (position in the list = encoded value)
ordinal1 = OrdinalEncoder(categories=[['Low Fat', 'Regular']]) # OrdinalEncoder for 'Item_Fat_Content'
ordinal2 = OrdinalEncoder(categories=[['Missing', 'Small', 'Medium', 'High']]) # OrdinalEncoder for 'Outlet_Size'

X_train_ordinal_Item_Fat_Content = ordinal1.fit_transform(X_train[['Item_Fat_Content']])
X_test_ordinal_Item_Fat_Content  = ordinal1.transform(X_test[['Item_Fat_Content']])

# 'Outlet_Size' is encoded from the imputed arrays, which contain the 'Missing' label
X_train_ordinal_Outlet_Size = ordinal2.fit_transform(X_train_Outlet_Size)
X_test_ordinal_Outlet_Size  = ordinal2.transform(X_test_Outlet_Size)



In [319]:
###################   Step 3: Scaling(StandardScaler or MinMaxScaler or RobustScaler) numerical features  ####################
# Standardise the numerical columns; each scaler is fitted on the training
# split and then applied to the test split.
complete_numeric_columns = ['Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']

# Columns taken directly from the raw frames
stdscl1 = StandardScaler()
X_train_scl1 = stdscl1.fit_transform(X_train[complete_numeric_columns])
X_test_scl1  = stdscl1.transform(X_test[complete_numeric_columns])

# 'Item_Weight' is scaled from its imputed arrays
stdscl2 = StandardScaler()
X_train_scl2 = stdscl2.fit_transform(X_train_Item_Weight)
X_test_scl2  = stdscl2.transform(X_test_Item_Weight)

In [320]:
# Stack the encoded and scaled pieces column-wise into one feature matrix per
# split. Column order: one-hot, ordinal fat content, ordinal outlet size,
# scaled numeric columns, scaled item weight.
X_train_array = np.hstack([
    X_train_onehot,
    X_train_ordinal_Item_Fat_Content,
    X_train_ordinal_Outlet_Size,
    X_train_scl1,
    X_train_scl2,
])
X_test_array = np.hstack([
    X_test_onehot,
    X_test_ordinal_Item_Fat_Content,
    X_test_ordinal_Outlet_Size,
    X_test_scl1,
    X_test_scl2,
])
pd.DataFrame(X_train_array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,2.0,-0.600703,0.470709,0.136169,-0.801383
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-0.362159,0.457877,0.493521,1.210152
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.194933,-0.482625,-0.102066,1.115491
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-0.704944,-1.603553,0.493521,-1.079448
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.383177,0.218375,-0.102066,-0.008602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,4.282848,-0.043511,0.017052,-0.826231
6814,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.001006,-1.059078,1.089109,0.642189
6815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-0.916931,1.526207,0.493521,1.115491
6816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-0.228187,-0.383072,1.089109,1.766282


In [321]:
# Preview the first rows of the assembled test feature matrix
X_test_frame = pd.DataFrame(X_test_array)
X_test_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,3.0,-0.773887,-0.998908,-1.29324,0.3345425
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.097978,-1.586048,-0.102066,-1.172925
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,2.0,-0.481942,-1.596652,0.136169,0.3818727
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,2.0,-0.41478,0.508453,-1.531475,4.203769e-16
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,-1.043092,0.889079,0.731756,-0.6369105


# Feature engineering with column transformer and pipeline

In [322]:
# ############################# Alternative kept for reference (disabled): two-stage ColumnTransformer with ordinal encoding #########################################
# numerical_pipeline = Pipeline([
#     ('num_imputer', SimpleImputer(strategy='mean')),
#     ('num_scaler',  StandardScaler()), 
# ])

# categorical_pipeline1 = Pipeline([
#     ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Missing'))
# ])

# ct1 = ColumnTransformer([
#     ('numerical_pipeline', numerical_pipeline, numerical_columns),
#     ('categorical_pipeline1', categorical_pipeline1, categorical_columns)
# ], remainder='passthrough')

# X_train_ct1 = ct1.fit_transform(X_train)
# X_test_ct1  = ct1.transform(X_test)

# ct2 = ColumnTransformer([
#     ('cat_onehot',  OneHotEncoder(sparse=False, handle_unknown='ignore'), [5,7,8]),
#     ('cat_ordinal', OrdinalEncoder(categories=[['Low Fat', 'Regular'], ['Missing', 'Small', 'Medium', 'High']]), [4,6])
# ], remainder='passthrough')

# X_train_ready = ct2.fit_transform(X_train_ct1)
# X_test_ready  = ct2.transform(X_test_ct1)
# pd.DataFrame(X_train_ready).head()

# # At this stage our feature engineering is complete for X_train and X_test

In [323]:
############################# Without ordinal encoding #########################################
# Numerical columns: mean-impute, then standardise
numerical_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler',  StandardScaler()),
])

# Categorical columns: fill missing values with 'Missing', then one-hot encode.
# The encoder is left at its default (sparse) output because the `sparse=False`
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
# dense output is requested on the ColumnTransformer below instead.
categorical_pipeline1 = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('cat_onehot',  OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0 makes the ColumnTransformer always return a dense array,
# whatever the individual transformers produce.
ct = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_columns),
    ('categorical_pipeline1', categorical_pipeline1, categorical_columns)
], remainder='passthrough', sparse_threshold=0)

# Fit on the training split only, then apply the fitted transformer to the test split
X_train_ready = ct.fit_transform(X_train)
X_test_ready  = ct.transform(X_test)
pd.DataFrame(X_train_ready).head()

# At this stage our feature engineering is complete for X_train and X_test



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,-0.801383,-0.600703,0.470709,0.136169,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.210152,-0.362159,0.457877,0.493521,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.115491,0.194933,-0.482625,-0.102066,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.079448,-0.704944,-1.603553,0.493521,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,-0.008602,1.383177,0.218375,-0.102066,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [324]:
# Helper that scores predictions with MAE, RMSE and R2
def evalute_model(y_test, y_pred):
    """Score predictions against the true target values.

    Parameters
    ----------
    y_test : array-like
        True target values (the notebook passes both train and test targets).
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and R2 score.
    """
    root_mean_squared_error = np.sqrt(mean_squared_error(y_test, y_pred))
    return (
        mean_absolute_error(y_test, y_pred),
        root_mean_squared_error,
        r2_score(y_test, y_pred),
    )

In [325]:
# Candidate regressors, keyed by the display name used in the reports below.
# Estimators with a stochastic component are given random_state=42 (the same
# seed as the train/test split) so that repeated runs produce the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

In [327]:
# Fit every candidate model on the engineered training features and report
# RMSE / MAE / R2 for both splits. The test-set R2 of each model is collected
# (in `models` order) so the models can be ranked afterwards.
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding the key
# list and indexing into it on every pass.
for model_name, model in models.items():
    print(model_name)

    model.fit(X_train_ready, y_train)

    # Score on the data the model was fitted on ...
    y_train_pred = model.predict(X_train_ready)
    train_mae, train_rmse, train_r2_score = evalute_model(y_train, y_train_pred)

    # ... and on the held-out split
    y_test_pred = model.predict(X_test_ready)
    test_mae, test_rmse, test_r2_score = evalute_model(y_test, y_test_pred)

    print('Model performance for train set')
    print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(train_mae))
    print("- R2 Score: {:.4f}".format(train_r2_score))
    print("###############################################")
    print('Model performance for test set')
    print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(test_mae))
    print("- R2 Score: {:.4f}".format(test_r2_score))
    print("---------------------------- \n")
    model_list.append(model_name)
    r2_list.append(test_r2_score)

Linear Regression
Model performance for train set
- Root Mean Squared Error: 1141.5314
- Mean Absolute Error: 847.2194
- R2 Score: 0.5595
###############################################
Model performance for test set
- Root Mean Squared Error: 1069.3680
- Mean Absolute Error: 792.0285
- R2 Score: 0.5793
---------------------------- 

Lasso
Model performance for train set
- Root Mean Squared Error: 1142.1169
- Mean Absolute Error: 846.9075
- R2 Score: 0.5590
###############################################
Model performance for test set
- Root Mean Squared Error: 1069.8316
- Mean Absolute Error: 792.1519
- R2 Score: 0.5789
---------------------------- 

Ridge
Model performance for train set
- Root Mean Squared Error: 1141.5452
- Mean Absolute Error: 847.1362
- R2 Score: 0.5595
###############################################
Model performance for test set
- Root Mean Squared Error: 1069.5970
- Mean Absolute Error: 792.1769
- R2 Score: 0.5791
---------------------------- 

K-Neighbors Regr

In [334]:
# Rank the models by their test-set R2 score, best first
r2_ranking = pd.DataFrame({'Model Name': model_list, 'R2 score': r2_list})
r2_ranking.sort_values(by='R2 score', ascending=False)

Unnamed: 0,Model Name,R2 score
7,CatBoosting Regressor,0.588775
0,Linear Regression,0.579264
2,Ridge,0.579084
1,Lasso,0.578899
5,Random Forest Regressor,0.557831
6,XGBRegressor,0.543566
8,AdaBoost Regressor,0.506658
3,K-Neighbors Regressor,0.505828
4,Decision Tree,0.161211


In [None]:
# Plot predicted vs. actual sales on the test set for the model with the best
# test R2. The original call, plt.scatter(y_test[0]), passed a single value
# and omitted the required `y` argument; y_test[0] is also a label lookup on
# a shuffled index, so label 0 may not be present in the test split.
best_model_name = model_list[int(np.argmax(r2_list))]
y_test_best_pred = models[best_model_name].predict(X_test_ready)

fig, ax = plt.subplots()
ax.scatter(y_test, y_test_best_pred, alpha=0.5)

# Dashed diagonal marks perfect predictions (predicted == actual)
axis_min = min(y_test.min(), y_test_best_pred.min())
axis_max = max(y_test.max(), y_test_best_pred.max())
ax.plot([axis_min, axis_max], [axis_min, axis_max], linestyle='--', color='red')

ax.set(
    xlabel='Actual Item_Outlet_Sales',
    ylabel='Predicted Item_Outlet_Sales',
    title='Actual vs. predicted sales ({})'.format(best_model_name),
)
plt.show()