In [None]:
# Importing important libraries
import os
import pickle
import warnings
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import seaborn as sns
from sklearn import metrics
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings('ignore')

: 

In [None]:
# Creating connection to MySQL Database
# Connection settings come from environment variables so that no credentials are
# stored in the notebook. Host, user and database fall back to the values that
# were previously hardcoded; the password has no fallback value.
db_password = os.environ.get('BIGMART_DB_PASSWORD')
if db_password is None:
    # No password exported: ask for it interactively instead of hardcoding one.
    db_password = getpass('MySQL password: ')

connection = pymysql.connect(
    host = os.environ.get('BIGMART_DB_HOST', 'localhost'),
    user = os.environ.get('BIGMART_DB_USER', 'root'),
    password = db_password,
    database = os.environ.get('BIGMART_DB_NAME', 'BigMart')
)

In [None]:
# Load the full item_info table through the open MySQL connection
item_query = "SELECT * FROM item_info"
df_item = pd.read_sql(item_query, connection)
df_item.head()

Unnamed: 0,ID,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP
0,1,FDA15,9.3,Low Fat,0.016047,Dairy,249.809
1,2,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692
2,3,FDN15,17.5,Low Fat,0.01676,Meat,141.618
3,4,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095
4,5,NCD19,8.93,Low Fat,0.0,Household,53.8614


In [None]:
# Load the full outlet_info table through the open MySQL connection
outlet_query = "SELECT * FROM outlet_info"
df_outlet = pd.read_sql(outlet_query, connection)
df_outlet.head()

Unnamed: 0,ID,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,1,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,2,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,3,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,4,OUT010,1998,Medium,Tier 3,Grocery Store
4,5,OUT013,1987,High,Tier 3,Supermarket Type1


In [None]:
# Getting the data
# The connection is closed right after this read; the try/finally makes sure
# it is closed even when the read raises, so the connection is never leaked.
try:
    df_sales = pd.read_sql("SELECT * FROM sales_info", connection)
finally:
    connection.close()
df_sales.head()

Unnamed: 0,ID,Item_Outlet_Sales
0,1,3735.14
1,2,443.423
2,3,2097.27
3,4,732.38
4,5,994.705


In [None]:
# Combine the item, outlet and sales tables into one dataframe.
# The three frames are joined on their shared 'ID' column, which is
# dropped afterwards because it is only needed as the join key.
merged = df_item.merge(df_outlet, on = 'ID')
merged = merged.merge(df_sales, on = 'ID')
df = merged.drop(columns = 'ID')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.809,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.14
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.423
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.705


In [None]:
# Count the rows that are exact duplicates of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [None]:
# 0 means that there are no duplicates in the data.

In [None]:
# Report the dimensions of the merged dataframe
n_rows, n_cols = df.shape
print('Shape :', df.shape)
print('Number of rows :', n_rows)
print('Number of columns :', n_cols)

Shape : (8523, 12)
Number of rows : 8523
Number of columns : 12


In [None]:
# Check whether any column contains NULL values:
# count the NULLs per column, then test whether any of the counts is non-zero.
null_counts = df.isnull().sum()
null_counts.any()

False

In [None]:
# False means that there are no NULL values in the dataset.

In [None]:
# Feature Engineering: replace the establishment year with the outlet's age.
# Fixed reference year, so the derived ages do not change between runs.
REFERENCE_YEAR = 2025

# The source column is dropped below, so the cell is guarded: re-running it
# after the first execution is a no-op instead of raising a KeyError.
if 'Outlet_Establishment_Year' in df.columns:
    df['Outlet_Age'] = REFERENCE_YEAR - df['Outlet_Establishment_Year']
    df = df.drop('Outlet_Establishment_Year', axis = 1)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.809,OUT049,Medium,Tier 1,Supermarket Type1,3735.14,26
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,443.423,16
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,Medium,Tier 1,Supermarket Type1,2097.27,26
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,Medium,Tier 3,Grocery Store,732.38,27
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,High,Tier 3,Supermarket Type1,994.705,38


In [None]:
# Feature Engineering: unify the spelling variants of the fat-content labels
fat_content_aliases = {'low fat' : 'Low Fat', 'LF' : 'Low Fat', 'reg' : 'Regular'}
fat_content = df['Item_Fat_Content'].replace(fat_content_aliases)
df['Item_Fat_Content'] = fat_content

# Share of each label after the clean-up
df['Item_Fat_Content'].value_counts(normalize = True)

Item_Fat_Content
Low Fat    0.647307
Regular    0.352693
Name: proportion, dtype: float64

In [None]:
# Show the largest and smallest value of 'Item_Visibility'
visibility = df['Item_Visibility']
for label, value in (('Maximum Value :', visibility.max()),
                     ('Minimum Value :', visibility.min())):
    print(label, value)

Maximum Value : 0.328391
Minimum Value : 0.0


In [None]:
# Cap 'Item_Visibility' from above: values greater than the cap are set to the cap
visibility_cap = 0.3
raw_visibility = df['Item_Visibility']
exceeds_cap = raw_visibility > visibility_cap
df['Item_Visibility'] = np.where(exceeds_cap, visibility_cap, raw_visibility)

# Re-check the range on the capped column
capped_visibility = df['Item_Visibility']
print('Maximum Value :', capped_visibility.max())
print('Minimum Value :', capped_visibility.min())

Maximum Value : 0.3
Minimum Value : 0.0


In [None]:
# Separating the input features and the target variable
target_col = 'Item_Outlet_Sales'
y = df[target_col]
X = df.drop(columns = target_col)

In [None]:
# Categorical Columns: names of every object-dtype column among the input features
object_columns = X.select_dtypes(include = 'object').columns
categorical_cols = object_columns.tolist()

In [None]:
# Preprocessing Pipeline
# Categorical columns are one-hot encoded (categories not seen during fit are
# ignored at transform time); all remaining columns are passed through unchanged.
categorical_encoder = OneHotEncoder(handle_unknown = 'ignore')
categorical_step = ('cat', categorical_encoder, categorical_cols)
preprocessor = ColumnTransformer(transformers = [categorical_step], remainder = 'passthrough')

In [None]:
# Defining the models
# Candidate regressors keyed by a display name. The name is printed with the
# results and recorded for the best-scoring model during evaluation.
# RandomForestRegressor is imported in the import cell at the top of the
# notebook together with the other scikit-learn estimators.
models = {
    'GradientBoosting' : GradientBoostingRegressor(n_estimators = 200, learning_rate = 0.1, random_state = 42),
    'RandomForest' : RandomForestRegressor(n_estimators = 200, random_state = 42),
    'LinearRegression' : LinearRegression()
}

In [None]:
# Performing train test split: 20% of the rows are held out for testing,
# with a fixed random_state so the split is reproducible
split_options = {'test_size' : 0.2, 'random_state' : 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Evaluate all the models
# Every candidate regressor is fitted inside its own preprocessing pipeline and
# scored with R2 on the held-out split; the pipeline with the highest rounded
# R2 is kept in best_pipeline.
# NOTE(review): the winner is chosen on the same X_test / y_test split that the
# R2 is reported on, so the score of the selected model is likely optimistic.
best_model_name = None
best_score = -np.inf
best_pipeline = None

for name, reg in models.items():
    # Clone both steps so that every pipeline owns independent estimators.
    # Without cloning, all pipelines would share the single `preprocessor`
    # object, and the preprocessor inside an already stored best_pipeline
    # would be refitted by every later iteration.
    pipeline = Pipeline(steps = [
        ('preprocessor', clone(preprocessor)),
        ('regressor', clone(reg))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    r2 = np.round(metrics.r2_score(y_test, y_pred), 3)
    print(f"\n{name} Results :")
    print(f"R2 Score : {r2 : .3f}")

    # Strict '>' keeps the earlier model when two rounded scores are equal.
    if r2 > best_score:
        best_score = r2
        best_model_name = name
        best_pipeline = pipeline



GradientBoosting Results :
R2 Score :  0.593

RandomForest Results :
R2 Score :  0.563

LinearRegression Results :
R2 Score :  0.408


In [None]:
# Updating sklearn
!pip install --upgrade scikit-learn

In [None]:
# Saving the best model in pickle format
import sklearn

# The pipeline is stored together with the scikit-learn version string of the
# current session, as a (pipeline, version) tuple.
model_bundle = (best_pipeline, sklearn.__version__)
with open('bigmart_best_model.pkl', 'wb') as model_file:
    pickle.dump(model_bundle, model_file)

In [None]:
# Saving the best model in pickle format (alternative version)
# Only the pipeline itself is stored here, without the version string.
# This writes to the same path as the tuple-based save, so running this cell
# replaces that file.
import sklearn

model_path = 'bigmart_best_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(best_pipeline, model_file)

In [None]:
'''
(1) If the model was saved with the alternative code (pipeline only, without the
sklearn version), edit the app.py file and replace the line
    model, sklearn_version = pickle.load(f)
with
    model = pickle.load(f)

(2) Try saving the trained ML model with joblib instead of the pickle file format.
'''