In [1]:
#building pipeline for all the above steps

#importing libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

#reading training dataset

data=pd.read_csv('SalesPrediction.csv')

data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [22]:
from sklearn.model_selection import train_test_split
X=data.drop(["Item_Outlet_Sales"],axis=1)
Y=data[["Item_Outlet_Sales"]]
train_x,test_x,train_y,test_y=train_test_split(X,Y,test_size=0.25)

In [59]:
x=train_x.head(1).values.tolist()[0]

In [57]:
y=train_x.head().columns.tolist()

In [61]:
d={}

In [67]:
for i in range(0,len(y)):
    d[y[i]]=[x[i]]

In [68]:
d

{'Item_Identifier': ['FDA58'],
 'Item_Weight': [9.395],
 'Item_Fat_Content': ['Low Fat'],
 'Item_Visibility': [0.103664897],
 'Item_Type': ['Snack Foods'],
 'Item_MRP': [233.6932],
 'Outlet_Identifier': ['OUT013'],
 'Outlet_Establishment_Year': [1987],
 'Outlet_Size': ['High'],
 'Outlet_Location_Type': ['Tier 3'],
 'Outlet_Type': ['Supermarket Type1'],
 'outlet_grocery_store': [0],
 'outlet_supermarket_3': [0],
 'outlet_identifier_OUT027': [0]}

In [69]:
x_in=pd.DataFrame(d)

In [25]:
#importing baseestimator

from sklearn.base import BaseEstimator

#creating class outlet identifier
#custom transformer must have fit and transform methods
class OutletTypeEncoder(BaseEstimator):
    def __init__(self):
        pass
    
    def fit(self,documents,y=None):
        return self
    
    def transform(self,x_dataset):
        x_dataset['outlet_grocery_store'] = (x_dataset['Outlet_Type'] == 'Grocery Store')*1
        x_dataset['outlet_supermarket_3'] = (x_dataset['Outlet_Type'] == 'Supermarket Type3')*1
        x_dataset['outlet_identifier_OUT027'] = (x_dataset['Outlet_Identifier'] == 'OUT027')*1
        return x_dataset


In [26]:
#preprocessing step
#dropping columns
#imputing the missing values
#Scaling the data if needed

pre_process=ColumnTransformer(remainder='passthrough',
                                transformers=[('drop_columns', 'drop', ['Item_Identifier',
                                                                        'Outlet_Identifier',
                                                                        'Item_Fat_Content',
                                                                        'Item_Type',
                                                                        'Outlet_Identifier',
                                                                        'Outlet_Size',
                                                                        'Outlet_Location_Type',
                                                                        'Outlet_Type'
                                                                       ]),
                                              ('impute_item_weight', SimpleImputer(strategy='mean'), ['Item_Weight']),
                                              ('scale_data', StandardScaler(),['Item_MRP'])])

In [34]:
#Defining pipeline 

"""
Step1: get the updated binary columns
Step2: preprocessing
Step3: Training the model
"""
rf=Pipeline(steps=[('get_outlet_binary_columns', OutletTypeEncoder()), 
                                 ('pre_processing',pre_process),
                                 ('random_forest', RandomForestRegressor(max_depth=10,random_state=2))
                                 ])

#fit the pipeline with training data
rf.fit(train_x,train_y)

#predicting the training values
predictions=rf.predict(test_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_dataset['outlet_grocery_store'] = (x_dataset['Outlet_Type'] == 'Grocery Store')*1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_dataset['outlet_supermarket_3'] = (x_dataset['Outlet_Type'] == 'Supermarket Type3')*1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_dataset['outlet_identifier_OUT0

In [35]:
RMSE=mean_squared_error(test_y, predictions)**0.5

In [87]:
import joblib
import os

In [90]:
if not os.path.exists("saved_model"):
    os.makedirs("saved_model")
model_path=os.path.join("saved_model","model.joblib")

In [91]:
with open(model_path,"wb") as f:
    joblib.dump(rf,f)