# Big Mart Outlet Sales Prediction

## Build Machine Learning pipeline

In [1]:
# Import required libraries
import pandas as pd
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [2]:
# Read the training data set.
data = pd.read_csv('train_v9rqX0R.csv')

# Separate the independent variables from the target variable.
train_x = data.drop(columns=['Item_Outlet_Sales'])
train_y = data['Item_Outlet_Sales']

# Preview the top rows. This has to be the last expression of the cell:
# a bare expression in the middle of a cell is evaluated and discarded,
# so the preview would never be displayed.
data.head()

In [3]:
# Build a custom encoder that derives three binary indicator columns from Outlet_Type and Outlet_Identifier

from sklearn.base import BaseEstimator

class custom_encoder(BaseEstimator):
    '''Add three binary indicator columns derived from two outlet columns.

    The indicators are:
      * outlet_grocery_store     -- 1 where Outlet_Type == 'Grocery Store'
      * outlet_supermarket_3     -- 1 where Outlet_Type == 'Supermarket Type3'
      * outlet_identifier_OUT027 -- 1 where Outlet_Identifier == 'OUT027'

    The encoder is stateless: nothing is learned in fit().
    '''
    def __init__(self):
        pass

    def fit(self, documents, y=None):
        '''No-op fit; both arguments are ignored. Returns self.'''
        return self

    def transform(self, data_x):
        '''Return a copy of data_x with the three indicator columns added.

        data_x must provide the 'Outlet_Type' and 'Outlet_Identifier'
        columns. The input frame is NOT modified: assign() builds a new
        frame, so the caller's train/test data keeps its original columns
        no matter how many times the pipeline is fitted or used to predict.
        '''
        return data_x.assign(
            outlet_grocery_store=(data_x['Outlet_Type'] == 'Grocery Store') * 1,
            outlet_supermarket_3=(data_x['Outlet_Type'] == 'Supermarket Type3') * 1,
            outlet_identifier_OUT027=(data_x['Outlet_Identifier'] == 'OUT027') * 1,
        )

In [4]:
# Data pre-processing step
# 1) Drop unnecessary columns  
# 2) Impute the missing values in column Item_Weight by mean
# 3) Scale the data in the column Item_MRP

# Columns removed before modelling. 'Outlet_Type' and 'Outlet_Identifier'
# are dropped here after the custom encoder step has already read them to
# build its indicator columns. Each column is listed exactly once.
preprocess = ColumnTransformer(remainder='passthrough',  # unlisted columns pass through unchanged
                               transformers=[('drop_columns', 'drop', ['Item_Identifier',
                                                                       'Outlet_Identifier',
                                                                       'Item_Fat_Content',
                                                                       'Item_Type',
                                                                       'Outlet_Size',
                                                                       'Outlet_Location_Type',
                                                                       'Outlet_Type'
                                                                      ]),
                                             # fill missing Item_Weight values with the column mean
                                             ('impute_item_weight', SimpleImputer(strategy='mean'), ['Item_Weight']),
                                             # standardise Item_MRP
                                             ('scale_data', StandardScaler(),['Item_MRP'])])

In [5]:
# Define the Pipeline
# 1) Encode selected categorical variables
# 2) Preprocess the data
# 3) Train a Random Forest Regressor model

# Assemble the pipeline: custom encoder -> column preprocessing -> random forest
print('***** Building Pipeline *****\n')
pipeline_steps = [
    ('encode_cat_var', custom_encoder()),
    ('preprocessing', preprocess),
    ('random_forest', RandomForestRegressor(max_depth=10, random_state=2)),
]
model_pipeline = Pipeline(steps=pipeline_steps)

# Train every step of the pipeline on the training split
print('...Fitting the pipeline with the training data...\n')
model_pipeline.fit(train_x, train_y)

# In-sample predictions (computed on the same rows the model was trained on)
print('...Predict target on the train data...\n')
train_predictions = model_pipeline.predict(train_x)
print('Predicted sales: ', train_predictions)

# Load the held-out test file
print('\n...Reading the test data...\n')
test_data = pd.read_csv('test_AbJTz2l.csv')

# Predictions for the test file
print('...Predict on the test data...\n')
test_predictions = model_pipeline.predict(test_data)
print('Predicted sales: ', test_predictions)

***** Building Pipeline *****

...Fitting the pipeline with the training data...

...Predict target on the train data...

Predicted sales:  [4348.01437159  693.40276088 2323.35268531 ... 1300.83665376 1508.86795688
 1152.35475793]

...Reading the test data...

...Predict on the test data...

Predicted sales:  [1646.0548363  1305.4298012   658.42325097 ... 1848.60226305 3959.67937864
 1384.01466986]
