In [1]:
# Build a single pipeline covering feature encoding, preprocessing, and model training

#importing libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Read the training dataset.
TARGET_COLUMN = 'Item_Outlet_Sales'
data = pd.read_csv('SalesPrediction.csv')

# Separate the independent columns (features) from the dependent column (target).
train_x = data.drop(columns=[TARGET_COLUMN])
train_y = data[TARGET_COLUMN]

# Preview the first rows. This has to be the cell's final expression: Jupyter
# only renders the value of the last expression, so a bare `data.head()` placed
# earlier in the cell is evaluated and silently discarded.
data.head()

In [16]:
import os

import mlflow
import mlflow.sklearn

# Address of the MLflow tracking server. The MLFLOW_TRACKING_URI environment
# variable wins when it is set to a non-empty value, so the notebook can target
# a different server without code edits; otherwise the original address is used.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI") or "http://10.42.204.118:8000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Make "MLPipeline" the active experiment. Kept as the last expression so the
# returned Experiment object is displayed as the cell output.
mlflow.set_experiment("MLPipeline")

2022/01/01 21:34:22 INFO mlflow.tracking.fluent: Experiment with name 'MLPipeline' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/appadm/MLOps/MLFlowService/mlruns/11', experiment_id='11', lifecycle_stage='active', name='MLPipeline', tags={}>

In [5]:
#importing baseestimator

from sklearn.base import BaseEstimator

# Custom transformer that adds binary outlet indicator columns.
# A custom transformer used inside a Pipeline must provide fit and transform methods.
class OutletTypeEncoder(BaseEstimator):
    """Add binary (0/1) indicator columns derived from outlet columns.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only compares existing columns against fixed string values.
    """

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        """Do nothing and return ``self`` (no parameters are learned).

        Parameters
        ----------
        documents : ignored
        y : ignored
        """
        return self

    def transform(self, x_dataset):
        """Return a copy of ``x_dataset`` with three indicator columns added.

        Parameters
        ----------
        x_dataset : pandas.DataFrame
            Must contain the ``Outlet_Type`` and ``Outlet_Identifier`` columns.

        Returns
        -------
        pandas.DataFrame
            A new frame holding the input columns plus:
            ``outlet_grocery_store`` (1 where ``Outlet_Type`` is
            ``'Grocery Store'``), ``outlet_supermarket_3`` (1 where
            ``Outlet_Type`` is ``'Supermarket Type3'``) and
            ``outlet_identifier_OUT027`` (1 where ``Outlet_Identifier`` is
            ``'OUT027'``); 0 everywhere else.
        """
        # `assign` builds a new DataFrame, so the caller's frame is left
        # untouched. Writing the columns with `x_dataset[...] = ...` would
        # silently add them to whatever frame was passed to fit/predict.
        return x_dataset.assign(
            outlet_grocery_store=(x_dataset['Outlet_Type'] == 'Grocery Store') * 1,
            outlet_supermarket_3=(x_dataset['Outlet_Type'] == 'Supermarket Type3') * 1,
            outlet_identifier_OUT027=(x_dataset['Outlet_Identifier'] == 'OUT027') * 1,
        )


In [6]:
# Preprocessing step:
#   - drop the columns that are not fed to the model
#   - impute missing Item_Weight values with the column mean
#   - standardize Item_MRP
# Every column not named below is passed through unchanged.

# Each column is listed exactly once ('Outlet_Identifier' previously appeared twice).
COLUMNS_TO_DROP = ['Item_Identifier',
                   'Outlet_Identifier',
                   'Item_Fat_Content',
                   'Item_Type',
                   'Outlet_Size',
                   'Outlet_Location_Type',
                   'Outlet_Type'
                  ]

pre_process=ColumnTransformer(remainder='passthrough',
                                transformers=[('drop_columns', 'drop', COLUMNS_TO_DROP),
                                              ('impute_item_weight', SimpleImputer(strategy='mean'), ['Item_Weight']),
                                              ('scale_data', StandardScaler(),['Item_MRP'])])

In [7]:
# Assemble the model pipeline from its three named steps.

"""
Step1: get the updated binary columns
Step2: preprocessing
Step3: Training the model
"""
outlet_encoder_step = ('get_outlet_binary_columns', OutletTypeEncoder())
pre_processing_step = ('pre_processing', pre_process)
regressor_step = ('random_forest', RandomForestRegressor(max_depth=10, random_state=2))

model_pipeline = Pipeline(steps=[outlet_encoder_step,
                                 pre_processing_step,
                                 regressor_step])

# Train every step of the pipeline on the training data.
model_pipeline.fit(train_x, train_y)

# Predictions for the same rows the pipeline was trained on.
train_predict_pipeline = model_pipeline.predict(train_x)

In [17]:
# Root-mean-squared error of the pipeline's predictions on the training data:
# take the mean squared error first, then its square root.
train_mse = mean_squared_error(train_y, train_predict_pipeline)
RMSE = train_mse ** 0.5

In [18]:
# Record the training RMSE under the metric key "RMSE".
mlflow.log_metric(key="RMSE", value=RMSE)

In [None]:
# Log the fitted pipeline as the "model" artifact and register it under the
# name "randomforestmlpipeline".
mlflow.sklearn.log_model(model_pipeline,"model",registered_model_name="randomforestmlpipeline")

# The logging calls in this notebook use the fluent API without an explicit
# `mlflow.start_run()`, so the run they log to is opened implicitly and would
# otherwise stay active for the life of the kernel. Close it here so that
# re-running the logging cells starts a fresh run instead of appending to this one.
mlflow.end_run()

In [1]:
import git

# Open the existing local repository located at REPO_PATH.
REPO_PATH = "D:/GitProjects/salesprediction"
repo = git.Repo(REPO_PATH)

In [4]:
# Fetch from the repository's default remote; the returned fetch results are
# left as the cell's last expression so they are displayed.
default_remote = repo.remote()
default_remote.fetch()

[<git.remote.FetchInfo at 0x139bfeba720>]

In [11]:
# Stage every change in the working tree (GitPython turns `all=True` into `--all`).
repo.git.add(all=True)

''

In [12]:
# Commit the staged changes (GitPython turns `message=...` into `--message=...`).
repo.git.commit(message="commiting jupyter notebook")

'[master (root-commit) 32264d7] commiting jupyter notebook\n Committer: vishnunagineni <vishnu.nagineni@zensar.com>\nYour name and email address were configured automatically based\non your username and hostname. Please check that they are accurate.\nYou can suppress this message by setting them explicitly:\n\n    git config --global user.name "Your Name"\n    git config --global user.email you@example.com\n\nAfter doing this, you may fix the identity used for this commit with:\n\n    git commit --amend --reset-author\n\n 3 files changed, 8972 insertions(+)\n create mode 100644 .ipynb_checkpoints/ML Pipeline-checkpoint.ipynb\n create mode 100644 ML Pipeline.ipynb\n create mode 100644 SalesPrediction.csv'

In [13]:
# Handle to the remote named 'origin'.
origin = repo.remote('origin')

In [14]:
# Push to 'origin' and display the returned push results.
push_results = origin.push()
push_results

[<git.remote.PushInfo at 0x259435bae50>]

In [15]:
# Run `git clone` for the repository at D:/Git/salesprediction.git, using
# D:/mlflow as the git command's working directory.
mlflow_workdir = git.Git("D:/mlflow")
mlflow_workdir.clone("D:/Git/salesprediction.git")

''