In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import requests
import boto3
import lmfit
import mlflow.sagemaker as mfs
import sklearn.linear_model as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Importing data 


In [13]:
leads = pd.read_csv("C:/Users/vemba/Desktop/leads_cleaned.csv")
leads.columns = map(str.lower, leads.columns)
leads.head()

Unnamed: 0,row_number,prospect id,lead number,lead origin,lead source,do not email,do not call,converted,totalvisits,total time spent on website,...,through recommendations,receive more updates about our courses,tags,lead quality,update me on supply chain content,get updates on dm content,city,i agree to pay the amount through cheque,a free copy of mastering the interview,last notable activity
0,0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,...,No,No,Interested in other courses,Low in Relevance,No,No,Mumbai,No,No,Modified
1,1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,...,No,No,Ringing,Not Sure,No,No,Mumbai,No,No,Email Opened
2,2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,...,No,No,Will revert after reading the email,Might be,No,No,Mumbai,No,Yes,Email Opened
3,3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,...,No,No,Ringing,Not Sure,No,No,Mumbai,No,No,Modified
4,4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,...,No,No,Will revert after reading the email,Might be,No,No,Mumbai,No,No,Modified


# Create data pre-processing steps before plugging into model

In [14]:
# Create data pre-processing steps before plugging into model
leads_categorical_columns = ['lead origin', 
                             'lead source',
                             'last activity',
                             'specialization',
                             'what is your current occupation',
                             'what matters most to you in choosing a course',
                             'city',
                             'last notable activity'
                              ]

leads_numerical_columns = ['totalvisits',
                           'total time spent on website',
                           'page views per visit']

leads_response_columns = ['converted']

# From here, we can create our train/test datasets that will be used for training:

In [15]:
# split data for training, remove extras
leads_x = leads.drop(leads_response_columns, axis=1)
leads_y = leads[leads_response_columns]

leads_x_train, leads_x_test, leads_y_train, leads_y_test = train_test_split(leads_x,
                                                                            leads_y,
                                                                            train_size=0.7,
                                                                            test_size=0.3,
                                                                            random_state=5050)

In [16]:
# Now that we have a test dataset, let’s go ahead and create a scaler for our numeric variables.
scaler = StandardScaler()
scaler = scaler.fit(leads_x_train[leads_numerical_columns])

# Now, we need to make some adjustments to the model to prepare for modeling. We’ve created a function to perform a few things:
Select the columns that we’ve defined as important
Use the fitted scaler to center and scale the numeric columns
Turn categorical variables into one-hot encoded variables
Ensure that all columns from the training dataset are also in the outputted, processed dataset (This is important so that all levels of dummy variables are created, even if the dataset we import doesn’t have each individual level.)

In [17]:

def pre_process_leads(df,
                      numerical_columns,
                      categorical_columns,
                      fitted_scaler,
                      train_df_columns = None):
    ## create new df with selected columns
    df.columns = map(str.lower, df.columns)
    _df = df[set(numerical_columns + categorical_columns)].copy()
    
    ## scale the numeric columns with the pre-built scaler
    _df[numerical_columns] = fitted_scaler.transform(_df[numerical_columns])
         
    # First, make categorical text lowercase
    _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())
    # Next, create one-hot-encoded variables, add to dataframe, drop old columns
    _df_dummies = pd.get_dummies(_df[categorical_columns], drop_first=True)
    _df = pd.concat([_df, _df_dummies], axis=1)
    _df.drop(categorical_columns, axis=1, inplace = True)

    if train_df_columns:
        _df = _df.reindex(columns=train_df_columns, fill_value=0)

    return _df

# Here’s how it looks when we put it all together and run both the training and test dataset through our preprocessing function:

In [18]:
leads_x_train_clean = pre_process_leads(df = leads_x_train,
                                            numerical_columns = leads_numerical_columns,
                                            categorical_columns = leads_categorical_columns,
                                            fitted_scaler = scaler)

leads_x_test_clean = pre_process_leads(df = leads_x_test,
                                           numerical_columns = leads_numerical_columns,
                                           categorical_columns = leads_categorical_columns,
                                           fitted_scaler = scaler,
                                           train_df_columns = leads_x_train_clean.columns.tolist())


In [19]:
leads_x_train_clean

Unnamed: 0,page views per visit,totalvisits,total time spent on website,lead origin_landing page submission,lead origin_lead add form,lead origin_lead import,lead source_blog,lead source_click2call,lead source_direct traffic,lead source_facebook,...,last notable activity_form submitted on website,last notable activity_had a phone conversation,last notable activity_modified,last notable activity_olark chat conversation,last notable activity_page visited on website,last notable activity_resubscribed to emails,last notable activity_sms sent,last notable activity_unreachable,last notable activity_unsubscribed,last notable activity_view in browser link clicked
626,-1.078980,-0.685814,-0.883085,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2286,-0.165702,-0.290676,-0.122857,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1776,-0.394021,-0.093107,1.491250,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8336,-1.078980,-0.685814,-0.883085,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4663,-0.165702,0.104462,0.916488,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3153,-1.078980,-0.685814,-0.883085,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5491,-0.165702,-0.290676,2.106410,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5748,0.062618,0.302031,-0.816978,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5271,-0.015011,0.697168,-0.407484,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [20]:
leads_x_test_clean 

Unnamed: 0,page views per visit,totalvisits,total time spent on website,lead origin_landing page submission,lead origin_lead add form,lead origin_lead import,lead source_blog,lead source_click2call,lead source_direct traffic,lead source_facebook,...,last notable activity_form submitted on website,last notable activity_had a phone conversation,last notable activity_modified,last notable activity_olark chat conversation,last notable activity_page visited on website,last notable activity_resubscribed to emails,last notable activity_sms sent,last notable activity_unreachable,last notable activity_unsubscribed,last notable activity_view in browser link clicked
3208,0.747577,0.104462,-0.725164,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8819,-0.165702,-0.290676,-0.706801,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4854,-0.165702,-0.290676,0.334381,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7810,-1.078980,-0.685814,-0.883085,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3174,-1.078980,-0.685814,-0.883085,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,1.660856,0.499600,0.903634,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3816,0.290938,-0.093107,1.461869,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
492,0.747577,0.104462,-0.076950,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4178,0.747577,0.104462,1.059719,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#Training the model :Let’s use our newly cleaned and split datasets to train a random forest model that predicts the chances of someone converting into a paying customer of X Education. First, let’s define a few standard hyperparameters and initialize the SKLearn model:

In [21]:
## Train the random forest model
num_estimators = 100
min_samples = 4

rf = RandomForestClassifier (n_estimators=num_estimators,min_samples_split=min_samples)
rf.fit(leads_x_train_clean, leads_y_train.values.ravel())






RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

From here, we can quickly calculate a few accuracy metrics in our test set to see how the model d

In [22]:
leads_y_test_predicted = rf.predict(leads_x_test_clean)

accuracy = metrics.accuracy_score(leads_y_test, leads_y_test_predicted)
auc_score = metrics.roc_auc_score(leads_y_test, leads_y_test_predicted)

print(accuracy)
print(auc_score)

0.8229893499816379
0.8033358734117294


In [23]:
# connect to MLflow use linux command in terminal before executing this cell-"mlflow ui"
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("MyLeadScoringProcessed")


# creates an experiment if it doesn't exist

In [24]:
remote_server_uri = "http://localhost:5000" # set to your server URI
mlflow.set_tracking_uri(remote_server_uri)
# Note: on Databricks, the experiment name passed to mlflow_set_experiment must be a
# valid path in the workspace
mlflow.set_experiment("MyLeadScoringProcessed")
with mlflow.start_run():
    mlflow.log_param("a", 1)
    mlflow.log_metric("b", 2)

In [25]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("MyLeadScoringProcessed")

In [26]:
class leadsModel(mlflow.pyfunc.PythonModel):
   
    ## defining objects needed for leadsModel prediction. 
    def __init__(self,
                 train_df_columns,
                 model,
                 leads_categorical_columns,
                 leads_numerical_columns,
                 fitted_scaler,
                 pre_process_leads):
        
        ## Setting up all needed objects
        self.train_df_columns = train_df_columns
        self.model = model
        self.leads_categorical_columns = leads_categorical_columns
        self.leads_numerical_columns = leads_numerical_columns
        self.fitted_scaler = fitted_scaler
        self.pre_process_leads = pre_process_leads
    
    ## define function with processing and feeding data into prediction at the end
    def predict(self,context,model_input):
        
        # make sure all inputted columns are lowercase
        model_input.columns = map(str.lower, model_input.columns)
        
        # run inputted dataset through our processing function
        # note: we are excluding the response columns here since not needed for deploy
        model_input_processed = self.pre_process_leads(
                                   df = model_input,
                                   numerical_columns = self.leads_numerical_columns,
                                   categorical_columns = self.leads_categorical_columns,
                                   fitted_scaler = self.fitted_scaler,
                                   train_df_columns = self.train_df_columns)       
        
        # finally input the cleaned/adjusted dataset into our model for prediction
        return self.model.predict(model_input_adjusted)

Logging the model to MLflow
Before we package everything up and log the model, we need to setup the Anaconda environment that will be used when the model runs on Sagemaker.

In [27]:
# define specific python and package versions for environment
mlflow_conda_env = {
 'name': 'mlflow-env',
 'channels': ['defaults'],
 'dependencies': ['python=3.7.6', {'pip': ['mlflow==1.9.0','scikit-learn','cloudpickle==1.3.0']}]
}

# we start a run within MLflow. Within that run, we log our hyperparameters, accuracy metrics, and finally the model itself!

In [29]:

import os
from mlflow import log_metric, log_param, log_artifact
from mlflow import pyfunc as ml_pyfunc

# start mlflow run, log parameters, metrics, and the model
with mlflow.start_run(run_name="Leads Model with Processing") as run:
    # log the parameters that we defined for the model training
    mlflow.log_param("num_estimators", num_estimators)
    mlflow.log_param("min_samples", min_samples)
    
    # log the performance metrics that we calculated earlier
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc_score", auc_score)
    
    # log model with all objects referenced in the leadsModel class
    ml_pyfunc.log_model(
        artifact_path = "leads_pyfunc",
        python_model = leadsModel(train_df_columns = leads_x_train_clean.columns.tolist(),
                                  model = rf,
                                  leads_categorical_columns = leads_categorical_columns,
                                  leads_numerical_columns = leads_numerical_columns,
                                  fitted_scaler = scaler,
                                  pre_process_leads = pre_process_leads
                                 ),
        conda_env = mlflow_conda_env
    )
    
    # save run_id and experiment_id for deployment
    run_id = run.info.run_uuid
    experiment_id = run.info.experiment_id
    
    # end the mlflow run!
    mlflow.end_run()

# Sagemaker: Deploying the Model

In [70]:
## Note: this requires a MLflow pyfunc docker container to already exist in sagemaker

import mlflow.sagemaker as mfs


# we pull the run and experiment id's from above to create this mlflow location
#model_uri = "mlruns2/%s/%s/artifacts/leads_pyfunc" % (experiment_id,run_id)

# The region is chosen, pick whats close to you or your systems!
region = "ap-south-1"
# The aws account id can be found in the console
aws_account_id = "XXXXXXX"
# We use these inputs to automatically reference the sagemaker docker container
image_url = aws_account_id \
            + ".dkr.ecr." \
            + region \
            + ".amazonaws.com/mlflow-pyfunc:1.5.0"

# now we specify the role that we setup for sagemaker in the previous step
sagemaker_arn = "arn:aws:iam::873481834788:role/AWSGlueServiceSageMakerNotebookRole"


# finally, we pick a name for our endpoint within sagemaker
endpoint_name = "leads" 


# with all of the inputs, we run the following to deploy the model it sagemaker
mfs.deploy(app_name=endpoint_name, 
           model_uri=model_uri,
           region_name=region,
           mode="create", #this should change to replace if the endpoint already exists
           execution_role_arn=sagemaker_arn,
           image_url=image_url, 
           instance_type='ml.t2.medium') # smallest/cheapest sagemaker allowed size

OSError: No such file or directory: 'mlruns2\3\aa23c38c02ec4fe0939edc6128566046\artifacts\leads_pyfunc'