# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.34.0


## Dataset

### Overview
The dataset contains all the data to predict the behavior to retain customers. Each row represents a customer. Each column contains the customer's attributes. The datasets include information about customers who left within the last month in a column called Churn; services that each customer has signed up for like phone, multiple lines, internet, online security, and others; information about the customer like how long they have been a customer, contract, payment method, and others; demographic information about customers like gender, age range, and if they have partners and dependents. This dataset contains about 7043 unique values and 21 columns.

In [2]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'telco-customer-churn'
experiment=Experiment(ws, experiment_name)

# Try to load the dataset from the Workspace. Otherwise, create it from the file
# NOTE: update the key to match the dataset name
found = False
key = "Customer Churn"
description_text = "Customer Churn DataSet for Udacity Capstone Project"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        example_data = 'https://raw.githubusercontent.com/srees1988/predict-churn-py/main/customer_churn_data.csv'
        dataset = Dataset.Tabular.from_delimited_files(example_data)        
        #Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)


df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2266.771362
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3794.7375
max,1.0,72.0,118.75,8684.8


Check the first five rows:

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,True,False,1,False,No phone service,DSL,No,...,No,No,No,No,Month-to-month,True,Electronic check,29.85,29.85,False
1,5575-GNVDE,Male,0,False,False,34,True,No,DSL,Yes,...,Yes,No,No,No,One year,False,Mailed check,56.95,1889.5,False
2,3668-QPYBK,Male,0,False,False,2,True,No,DSL,Yes,...,No,No,No,No,Month-to-month,True,Mailed check,53.85,108.15,True
3,7795-CFOCW,Male,0,False,False,45,False,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,False,Bank transfer (automatic),42.3,1840.75,False
4,9237-HQITU,Female,0,False,False,2,True,No,Fiber optic,No,...,No,No,No,No,Month-to-month,True,Electronic check,70.7,151.65,True


The column customerID should be removed because they have unique values in the whole column:

In [4]:
df['customerID'].nunique() == df.shape[0]

True

In [5]:
df.drop('customerID', axis=1, inplace=True)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,True,False,1,False,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,True,Electronic check,29.85,29.85,False
1,Male,0,False,False,34,True,No,DSL,Yes,No,Yes,No,No,No,One year,False,Mailed check,56.95,1889.5,False
2,Male,0,False,False,2,True,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,True,Mailed check,53.85,108.15,True
3,Male,0,False,False,45,False,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,False,Bank transfer (automatic),42.3,1840.75,False
4,Female,0,False,False,2,True,No,Fiber optic,No,No,No,No,No,No,Month-to-month,True,Electronic check,70.7,151.65,True


Check the data types present along the columns:

In [6]:
df.dtypes

gender               object
SeniorCitizen         int64
Partner                bool
Dependents             bool
tenure                int64
PhoneService           bool
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling       bool
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                  bool
dtype: object

Let's check the missing values:

In [7]:
# check missing values
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

Let's check the unique values of the float columns:

In [8]:
float_columns = df.select_dtypes(include=['float64']).columns
print(float_columns)

Index(['MonthlyCharges', 'TotalCharges'], dtype='object')


In [9]:
for column in float_columns:
    print(df[column].value_counts())
    print('\n')

20.05     61
19.85     45
19.95     44
19.90     44
20.00     43
          ..
114.75     1
103.60     1
113.40     1
57.65      1
113.30     1
Name: MonthlyCharges, Length: 1585, dtype: int64


20.20      11
19.75       9
19.65       8
20.05       8
19.90       8
           ..
1066.15     1
249.95      1
8333.95     1
7171.70     1
1024.00     1
Name: TotalCharges, Length: 6530, dtype: int64




Let's check the unique values of the bool columns:

In [10]:
bool_columns = df.select_dtypes(include=['bool']).columns
print(bool_columns)

Index(['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn'], dtype='object')


In [11]:
for column in bool_columns:
    print(df[column].value_counts())
    print('\n')

False    3641
True     3402
Name: Partner, dtype: int64


False    4933
True     2110
Name: Dependents, dtype: int64


True     6361
False     682
Name: PhoneService, dtype: int64


True     4171
False    2872
Name: PaperlessBilling, dtype: int64


False    5174
True     1869
Name: Churn, dtype: int64




In [12]:
5174/1864

2.7757510729613735

The variable Churn is highly skewed toward False by a factor of 2.77

Let's check the unique values of the object columns:

In [13]:
object_columns = df.select_dtypes(include=['object']).columns
print(object_columns)

Index(['gender', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaymentMethod'],
      dtype='object')


In [14]:
for column in object_columns:
    print(df[column].value_counts())
    print('\n')

Male      3555
Female    3488
Name: gender, dtype: int64


No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64


Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64


No                     3498
Yes                    2019
No internet service    1526
Name: OnlineSecurity, dtype: int64


No                     3088
Yes                    2429
No internet service    1526
Name: OnlineBackup, dtype: int64


No                     3095
Yes                    2422
No internet service    1526
Name: DeviceProtection, dtype: int64


No                     3473
Yes                    2044
No internet service    1526
Name: TechSupport, dtype: int64


No                     2810
Yes                    2707
No internet service    1526
Name: StreamingTV, dtype: int64


No                     2785
Yes                    2732
No internet service    1526
Name: StreamingMovies, dtype: int64


Month-to-mo

## Train Test Splitting

In [15]:
from sklearn.model_selection import train_test_split
from azureml.data.dataset_factory import TabularDatasetFactory
description_text = "Train and Test splitting from Customer Churn DataSet for Udacity Capstone Project"

churn = df['Churn']

# Split data into train and test data taking into account the variable Churn is highly skewed:
train_dataset, test_dataset = train_test_split(df, test_size=0.2, stratify=churn, random_state=42)

directory = 'train'
if not os.path.exists(directory):
    os.makedirs(directory)
    
directory = 'test'
if not os.path.exists(directory):
    os.makedirs(directory)

# Export data as csv
train_dataset.to_csv("./train/train_data.csv", index=False)
test_dataset.to_csv("./test/test_data.csv", index=False)

# # Upload data to the datastore
datastore = ws.get_default_datastore()
datastore.upload(src_dir='./train', target_path = experiment_name)
datastore.upload(src_dir='./test', target_path = experiment_name)
print('Data uploaded to DataStore')

csv_path_train = [(datastore, experiment_name+'/train_data.csv')]
csv_path_test = [(datastore, experiment_name+'/test_data.csv')]

train_data = Dataset.Tabular.from_delimited_files(path=csv_path_train)
test_data = Dataset.Tabular.from_delimited_files(path=csv_path_test)

display(train_data.to_pandas_dataframe().head())
display(test_data.to_pandas_dataframe().head())

Uploading an estimated of 1 files
Uploading ./train/train_data.csv
Uploaded ./train/train_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Uploading an estimated of 1 files
Uploading ./test/test_data.csv
Uploaded ./test/test_data.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Data uploaded to DataStore


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,False,False,35,False,No phone service,DSL,No,No,Yes,No,Yes,Yes,Month-to-month,False,Electronic check,49.2,1701.65,False
1,Male,0,True,True,15,True,No,Fiber optic,Yes,No,No,No,No,No,Month-to-month,False,Mailed check,75.1,1151.55,False
2,Male,0,True,True,13,False,No phone service,DSL,Yes,Yes,No,Yes,No,No,Two year,False,Mailed check,40.55,590.35,False
3,Female,0,True,False,26,True,No,DSL,No,Yes,Yes,No,Yes,Yes,Two year,True,Credit card (automatic),73.5,1905.7,False
4,Male,0,True,True,1,True,No,DSL,No,No,No,No,No,No,Month-to-month,False,Electronic check,44.55,44.55,False


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,True,True,72,True,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,True,Credit card (automatic),114.05,8468.2,False
1,Female,1,False,False,8,True,Yes,Fiber optic,No,No,No,Yes,Yes,Yes,Month-to-month,True,Credit card (automatic),100.15,908.55,False
2,Female,0,True,True,41,True,Yes,DSL,Yes,Yes,Yes,No,Yes,No,One year,True,Credit card (automatic),78.35,3211.2,False
3,Male,0,True,False,18,True,No,Fiber optic,No,No,Yes,Yes,No,No,Month-to-month,False,Electronic check,78.2,1468.75,False
4,Female,0,True,False,72,True,Yes,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,True,Credit card (automatic),82.65,5919.35,False


## Cluster Provisioning


In [16]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###

cluster_name = "cluster-vhcg"
# verify that the cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name = cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_D2_V2', max_nodes = 4, idle_seconds_before_scaledown=120)
    cpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [17]:
# TODO: Put your automl config here
automl_config = AutoMLConfig(
                                compute_target=cpu_cluster,
                                task='classification',
                                training_data=train_data,
                                test_data = test_data,
                                label_column_name='Churn',
                                # n_cross_validations=10,
                                # validation_size=0.2,
                                primary_metric='AUC_weighted',
                                experiment_timeout_minutes=60,
                                max_concurrent_iterations=100,
                                max_cores_per_iteration=-1, 
                                featurization= 'auto',
                                debug_log = "automl_errors.log",                                

                                )

##  Create AutoML Pipeline

In [18]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'

metrics_data = PipelineData(name='metrics_data',
                           datastore=ds,
                           pipeline_output_name=metrics_output_name,
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='model_data',
                           datastore=ds,
                           pipeline_output_name=best_model_output_name,
                           training_output=TrainingOutput(type='Model'))

Create AutoMLStep

In [19]:
automl_step = AutoMLStep(
    name='automl_module',
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True)

In [20]:
from azureml.pipeline.core import Pipeline
pipeline = Pipeline(
    description="pipeline_with_automlstep",
    workspace=ws,    
    steps=[automl_step])

In [21]:
pipeline_run = experiment.submit(pipeline)

Created step automl_module [0918423a][e640bdd2-6c55-4ca7-9b74-54235d380c81], (This step will run and generate new outputs)
Submitted PipelineRun 39ab988f-adc7-4c77-a666-a017e0b72973
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/39ab988f-adc7-4c77-a666-a017e0b72973?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-165556/workspaces/quick-starts-ws-165556&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254


In [22]:
from azureml.widgets import RunDetails
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [23]:
pipeline_run.wait_for_completion()

PipelineRunId: 39ab988f-adc7-4c77-a666-a017e0b72973
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/39ab988f-adc7-4c77-a666-a017e0b72973?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-165556/workspaces/quick-starts-ws-165556&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
PipelineRun Status: Running


StepRunId: ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab?wsid=/subscriptions/5a4ab2ba-6c51-4805-8155-58759ad589d8/resourcegroups/aml-quickstarts-165556/workspaces/quick-starts-ws-165556&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254
StepRun( automl_module ) Status: Running

StepRun(automl_module) Execution Summary
StepRun( automl_module ) Status: Finished

No scores improved over last 20 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK r

'Finished'

## Examine Results

Retrieve the metrics of all child runs

In [24]:
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

Downloading azureml/ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab/metrics_data
Downloaded azureml/ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab/metrics_data, 1 files out of an estimated total of 1


In [25]:
import json
with open(metrics_output._path_on_datastore) as f:
    metrics_output_result = f.read()
    
deserialized_metrics_output = json.loads(metrics_output_result)
df = pd.DataFrame(deserialized_metrics_output)
df

Unnamed: 0,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_92,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_89,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_120,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_81,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_110,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_136,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_107,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_69,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_108,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_111,...,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_209,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_217,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_223,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_202,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_210,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_218,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_298,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_253,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_266,ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab_446
AUC_macro,[0.8502798848238635],[0.8283670232884376],[0.8428060032231685],[0.8445760205402975],[0.842986942530449],[0.8474367140363861],[0.8446156096307442],[0.8344319214051105],[0.8430123116889522],[0.8421376753352542],...,[0.8511639607646929],[0.8447375583149092],[0.8496122889738479],[0.8496614962137006],[0.8361862872679651],[0.8455664407178457],[0.8461863906293261],[0.8496843697946309],[0.8477891982176141],[0.8524444315414019]
average_precision_score_weighted,[0.8665853735526672],[0.8485691082046515],[0.8629733765017665],[0.8606147750573938],[0.8588760720731136],[0.8641115641128074],[0.8611617666820788],[0.8563468188036518],[0.8593264866031013],[0.8592431009187141],...,[0.8681805287279752],[0.8632376152954594],[0.8671219509918361],[0.8669272184846634],[0.857459114730808],[0.8615649044290543],[0.8649903862741303],[0.8658162041208931],[0.8643259844732413],[0.86907776546832]
precision_score_micro,[0.8040468583599574],[0.7900248491302805],[0.8020944266950657],[0.7958821441249556],[0.7962371317003903],[0.8001419950301739],[0.8022719204827831],[0.7907348242811502],[0.7463613773517928],[0.8006744763933261],...,[0.8056443024494143],[0.8006744763933261],[0.8070642527511538],[0.8084842030528931],[0.8006744763933261],[0.7985445509407171],[0.8020944266950657],[0.80386936457224],[0.8001419950301739],[0.7777777777777778]
weighted_accuracy,[0.8616514908748517],[0.8511829456772437],[0.8576750978998846],[0.8751222929162251],[0.8530669344408329],[0.8723536673253601],[0.8562071391664978],[0.8464704560099027],[0.7352896781359869],[0.8587889923868831],...,[0.8597441971640268],[0.8591123842885203],[0.8639041557388981],[0.8623450447995274],[0.8657951146652181],[0.858144104701133],[0.8597003961165913],[0.857196791640205],[0.8660605316945632],[0.8632894197318596]
recall_score_micro,[0.8040468583599574],[0.7900248491302805],[0.8020944266950657],[0.7958821441249556],[0.7962371317003903],[0.8001419950301739],[0.8022719204827831],[0.7907348242811502],[0.7463613773517928],[0.8006744763933261],...,[0.8056443024494143],[0.8006744763933261],[0.8070642527511538],[0.8084842030528931],[0.8006744763933261],[0.7985445509407171],[0.8020944266950657],[0.80386936457224],[0.8001419950301739],[0.7777777777777778]
average_precision_score_macro,[0.8033781080741891],[0.7771051828825062],[0.7985336415859455],[0.7939348059957655],[0.7917392553580158],[0.8003095326289896],[0.7952422592988527],[0.789256017139365],[0.7922125748552403],[0.7930159811363297],...,[0.8056084765585029],[0.7989991924289166],[0.8037543768862966],[0.8043634802881194],[0.7937243291043298],[0.795477662617218],[0.8013876532435115],[0.8022077890448847],[0.8006753069200644],[0.8067030539261008]
precision_score_macro,[0.7551913636994144],[0.7332539388654293],[0.7510314679542812],[0.7572455799946685],[0.7426034194623035],[0.7577123384188577],[0.7511429900353148],[0.733794360226904],[0.7121903292867691],[0.7489887587576526],...,[0.7560365512676689],[0.7508376086504138],[0.7594419711614382],[0.7605083390605696],[0.7526974174324402],[0.7479245109781241],[0.7518955923906746],[0.7530083985632059],[0.7534625630082395],[0.619409085449641]
matthews_correlation,[0.4669766282017848],[0.4255899792241395],[0.4645380081250042],[0.4201649106328626],[0.44829941478421814],[0.43911863772975557],[0.46761219160028444],[0.43597697889271986],[0.47308308110057823],[0.45697886463765464],...,[0.47549562893613473],[0.45777337322999007],[0.47550219274254596],[0.4830038314037279],[0.4480913989365654],[0.4508010886747645],[0.46185738075699484],[0.4719672644189244],[0.4463901087683073],[0.30883056202346054]
accuracy,[0.8040468583599574],[0.7900248491302805],[0.8020944266950657],[0.7958821441249556],[0.7962371317003903],[0.8001419950301739],[0.8022719204827831],[0.7907348242811502],[0.7463613773517928],[0.8006744763933261],...,[0.8056443024494143],[0.8006744763933261],[0.8070642527511538],[0.8084842030528931],[0.8006744763933261],[0.7985445509407171],[0.8020944266950657],[0.80386936457224],[0.8001419950301739],[0.7777777777777778]
precision_score_weighted,[0.7951439598243937],[0.7788557594928598],[0.7936225566732621],[0.7840966037509954],[0.7872954558286661],[0.7883626016478505],[0.7945578838065982],[0.7821042980085523],[0.8050897985624816],[0.7908282331973298],...,[0.797750247481125],[0.7918118778081197],[0.7984232043495402],[0.8008748735086266],[0.7892723592384945],[0.7892135819693169],[0.7930218868779982],[0.7962809885323318],[0.789309766378631],[0.7062369694839704]



Retrieve the Best Model

In [26]:

# Retrieve best model from Pipeline Run
best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download('.', show_progress=True)

Downloading azureml/ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab/model_data
Downloaded azureml/ac07f62d-0d4c-4767-bd9a-ea7b4768c8ab/model_data, 1 files out of an estimated total of 1


In [27]:
import pickle

with open(best_model_output._path_on_datastore, "rb" ) as f:
    best_model = pickle.load(f)
best_model

PipelineWithYTransformations(Pipeline={'memory': None,
                                       'steps': [('datatransformer',
                                                  DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mn...
), random_state=None, reg_alpha=0.5789473684210527, reg_lambda=0.3157894736842105, subsample=1))], verbose=False))], flatten_transform=None, weights=[0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.2, 0.06666666666666667, 0.13333333333333333, 0.13333333333333333]))],
                                       'verbose': False},
                             y_transformer={},
                             y_transformer_name='LabelEncoder')

In [28]:
best_model.steps

[('datatransformer',
  DataTransformer(
      task='classification',
      is_onnx_compatible=False,
      enable_feature_sweeping=True,
      enable_dnn=False,
      force_text_dnn=False,
      feature_sweeping_timeout=86400,
      featurization_config=None,
      is_cross_validation=True,
      feature_sweeping_config={}
  )),
 ('prefittedsoftvotingclassifier',
  PreFittedSoftVotingClassifier(
      estimators=[('236', Pipeline(
          memory=None,
          steps=[('maxabsscaler', MaxAbsScaler(
              copy=True
          )), ('lightgbmclassifier', LightGBMClassifier(
              boosting_type='gbdt',
              colsample_bytree=0.2977777777777778,
              learning_rate=0.1,
              max_bin=180,
              max_depth=2,
              min_child_weight=3,
              min_data_in_leaf=1e-05,
              min_split_gain=0.5263157894736842,
              n_estimators=600,
              num_leaves=215,
              reg_alpha=1,
              reg_lambda=1,
 

## Test the model

Load Test Data

In [29]:
dataset_test = test_data
df_test = dataset_test.to_pandas_dataframe()
df_test = df_test[pd.notnull(df_test['Churn'])]

y_test = df_test['Churn']
X_test = df_test.drop(['Churn'], axis=1)

Testing Our Best Fitted Model

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
ypred = best_model.predict(X_test)
cm = confusion_matrix(y_test, ypred)
accuracy = accuracy_score(y_test, ypred)
auc = roc_auc_score(y_test, ypred)


In [31]:
# Visualize the confusion matrix
pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)

Unnamed: 0,0,1
0,940,95
1,174,200


In [32]:
print("Accuracy score is: ", accuracy)
print("AUC score is: ", auc)

Accuracy score is:  0.8090844570617459
AUC score is:  0.7214859593376217



## Publish and run from REST endpoint

In [40]:

published_pipeline = pipeline_run.publish_pipeline(
    name="Customer Churn Train", description="Training Customer Churn pipeline", version="1.0")

published_pipeline

Name,Id,Status,Endpoint
Customer Churn Train,9ae34728-365f-4a77-b9dc-d5b12b57e791,Active,REST Endpoint


In [41]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

In [42]:
import requests

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": "pipeline-rest-endpoint"}
                        )

In [43]:
try:
    response.raise_for_status()
except Exception:    
    raise Exception("Received bad response from the endpoint: {}\n"
                    "Response Code: {}\n"
                    "Headers: {}\n"
                    "Content: {}".format(rest_endpoint, response.status_code, response.headers, response.content))

run_id = response.json().get('Id')
print('Submitted pipeline run: ', run_id)

Submitted pipeline run:  ab0b1b46-3654-430b-8d58-f91b266fc5ab


In [44]:

from azureml.pipeline.core.run import PipelineRun
from azureml.widgets import RunDetails

published_pipeline_run = PipelineRun(ws.experiments["pipeline-rest-endpoint"], run_id)
RunDetails(published_pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …