In [1]:
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.core import Datastore, Experiment

from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput, PipelineRun
import azureml.train.automl

from azureml.core.model import Model 
from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import AutoMLStep

from azureml.widgets import RunDetails

In [2]:
import pandas as pd
import gc
import warnings
import importlib
import json

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the SDK — confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
!pip install azureml-train-automl-runtime



## Identify Azure ML Workspace by configuration

In [4]:
# Initiate default workspace from the local Azure ML configuration
ws = Workspace.from_config()

# Default datastore of the workspace; used below as the datastore for every PipelineData output
def_blob_store = ws.get_default_datastore()

## Select Compute Target

In [5]:
# Name of the compute cluster that is looked up (or created) in the next cell
cpu_cluster_name = "ml-dev-clus"

In [6]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Reuse the named cluster when the workspace already has it; otherwise provision it.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Lookup failed: create a cluster of up to 6 nodes (minimum 0 nodes)
    # and block until provisioning has completed.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D12_V2',
        min_nodes=0,
        max_nodes=6,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    cpu_cluster.wait_for_completion(show_output=True)
else:
    print('Found an existing cluster, using it instead.')

Found an existing cluster, using it instead.


## Create Running Environment - Docker

In [7]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE, DockerConfiguration

# Run configuration shared by all PythonScriptStep definitions below
run_config = RunConfiguration()

# Execute the step scripts inside a Docker container
docker_config = DockerConfiguration(use_docker=True)
run_config.docker = docker_config

# Base the container on the default CPU image
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: the Python environment is built
# from the conda/pip package lists declared below
run_config.environment.python.user_managed_dependencies = False

# Packages available to the step scripts
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas', 'scikit-learn', 'numpy'],
    pip_packages=['azureml-sdk[automl]'],
)


## Initiate Input Data Channel
- train_transaction
- train_identity

In [8]:
# Look up the two datasets registered in the workspace by name;
# they are the named inputs of the first pipeline step (clean_data.py).
train_transaction = Dataset.get_by_name(ws, name='CIS Fraud Detection_train_transaction')
train_identity = Dataset.get_by_name(ws, name = 'CIS Fraud Detection_train_identity')

## Pipeline
### Pipeline Step 1 : Clean Data

In [9]:
source_directory = "./data_prep"

# Intermediate output that hands the result of this step to the next one
cleaned_data = PipelineData("cleaned_data", datastore=def_blob_store).as_dataset()

# Step 1: run clean_data.py on the cluster with both registered datasets as
# named inputs; the output location is passed to the script via --output_combine.
CleanStep = PythonScriptStep(
    script_name="clean_data.py",
    source_directory=source_directory,
    arguments=["--output_combine", cleaned_data],
    inputs=[
        train_transaction.as_named_input('input_transaction'),
        train_identity.as_named_input('input_identity'),
    ],
    outputs=[cleaned_data],
    compute_target=cpu_cluster,
    runconfig=run_config,
    allow_reuse=True,
)

### Pipeline Step 2 : Select Columns

In [10]:
source_directory = "./data_prep"

# Intermediate output that hands the result of this step to feature engineering
selected_data = PipelineData("selected_data", datastore=def_blob_store).as_dataset()

# Step 2: run select_col.py on the output of the cleaning step (parsed as
# parquet files); the output location is passed via --output_selected.
SelectedStep = PythonScriptStep(
    script_name="select_col.py",
    source_directory=source_directory,
    arguments=["--output_selected", selected_data],
    inputs=[cleaned_data.parse_parquet_files()],
    outputs=[selected_data],
    compute_target=cpu_cluster,
    runconfig=run_config,
    allow_reuse=True,
)

### Pipeline Step 3 : Feature Engineering

In [11]:
source_directory = "./data_prep"

# Final data-prep output; this is what the AutoML configuration trains on
train_data = PipelineData("train_data", datastore=def_blob_store).as_dataset()

# Step 3: run feature_engineering.py on the output of the column-selection step
# (parsed as parquet files); the output location is passed via --output_train_data.
FeatureEngineeringStep = PythonScriptStep(
    script_name="feature_engineering.py",
    source_directory=source_directory,
    arguments=["--output_train_data", train_data],
    inputs=[selected_data.parse_parquet_files()],
    outputs=[train_data],
    compute_target=cpu_cluster,
    runconfig=run_config,
    allow_reuse=True,
)

## Create Experiment Environment

In [12]:
# Experiment under which the pipeline is submitted and its runs are later looked up.
# Note: despite the name, this variable is the Experiment object, not a Pipeline.
automl_fraud_pipeline = Experiment(ws, 'fraud_detection_automl_pipeline')

## AutoML Configurations
- Classification Model

In [14]:
from azureml.train.automl import AutoMLConfig

model_folder = "./train_model"

# Search budget and evaluation settings for AutoML.
# Note: the per-iteration timeout (120 min) equals the whole experiment budget (2 h).
automl_settings = {
    "iteration_timeout_minutes" :120,
    "experiment_timeout_hours" : 2,
    "iterations" : 10,
    "max_concurrent_iterations" : 4,
    "primary_metric" : "AUC_weighted",
    "n_cross_validations" : 4
}

# Output of the feature-engineering step, parsed as parquet files
train_dataset = train_data.parse_parquet_files()

# `allow_reuse` is intentionally not passed here: it is not an AutoMLConfig
# parameter. Step-level reuse is configured on the AutoMLStep that wraps this config.
automl_config = AutoMLConfig(task = "classification",
                             debug_log = 'automated_ml_errors.log',
                             path = model_folder,
                             compute_target = cpu_cluster,
                             featurization = 'off',
                             training_data = train_dataset,   ## Input from previous pipeline
                             label_column_name = 'isFraud',   ## Target prediction column name
                             **automl_settings)

## Create output of AutoML
- Metrics data
- Model data

In [15]:
# Names under which the AutoML outputs are exposed on the pipeline run
automl_metrics_output_name = 'metrics_output'
automl_best_model_outputname = 'best_model_output'

# Metrics output of the AutoML step
metrics_data = PipelineData(
    name="metrics_data",
    datastore=def_blob_store,
    pipeline_output_name=automl_metrics_output_name,
    training_output=TrainingOutput(type="Metrics"),
)

# Model output of the AutoML step
model_data = PipelineData(
    name="model_data",
    datastore=def_blob_store,
    pipeline_output_name=automl_best_model_outputname,
    training_output=TrainingOutput(type="Model"),
)

In [16]:
# Pipeline step that runs the AutoML configuration and produces the
# metrics and model outputs declared above.
fraud_automl = AutoMLStep(
    name="AutoML_FraudDetect",
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)

## Create Pipeline Object

In [17]:
# Only the AutoML step is listed explicitly; the submission log below shows the three
# data-prep steps being created as well, since the AutoML step consumes their outputs.
pipeline_steps = [fraud_automl]
pipeline_automl = Pipeline(workspace = ws, steps = pipeline_steps)

# Submit the pipeline to the experiment without forcing outputs to be regenerated
# (the log reports the data-prep steps as eligible to reuse a previous run's output).
automl_pipeline_run = automl_fraud_pipeline.submit(pipeline_automl, regenerate_outputs = False)



Created step AutoML_FraudDetect [909a7124][dcb32f70-3660-4740-9dfa-6eb2634f8d69], (This step will run and generate new outputs)Created step feature_engineering.py [a76dab00][47b064f3-383e-4673-abbc-8c639a2f6ccd], (This step is eligible to reuse a previous run's output)

Created step select_col.py [ed45dcdd][b3fdb1e7-9b65-4484-9f0b-4acb5e980ee2], (This step is eligible to reuse a previous run's output)
Created step clean_data.py [c4648581][f98cf3d5-ac90-495d-bb39-f8b9b3803820], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun b4554fa5-9c08-47cd-8eff-cf51b9ef9af3
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/b4554fa5-9c08-47cd-8eff-cf51b9ef9af3?wsid=/subscriptions/2186b060-2874-42c6-b0f7-0335ccdedb37/resourcegroups/azure-ml-eng-dev/workspaces/azureml-eng-dev-generic&tid=271d5e7b-1350-4b96-ab84-52dbda4cf40c


## RunDetails

In [18]:
# Show the run-monitoring widget for the submitted pipeline run
RunDetails(automl_pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [13]:
# Re-attach to the most recent pipeline run of the experiment (e.g. after a kernel restart).
# Experiment.get_runs() yields runs in reverse chronological order, so the first item is
# the latest run; iterating to the end would select the oldest run instead.
pipeline_run = next(iter(automl_fraud_pipeline.get_runs(include_children=False)), None)
if pipeline_run is None:
    raise LookupError(
        f"Experiment '{automl_fraud_pipeline.name}' has no runs; submit the pipeline first."
    )
pipeline_run_id = pipeline_run.id

automl_pipeline_run = PipelineRun(automl_fraud_pipeline, pipeline_run_id)

# Look up the AutoML step run by its step name and fail loudly if it is missing,
# instead of leaving automl_pipeline_id undefined (or stale from an earlier execution).
automl_step_runs = automl_pipeline_run.find_step_run('AutoML_FraudDetect')
if not automl_step_runs:
    raise LookupError(
        f"Pipeline run {pipeline_run_id} has no step run named 'AutoML_FraudDetect'."
    )
automl_pipeline_id = automl_step_runs[-1].id

In [15]:
from azureml.train.automl.run import AutoMLRun

# Wrap the AutoML step run (id taken from automl_pipeline_id) as an AutoMLRun
# so AutoML-specific methods such as get_output() can be called on it.
automl_run = AutoMLRun(automl_fraud_pipeline, automl_pipeline_id)

## Examine Results

In [19]:
from azureml.train.automl.run import AutoMLRun

# Locate the AutoML step run by its step name instead of relying on the order in
# which get_steps() returns the steps, and fail loudly when it cannot be found
# rather than leaving automl_step_run_id undefined.
automl_step_runs = automl_pipeline_run.find_step_run('AutoML_FraudDetect')
if not automl_step_runs:
    raise LookupError("The pipeline run has no step run named 'AutoML_FraudDetect'.")

step = automl_step_runs[0]
automl_step_run_id = step.id
print(step.name)
print(automl_step_run_id)
print('')

# Wrap the step run as an AutoMLRun and show its monitoring widget
automl_run = AutoMLRun(experiment = automl_fraud_pipeline, run_id=automl_step_run_id)
RunDetails(automl_run).show()

AutoML_FraudDetect
9dc9aa76-9c6a-4148-bb63-f290607c9ad0



_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [None]:
# Retrieve best model from Pipeline Run
best_model_output = automl_pipeline_run.get_pipeline_output(automl_best_model_outputname)
num_file_downloaded = best_model_output.download('.', show_progress=True)

In [21]:
# Display the pipeline output reference (name, datastore, path, producing run)
best_model_output

Name,Datastore,Path on Datastore,Produced By PipelineRun
best_model_output,workspaceblobstore,azureml/9dc9aa76-9c6a-4148-bb63-f290607c9ad0/model_data,b4554fa5-9c08-47cd-8eff-cf51b9ef9af3


In [23]:
# Properties of best run model
run_properties = json.loads(best_run.get_details()['properties']['pipeline_script'])
print(json.dumps(run_properties, indent = 1))

{
 "pipeline_id": "__AutoML_Stack_Ensemble__",
 "objects": [
  {
   "module": "azureml.train.automl.stack_ensemble",
   "class_name": "StackEnsemble",
   "spec_class": "sklearn",
   "param_args": [],
   "param_kwargs": {
    "automl_settings": "{'task_type':'classification','primary_metric':'AUC_weighted','verbosity':20,'ensemble_iterations':10,'is_timeseries':False,'name':'placeholder','compute_target':'ml-dev-clus','subscription_id':'2186b060-2874-42c6-b0f7-0335ccdedb37','region':'southeastasia','spark_service':None}",
    "ensemble_run_id": "9dc9aa76-9c6a-4148-bb63-f290607c9ad0_9",
    "experiment_name": "fraud_detection_automl_pipeline",
    "workspace_name": "azureml-eng-dev-generic",
    "subscription_id": "2186b060-2874-42c6-b0f7-0335ccdedb37",
    "resource_group_name": "azure-ml-eng-dev"
   }
  }
 ]
}


###

## Retrieve Best Model Performance

In [20]:
# get_output() returns the best child run together with its fitted model
best_run, automl_mdl = automl_run.get_output()

# Print both, separated by an empty line
print(best_run, '', automl_mdl, sep='\n')



Run(Experiment: fraud_detection_automl_pipeline,
Id: 9dc9aa76-9c6a-4148-bb63-f290607c9ad0_9,
Type: azureml.scriptrun,
Status: Completed)

Pipeline(memory=None,
         steps=[('stackensembleclassifier',
                 StackEnsembleClassifier(base_learners=[('0', Pipeline(memory=None, steps=[('MaxAbsScaler', MaxAbsScaler(copy=True)), ('LightGBMClassifier', LightGBMClassifier(min_data_in_leaf=20, n_jobs=1, problem_info=ProblemInfo(
    gpu_training_param_dict={'processing_unit_type': 'cpu'}
), random_state=None))], verbose=Fa...
), random_state=0, tree_method='auto'))], verbose=False))], meta_learner=LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False, fit_intercept=True, intercept_scaling=1.0, l1_ratios=None, max_iter=100, multi_class='auto', n_jobs=None, penalty='l2', random_state=None, refit=True, scoring=Scorer(
    metric='AUC_weighted'
), solver='lbfgs', tol=0.0001, verbose=0), training_cv_folds=5))],
         verbose=False)


In [19]:
# Block until the AutoML run has finished, printing its output while waiting
automl_run.wait_for_completion(show_output= True)

Experiment,Id,Type,Status,Details Page,Docs Page
fraud_detection_automl_pipeline,9dc9aa76-9c6a-4148-bb63-f290607c9ad0,azureml.StepRun,Completed,Link to Azure Machine Learning studio,Link to Documentation




****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       ALERTED
DESCRIPTION:  The training data contains missing values but no imputation is done. If missing values are not expected, cancel the current run to review your data for data quality issues or set the featurization setting for AutoML to handle the imputation.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy becaus



{'runId': '9dc9aa76-9c6a-4148-bb63-f290607c9ad0',
 'target': 'ml-dev-clus',
 'status': 'Completed',
 'startTimeUtc': '2021-10-24T03:32:27.009645Z',
 'endTimeUtc': '2021-10-24T04:48:42.474561Z',
 'services': {},
 'properties': {'ContentSnapshotId': 'cd85412c-fbcd-4689-911a-226351b5be4f',
  'StepType': 'AutoMLStep',
  'azureml.moduleid': 'dcb32f70-3660-4740-9dfa-6eb2634f8d69',
  'azureml.runsource': 'azureml.StepRun',
  'azureml.nodeid': '909a7124',
  'azureml.pipelinerunid': 'b4554fa5-9c08-47cd-8eff-cf51b9ef9af3',
  'azureml.pipeline': 'b4554fa5-9c08-47cd-8eff-cf51b9ef9af3',
  'azureml.pipelineComponent': 'masterautomlcloud',
  'num_iterations': '10',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'metrics': 'accuracy',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'MaxTimeSeconds': '7200',
  'acquisition_parameter': '0',
  'num_cross_validation': '4',
  'target': 'ml-dev-clus',
  'RawAMLSettingsString': None,
  'DataPrepJsonString': '{\\"training_data\\

### Download scoring script (score.py) from model

In [21]:
import os

# Local path of the scoring script; also used as the entry script of the deployment
script_file_name = "auto_ml_inference/score.py"

# Make sure the target folder exists, then download the scoring file generated by the
# best run to that single path (previously the path was hard-coded a second time).
os.makedirs(os.path.dirname(script_file_name), exist_ok=True)
best_run.download_file("outputs/scoring_file_v_1_0_0.py", script_file_name)

In [23]:
# Registration metadata for the model produced by the AutoML run
model_name = "AutomlFraudDetection"
description = "AutoML Model trained on IEEE Fraud Detection Dataset"
tags = None

# Register the model from the AutoML run in the workspace
model = automl_run.register_model(model_name=model_name, description=description, tags=tags)

# Show the model id recorded on the run
print(automl_run.model_id)



AutomlFraudDetection


In [24]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

# Scoring entry point: the script stored at script_file_name
inference_config = InferenceConfig(entry_script=script_file_name)

# Azure Container Instances deployment: 2 CPU cores, 2 GB memory, authentication enabled
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=2,
    auth_enabled=True,
    description="Deployed Automl for Fraud Detection",
    tags={"area": "IEEE_CIS_FraudData", "type": "automl_classification"},
)

# The service is named after the lower-cased model name
aci_service_name = model_name.lower()
print(aci_service_name)

# Deploy the registered model, wait for the deployment to finish, then report its state
aci_service = Model.deploy(
    workspace=ws,
    name=aci_service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
aci_service.wait_for_deployment(show_output=True)
print(aci_service.state)

automlfrauddetection
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-10-28 12:35:17+00:00 Creating Container Registry if not exists.
2021-10-28 12:35:18+00:00 Use the existing image.
2021-10-28 12:35:19+00:00 Generating deployment configuration.
2021-10-28 12:35:20+00:00 Submitting deployment to compute.
2021-10-28 12:35:26+00:00 Checking the status of deployment automlfrauddetection..
2021-10-28 12:37:39+00:00 Checking the status of inference endpoint automlfrauddetection.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


## Testing Endpoints

### Enable Application Insights on Endpoints

In [13]:
from azureml.core.webservice import Webservice

# Re-attach to the already deployed web service by its name
aci_service= Webservice(ws, "automlfrauddetection")

In [29]:
# Turn on Application Insights for the deployed service
aci_service.update(enable_app_insights = True)

### Clean up Web Service

In [14]:
# Delete the web service once testing is finished
aci_service.delete()

In [16]:
# Display the service state after the delete request (shown as 'Deleting' in the output below)
aci_service.state

'Deleting'