In [1]:
import azureml.core

# Show the SDK version this notebook was authored with, followed by the
# version installed in the current environment.
print("This notebook was created using version 1.0.23 of the Azure ML SDK")
print(
    "You are currently using version",
    azureml.core.VERSION,
    "of the Azure ML SDK",
)

This notebook was created using version 1.0.23 of the Azure ML SDK
You are currently using version 1.0.45 of the Azure ML SDK


In [2]:
import os

# Azure workspace coordinates. Each value is taken from the environment when
# the variable is set; otherwise the fallback literal below is used.
# NOTE(review): the fallback subscription ID is committed with the notebook —
# consider requiring SUBSCRIPTION_ID to be set instead.
subscription_id = os.environ.get("SUBSCRIPTION_ID", "46fcf3a4-0d3b-478f-ac67-9cd2d0a69056")
resource_group = os.environ.get("RESOURCE_GROUP", "automl_group")
workspace_name = os.environ.get("WORKSPACE_NAME", "automl_ws")
workspace_region = os.environ.get("WORKSPACE_REGION", "eastus2")


In [3]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Authenticate interactively against a fixed tenant, connect to the workspace
# identified by subscription_id / resource_group / workspace_name, and persist
# its details so that Workspace.from_config() can be used afterwards.
try:
    interactive_auth = InteractiveLoginAuthentication(tenant_id="104d804d-6298-4cb6-8de3-22ddc7db5aa0")
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name,
                   auth=interactive_auth)
    # Write the details of the workspace to a configuration file in the notebook library.
    ws.write_config()
    print("Workspace configuration succeeded.")
except Exception as exc:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the cause
    # rather than hiding it. The cell stays best-effort (it does not re-raise).
    print("Workspace not accessible.")
    print("Reason:", repr(exc))

Workspace configuration succeeded.


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Reuse the compute target if the workspace already has one with this name;
# ComputeTarget(...) raises ComputeTargetException otherwise, in which case a
# new cluster is provisioned.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    # Messages are built from cpu_cluster_name so they stay correct if the
    # name above is changed.
    print("Found existing " + cpu_cluster_name)
except ComputeTargetException:
    print("Creating new " + cpu_cluster_name)

    # Specify the configuration for the new cluster
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                           min_nodes=0,
                                                           max_nodes=4)

    # Create the cluster with the specified name and configuration
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

    # Wait for the cluster to complete, show the output log
    cpu_cluster.wait_for_completion(show_output=True)

Found existing cpucluster


# Load data


In [5]:
import os

# Dataset to analyse. Alternatives kept for quick switching:
#   "defect_camel_1_2", "defect_eclipse_3_0", "defect_prop_2", "defect_xalan_2_6"
dataset_name = "defect_eclipse_2_0"

# CSV file name of the dataset and its full path under the current working directory.
dataset_file_name = "{}.csv".format(dataset_name)
dataset_path_name = os.path.join(os.getcwd(), dataset_file_name)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the selected dataset and separate the 'target' column from the features.
df = pd.read_csv(dataset_file_name)
x_df = df.drop('target', axis=1)
y_df = df.loc[:, ['target']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    random_state=42,
)

# Display the training labels flattened to a 1-D array.
y_train.values.flatten()


array([False, False, False, ..., False, False, False])


# AutoML start

In [7]:
import azureml.core
import pandas as pd
from azureml.core.workspace import Workspace
import logging
import os

In [8]:
# Load the workspace from its saved configuration.
ws = Workspace.from_config()

# Name of the run history container in the workspace.
experiment_name = 'Automl_' + dataset_name
# Project folder
project_folder = './Automl/' + dataset_name

# Collect the environment details shown in the summary table below.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
}

# NOTE(review): newer pandas releases deprecate -1 here in favour of None —
# check the pinned pandas version before changing it.
pd.set_option('display.max_colwidth', -1)
outputDf = pd.DataFrame(data=output, index=[''])
outputDf.T

Unnamed: 0,Unnamed: 1
SDK version,1.0.45
Subscription ID,46fcf3a4-0d3b-478f-ac67-9cd2d0a69056
Workspace,automl_ws
Resource Group,automl_group
Location,canadacentral
Project Directory,./Automl/defect_eclipse_2_0


In [9]:
# Keyword settings that are unpacked into AutoMLConfig(**automl_settings).
# NOTE(review): n_cross_validations=100 together with validation_size=0.67 is
# an unusual combination — confirm it is the intended validation scheme.
automl_settings = dict(
    iteration_timeout_minutes=60,
    iterations=100,
    primary_metric='AUC_weighted',
    preprocess=True,
    verbosity=logging.INFO,
    n_cross_validations=100,
    validation_size=0.67,
    enable_voting_ensemble=False,
    enable_stack_ensemble=False,
    model_explainability=True,
)

In [10]:
from azureml.train.automl import AutoMLConfig

# local compute
# Classification AutoML configuration built from the training split and the
# shared automl_settings dictionary.
debug_log_name = 'automated_ml_errors' + dataset_name + '.log'
train_labels = y_train.values.flatten()  # labels as a flat 1-D array

automated_ml_config = AutoMLConfig(
    task='classification',
    debug_log=debug_log_name,
    path=project_folder,
    X=x_train,
    y=train_labels,
    **automl_settings
)

In [11]:
from azureml.core.experiment import Experiment
# Create (or attach to) the experiment and submit the AutoML configuration,
# streaming progress into the cell output.
# The result is kept in `remote_run` even though the configuration above is
# commented as local compute.
experiment = Experiment(workspace=ws, name=experiment_name)
remote_run = experiment.submit(automated_ml_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_09ee3dee-6116-4c5b-bf1a-e2e4096ea5bf
Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.


KeyboardInterrupt: 

In [None]:
# Display the run object as this cell's output.
remote_run 

In [192]:
from azureml.widgets import RunDetails

# Show the run-details widget for the current AutoML run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'NOTSET', …

In [13]:
from azureml.widgets import RunDetails
from azureml.train.automl.run import AutoMLRun
from azureml.core.experiment import Experiment
# Re-attach to a previously submitted AutoML run by its ID so its results can
# be inspected here.
experiment = Experiment(workspace=ws, name=experiment_name)

previous_run_id = "AutoML_7a713f19-2b48-4ed3-a4e1-e523f51cecc8"
remote_run = AutoMLRun(experiment=experiment, run_id=previous_run_id)

RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

# Fetch best model

In [14]:
# Retrieve the run and fitted model returned by the AutoML run's get_output().
# To inspect a specific iteration instead:
#   iteration = 15
#   b_run, b_model = remote_run.get_output(iteration=iteration)
b_run, b_model = remote_run.get_output()

# Report which algorithm the selected run used, then the run and the model.
best_algorithm = b_run.get_properties()['run_algorithm']
print('Algorithm: ' + best_algorithm)
print(b_run)
print(b_model)

Package:azureml-core, training version:1.0.43.1, current version:1.0.45
Package:azureml-explain-model, training version:1.0.43, current version:1.0.45
Package:azureml-pipeline, training version:1.0.43, current version:1.0.45
Package:azureml-pipeline-core, training version:1.0.43, current version:1.0.45
Package:azureml-pipeline-steps, training version:1.0.43, current version:1.0.45
Package:azureml-sdk, training version:1.0.43, current version:1.0.45
Package:azureml-telemetry, training version:1.0.43.1, current version:1.0.45
Package:azureml-train, training version:1.0.43, current version:1.0.45
Package:azureml-train-automl, training version:1.0.43.1, current version:1.0.45.1
Package:azureml-train-core, training version:1.0.43, current version:1.0.45
Package:azureml-train-restclients-hyperdrive, training version:1.0.43, current version:1.0.45
Package:azureml-widgets, training version:1.0.43.1, current version:1.0.45.1

Try running `pip install --upgrade azureml-sdk[automl]==1.0.43.1`




Algorithm: ExtremeRandomTrees
Run(Experiment: Automl_defect_eclipse_2_0,
Id: AutoML_7a713f19-2b48-4ed3-a4e1-e523f51cecc8_28,
Type: None,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_feature_sweeping=None, feature_sweeping_timeout=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('MaxAbsScaler', MaxAbsScaler(copy=True)), ('ExtraTreesClassifier', ExtraTreesClassifier(bootstrap=False, class_weight=Non...imators=600, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])
Y_transformer(['LabelEncoder', LabelEncoder()])


# Register model

In [None]:
# Model name = 'AML' + the part of the dataset name after 'defect_' + '_' +
# the algorithm of the selected run.
dataset_suffix = dataset_name.split('defect_')[1]
run_algorithm = b_run.get_properties()['run_algorithm']
modelname = 'AML' + dataset_suffix + '_' + run_algorithm

# Register the run's pickled model under that name.
b_run.register_model(model_name=modelname, model_path='outputs/model.pkl')

In [167]:
import os
import pickle

from azureml.core import Workspace
from azureml.core.model import Model

ws = Workspace.from_config()

# Look up the registered model by name. It gets its own variable so that
# `b_model` always refers to the fitted pipeline; the original cell bound the
# lookup to `b_model` and then called `.download` on an undefined `model`.
registered_model = Model(ws, modelname)

# Download into ./aml_model/<dataset suffix>/<algorithm>/ under the working directory.
download_dir = os.path.join(
    os.getcwd(),
    'aml_model',
    dataset_name.split('defect_')[1],
    b_run.get_properties()['run_algorithm'],
)
model_path = registered_model.download(target_dir=download_dir, exist_ok=True)

# NOTE(review): pickle.load can execute arbitrary code — only load model files
# that come from a trusted workspace.
# The context manager closes the file handle once the model is loaded.
with open(model_path, 'rb') as model_file:
    b_model = pickle.load(model_file)

# Evaluate the best model on test dataset

In [16]:
import eli5
from eli5.sklearn import PermutationImportance

# Fit permutation importance for the fitted model on the held-out test split,
# then render the resulting weights table.
perm = PermutationImportance(b_model)
perm.fit(x_test, y_test.values)
eli5.show_weights(perm)

Weight,Feature
0.0129  ± 0.0060,x0
0.0062  ± 0.0038,x31
0.0058  ± 0.0017,x9
0.0043  ± 0.0027,x10
0.0042  ± 0.0026,x28
0.0039  ± 0.0011,x26
0.0039  ± 0.0024,x7
0.0037  ± 0.0047,x4
0.0027  ± 0.0031,x23
0.0027  ± 0.0007,x24


In [None]:
from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve
from sklearn.utils import resample

# Bootstrap the test set: draw resamples with replacement, score each one with
# the fitted model, and collect the AUC values in auc_weighted_list.
# sample_size is only needed for the optional sub-sampling variant, i.e. when
# passing n_samples=sample_size to resample().
sample_size = int(y_test.shape[0] * 0.1)
boot_size = 100  # number of bootstrap resamples
auc_weighted_list = []

# Seeds run from 1 to boot_size inclusive so that exactly boot_size resamples
# are drawn (range(1, boot_size) produced one fewer).
for seed in range(1, boot_size + 1):
    resample_x, resample_y = resample(x_test, y_test,
                                      replace=True,
                                      random_state=seed)
    # Column 1 of predict_proba is used as the score passed to roc_auc_score.
    predicted_y = b_model.predict_proba(resample_x)
    auc_weighted = roc_auc_score(resample_y, predicted_y[:, 1], average='weighted')
    # NOTE(review): an AUC below 0.5 is mirrored to 1 - AUC, which biases the
    # distribution upwards — confirm this is intended.
    if auc_weighted < 0.5:
        auc_weighted = 1 - auc_weighted
    auc_weighted_list.append(auc_weighted)
    
    
import numpy as np
import matplotlib.pyplot as plt

# Horizontal box plot of the bootstrapped AUC values, annotated with the median.
plt.rcParams["figure.figsize"] = (5, 3)
fig1, ax1 = plt.subplots()

# Build the title from the selected run's algorithm and the dataset name so
# the label cannot drift from the model that was actually evaluated.
ax1.set_title('AML: distribution of '
              + b_run.get_properties()['run_algorithm']
              + ' performance on '
              + dataset_name.split('defect_')[1])
ax1.set_xlabel('AUC_weighted')

bp_dict = ax1.boxplot(auc_weighted_list, vert=False)
for line in bp_dict['medians']:
    # get position data for median line
    x, y = line.get_xydata()[1]  # top of median line
    # overlay the median value, horizontally centred on the median line;
    # drawn through the axes object, so no star-import from pylab is needed
    ax1.text(x, y, round(x, 4),
             horizontalalignment='center')
