# Data preparation

In [301]:
data_map = {'defect_eclipse_2_0' : "https://raw.githubusercontent.com/yiikou/AML_vs_HPT/master/data/defect_eclipse_2_0.csv",
           'defect_camel_1_2':'https://raw.githubusercontent.com/yiikou/AML_vs_HPT/master/data/defect_camel_1_2.csv',
           'defect_eclipse_3_0':'https://raw.githubusercontent.com/yiikou/AML_vs_HPT/master/data/defect_eclipse_3_0.csv',
           'defect_prop_2':'https://raw.githubusercontent.com/yiikou/AML_vs_HPT/master/data/defect_prop_2.csv',
           'defect_xalan_2_6':'https://raw.githubusercontent.com/yiikou/AML_vs_HPT/master/data/defect_xalan_2_6.csv'}

In [302]:
import os
#dataset_name = "defect_camel_1_2"
#dataset_name = "defect_eclipse_2_0"
#dataset_name = "defect_eclipse_3_0"
#dataset_name = "defect_prop_2"
dataset_name = "defect_xalan_2_6"
dataset_file_name = dataset_name+'.csv'
dataset_path_name = os.path.join(os.getcwd(),'data/'+dataset_name+'.csv' )

In [303]:
dataset_path_name

'/home/local/SAIL/jzhou/AML_vs_HPT/data/defect_xalan_2_6.csv'

In [304]:
import pandas as pd
df = pd.read_csv(dataset_path_name)
x_df = df.drop(columns =['target'])
y_df = df[['target']]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=42)
# flatten y_train to 1d array
#y_train.values.flatten()# flatten y_train to 1d array


# Setting 

In [305]:
import azureml.core
import logging

print("SDK version:", azureml.core.VERSION)

from azureml.telemetry import set_diagnostics_collection
set_diagnostics_collection(send_diagnostics=True)

import os
subscription_id = os.getenv("SUBSCRIPTION_ID", default="52f3cf55-fed4-4f7e-9aca-f3da535a03c1")
resource_group = os.getenv("RESOURCE_GROUP", default="automl_rg")
workspace_name = os.getenv("WORKSPACE_NAME", default="automl_ws")
workspace_region = os.getenv("WORKSPACE_REGION", default="eastus2")


from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

try:
    interactive_auth = InteractiveLoginAuthentication(tenant_id="1591aa04-9c2a-4144-9a04-fb8b0d506de5")
    ws = Workspace(subscription_id = subscription_id, 
               resource_group = resource_group, 
               workspace_name = workspace_name,
               auth=interactive_auth)    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded.")
except:
    print("Workspace not accessible.")

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster1"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print("Found existing cpucluster")
except ComputeTargetException:
    print("Creating new cpucluster")
    
    # Specify the configuration for the new cluster
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                           min_nodes=0,
                                                           max_nodes=4)

    # Create the cluster with the specified name and configuration
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    
    # Wait for the cluster to complete, show the output log
    cpu_cluster.wait_for_completion(show_output=True)

SDK version: 1.0.45
Turning diagnostics collection on. 
Workspace configuration succeeded.
Found existing cpucluster


# Model setting

In [306]:
# should be enum
candidate_algorithem = ['xgboost','randomforest','logistic_regression','fft']

chosen_algo = 'logistic_regression'


estimator_map  = {'xgboost':'xgb_train.py',
                  'randomforest':'rf_train.py',
                  'logistic_regression':'lr_train.py',
                  'fft':'fft_train.py'    
                 }

# Estimator

In [307]:
# from azureml.train.sklearn import SKLearn
# = SKLearn(
from azureml.train.estimator import Estimator
script_params = {
        "--split-random-seeds":42, # seed for splitting data
        "--data-download-url" : data_map[dataset_name], # data download url
        "--dataset-name" :dataset_name,
        '--output_dir': './outputs'
}

estimator = Estimator(source_directory='./estimator',
                   script_params=script_params,
                   compute_target=cpu_cluster,
                   entry_script=estimator_map[chosen_algo],
                   conda_packages=['scikit-learn','pandas','py-xgboost=0.80']
                    )

# Hyperparameter space

In [308]:
from azureml.train.hyperdrive import RandomParameterSampling
from azureml.train.hyperdrive import choice, uniform

# xgboost
xgb_param_sampling  = RandomParameterSampling( 
    {
        "max-depth":choice(range(1,10)),
        "eta":uniform(0.001,6),
        #'learning_rate': uniform(0.01,0.1),
        'subsample': uniform(0.25,1),
        'colsample_bytree': uniform(0.3,0.7),
        'gamma': uniform(0,10),
        'min_child_weight':choice(range(0,20)),
        'n_estimators':choice(range(1,1000))
        
        #nrounds = sample(1:1000, size = len, replace = TRUE),
        #max_depth = sample(1:10, replace = TRUE, size = len),
        #eta = runif(len, min = .001, max = .6),
        #gamma = runif(len, min = 0, max = 10),
        #colsample_bytree = runif(len, min = .3, max = .7),
        #min_child_weight = sample(0:20, size = len, replace = TRUE),
        #subsample = runif(len, min = .25, max = 1)
        
    }
)

# randomforest
rf_param_sampling  = RandomParameterSampling( 
    {
        "mtry":choice(range(1,len(x_train.columns)))
    }
)

# logistic regression
lr_param_sampling  = RandomParameterSampling( 
    {
        "random_state":choice(range(1,100))
    }
)

# fft
fft_param_sampling  = RandomParameterSampling( 
    {
        "alpha":choice(range(3,10))
    }
)
param_sampling_map  = {'xgboost':xgb_param_sampling,
                  'randomforest':rf_param_sampling,
                  'logistic_regression':lr_param_sampling,
                  'fft':fft_param_sampling    
                 }

In [309]:
import pandas as pd

ws = Workspace.from_config()
# choose a name for the run history container in the workspace
experiment_name = 'H_'+dataset_name.split('defect_')[1]+"_"+chosen_algo
# project folder
project_folder = './HyperTuning/'+dataset_name+'/'+chosen_algo

output = {}
#output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
output['Experiment Name'] = experiment_name
pd.set_option('display.max_colwidth', -1)
outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Unnamed: 0,Unnamed: 1
Subscription ID,52f3cf55-fed4-4f7e-9aca-f3da535a03c1
Workspace,automl_ws
Resource Group,automl_rg
Location,canadacentral
Project Directory,./HyperTuning/defect_xalan_2_6/logistic_regression
Experiment Name,H_xalan_2_6_logistic_regression


# Specify primary metric & early termination policy = no

In [310]:
from azureml.train.hyperdrive import PrimaryMetricGoal
primary_metric_name="auc_weighted" #should be exactly match the name of the metric logged by the training script
primary_metric_goal=PrimaryMetricGoal.MAXIMIZE

# Bandit policy
# Truncation selection policy
# Median stopping policy

# No termination policy
early_termination_policy=None


max_total_runs=100
max_concurrent_runs=4

# Configure experiment

from azureml.train.hyperdrive import HyperDriveConfig
hyperdrive_run_config = HyperDriveConfig(estimator=estimator,
                          hyperparameter_sampling=param_sampling_map[chosen_algo], 
                          policy=early_termination_policy, # which is None
                          primary_metric_name=primary_metric_name, 
                          primary_metric_goal=primary_metric_goal,
                          max_total_runs=max_total_runs,
                          max_concurrent_runs=max_concurrent_runs)

# Submit experiment or

In [311]:
from azureml.core.experiment import Experiment
experiment = Experiment(ws, experiment_name)
hyperdrive_run = experiment.submit(hyperdrive_run_config,tags={'Algorithm':chosen_algo})

In [316]:
hyperdrive_run 

Experiment,Id,Type,Status,Details Page,Docs Page
H_eclipse_2_0_xgboost,H_eclipse_2_0_xgboost_1561759477871,hyperdrive,Completed,Link to Azure Portal,Link to Documentation


# Fetch a specific run

In [326]:
from azureml.train.hyperdrive import HyperDriveRun
from azureml.core.experiment import Experiment
experiment_name='H_camel_1_2_randomforest'
experiment=Experiment(ws, experiment_name)
hyperdrive_run = HyperDriveRun(experiment = experiment, run_id ="H_camel_1_2_randomforest_1561751966406")


In [327]:
from azureml.widgets import RunDetails
#print(hyperdrive_run.get_details())
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO',…

In [318]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print('Algorithm: '+chosen_algo)
print(best_run_metrics['hyper_param'])
print(best_run_metrics['auc_weighted'])

Algorithm: logistic_regression
{'max_depth': 5, 'eta': 5.30715931194066, 'silent': 1, 'subsample': 0.830657629802624, 'colsample_bytree': 0.584971596458506, 'gamma': 6.09027908137873, 'min_child_weight': 2, 'n_estimators': 246}
0.8442904180828843


# Register model to workspace

In [None]:
modelname = 'HYPER_'+dataset_name.split('defect_')[1]+'_'+chosen_algo
model = best_run.register_model(model_name =modelname, model_path = 'outputs/model.pkl')
print(model.name)

In [None]:
from azureml.core import Workspace
from azureml.core.model import Model
ws = Workspace.from_config()
model=Model(ws, modelname)
model_path=model.download(target_dir=os.path.join(os.getcwd(),'hyper_model/'+dataset_name.split('defect_')[1]+'/'+chosen_algo), exist_ok=True)
import pickle
import xgboost
b_model = pickle.load(open(model_path, 'rb'))

# Evaluate performance (AUC_weighted) on test dataset using Bootstrap approach

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(b_model).fit(x_test, y_test.values)
eli5.show_weights(perm)

In [None]:
from sklearn.metrics import auc,accuracy_score,roc_auc_score,auc,roc_curve
from sklearn.utils import resample
sample_size = int(y_test.shape[0]*0.1)
boot_size = 100
auc_weighted_list=[]
for ite in range(1,boot_size):
    resample_x, resampel_y = resample(x_test,y_test
                                      #, n_samples=sample_size
                                      , replace=True, 
          random_state=ite)
    predicted_y = b_model.predict_proba(resample_x)
    auc_weighted=roc_auc_score(resampel_y,predicted_y[:,1],average='weighted')
    if auc_weighted < 0.5: 
        auc_weighted = 1-auc_weighted
    auc_weighted_list.append(auc_weighted)
    #print(auc_weighted)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pylab import *

plt.rcParams["figure.figsize"] = (5, 3)
fig1, ax1 = plt.subplots()
ax1.set_title('HT: distribution of XGBoost performance on Eclipse_2.0')
bp_dict = ax1.boxplot(auc_weighted_list,vert=False)
for line in bp_dict['medians']:
    # get position data for median line
    x, y = line.get_xydata()[1] # top of median line
    # overlay median value
    text(x, y, round(x,4),
         horizontalalignment='center') # draw above, centered