In [1]:
import azureml.core

# Report the SDK version this notebook was authored with, followed by the
# version installed in the current kernel, so mismatches are easy to spot.
print("This notebook was created using version 1.0.23 of the Azure ML SDK")
print("You are currently using version {} of the Azure ML SDK".format(azureml.core.VERSION))

This notebook was created using version 1.0.23 of the Azure ML SDK
You are currently using version 1.0.39 of the Azure ML SDK


In [2]:
import os

# Workspace coordinates. Each value is read from the environment when the
# variable is set; otherwise the literal fallback below is used.
# NOTE(review): the fallback subscription ID is committed in the notebook —
# confirm that is acceptable before sharing.
subscription_id = os.environ.get("SUBSCRIPTION_ID", "942813ed-26a2-4790-9097-d4be7560fce0")
resource_group = os.environ.get("RESOURCE_GROUP", "ICSE-tutorial")
workspace_name = os.environ.get("WORKSPACE_NAME", "workspace_fresher")
workspace_region = os.environ.get("WORKSPACE_REGION", "eastus2")

In [3]:
from azureml.core import Workspace

# Connect to the workspace described by the settings cell and persist its
# details to a config file, which a later cell reloads via Workspace.from_config().
try:
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded.")
except Exception as exc:
    # Catch Exception rather than using a bare except so that kernel
    # interrupts (KeyboardInterrupt/SystemExit) still propagate, and report
    # the cause instead of hiding it. The cell stays best-effort: it does not re-raise.
    print("Workspace not accessible.")
    print("Reason:", repr(exc))

Workspace configuration succeeded.


### Create compute resources for your training experiments

Many of the sample notebooks use Azure ML managed compute (AmlCompute) to train models using a dynamically scalable pool of compute. In this section you will create default compute clusters for use by the other notebooks and any other operations you choose.

To create a cluster, you need to specify a compute configuration that specifies the type of machine to be used and the scalability behaviors.  Then you choose a name for the cluster that is unique within the workspace that can be used to address the cluster later.

The cluster parameters are:
* vm_size - this describes the virtual machine type and size used in the cluster.  All machines in the cluster are the same type.  You can get the list of vm sizes available in your region by using the CLI command

```shell
az vm list-skus -o tsv
```
* min_nodes - this sets the minimum size of the cluster.  If you set the minimum to 0 the cluster will shut down all nodes while not in use.  Setting this number to a value higher than 0 will allow for faster start-up times, but you will also be billed when the cluster is not in use.
* max_nodes - this sets the maximum size of the cluster.  Setting this to a larger number allows for more concurrency and a greater distributed processing of scale-out jobs.


To create a **CPU** cluster now, run the cell below. The autoscale settings mean that the cluster will scale down to 0 nodes when inactive and up to 4 nodes when busy.

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Reuse the cluster when it already exists in the workspace; otherwise create it.
# The status messages are built from cpu_cluster_name so they stay accurate
# if the name above is changed.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print("Found existing {}".format(cpu_cluster_name))
except ComputeTargetException:
    print("Creating new {}".format(cpu_cluster_name))

    # Specify the configuration for the new cluster
    compute_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                           min_nodes=0,
                                                           max_nodes=4)

    # Create the cluster with the specified name and configuration
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

    # Wait for the cluster to complete, show the output log
    cpu_cluster.wait_for_completion(show_output=True)

Found existing cpucluster


In [5]:
import azureml.dataprep as dprep

# Load data
Load the Eclipse 2.0 defect data set from a local CSV file (`defect_eclipse_2_0.csv`, expected in the current working directory) into a dataflow object. The file is read with `read_csv()`, and its headers are promoted using `PromoteHeadersMode.GROUPED`.

In [6]:
from IPython.display import display
import os
# Read the defect CSV that sits in the current working directory into a dataflow.
defect_csv_path = os.path.join(os.getcwd(), "defect_eclipse_2_0.csv")
eclipse_2_0_raw = dprep.read_csv(
    path=defect_csv_path,
    header=dprep.PromoteHeadersMode.GROUPED,
)

# Preview the first five records.
display(eclipse_2_0_raw.head(5))


Unnamed: 0,pre,ACD,FOUT_avg,FOUT_max,FOUT_sum,MLOC_avg,MLOC_max,MLOC_sum,NBD_avg,NBD_max,...,NSM_max,NSM_sum,PAR_avg,PAR_max,PAR_sum,TLOC,CC_avg,CC_max,CC_sum,post
0,1,0,6.75,29,54,9.25,32,74,1.75,5,...,0,0,2.0,4,16,128,2.875,7,23,False
1,1,0,12.5,13,25,16.0,18,32,2.0,3,...,0,0,3.0,4,6,55,4.0,4,8,False
2,0,0,5.33333333333333,10,16,12.6666666666667,29,38,3.0,6,...,0,0,1.0,2,3,70,4.33333333333333,9,13,False
3,2,0,7.33333333333333,16,88,9.66666666666667,28,116,2.08333333333333,5,...,0,0,1.33333333333333,2,16,174,2.83333333333333,8,34,False
4,2,4,6.21052631578947,27,118,9.89473684210526,55,188,1.78947368421053,4,...,1,1,0.578947368421053,2,11,277,2.42105263157895,11,46,False


In [7]:
# Profile the raw dataflow to inspect each column's type and summary statistics.
eclipse_2_0_raw.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent Missing,Error Count,Empty Count,Unique Values,0.1% Quantile (est.),1% Quantile (est.),5% Quantile (est.),25% Quantile (est.),50% Quantile (est.),75% Quantile (est.),95% Quantile (est.),99% Quantile (est.),99.9% Quantile (est.),Mean,Standard Deviation,Variance,Skewness,Kurtosis
pre,FieldType.STRING,0,9,6729.0,0.0,6729.0,0.0,0.0,0.0,36.0,,,,,,,,,,,,,,
ACD,FieldType.STRING,0,9,6729.0,0.0,6729.0,0.0,0.0,0.0,20.0,,,,,,,,,,,,,,
FOUT_avg,FieldType.STRING,0,9.92857142857143,6729.0,0.0,6729.0,0.0,0.0,0.0,,,,,,,,,,,,,,,
FOUT_max,FieldType.STRING,0,97,6729.0,0.0,6729.0,0.0,0.0,0.0,108.0,,,,,,,,,,,,,,
FOUT_sum,FieldType.STRING,0,99,6729.0,0.0,6729.0,0.0,0.0,0.0,378.0,,,,,,,,,,,,,,
MLOC_avg,FieldType.STRING,0,96.1666666666667,6729.0,0.0,6729.0,0.0,0.0,0.0,,,,,,,,,,,,,,,
MLOC_max,FieldType.STRING,0,994,6729.0,0.0,6729.0,0.0,0.0,0.0,178.0,,,,,,,,,,,,,,
MLOC_sum,FieldType.STRING,0,998,6729.0,0.0,6729.0,0.0,0.0,0.0,563.0,,,,,,,,,,,,,,
NBD_avg,FieldType.STRING,0,7,6729.0,0.0,6729.0,0.0,0.0,0.0,694.0,,,,,,,,,,,,,,
NBD_max,FieldType.STRING,0,9,6729.0,0.0,6729.0,0.0,0.0,0.0,13.0,,,,,,,,,,,,,,


In [8]:
# Build a column-type setter for the raw dataflow and let it learn
# conversion candidates from the data.
type_infer = eclipse_2_0_raw.builders.set_column_types()
type_infer.learn()
# Display the learned conversion candidates.
type_infer

Column types conversion candidates:
'FOUT_max': [FieldType.INTEGER],
'MLOC_max': [FieldType.INTEGER],
'NBD_max': [FieldType.INTEGER],
'NOF_sum': [FieldType.INTEGER],
'NOT': [FieldType.INTEGER],
'NSM_avg': [FieldType.DECIMAL],
'PAR_max': [FieldType.INTEGER],
'CC_max': [FieldType.INTEGER],
'FOUT_avg': [FieldType.DECIMAL],
'MLOC_sum': [FieldType.INTEGER],
'NOF_avg': [FieldType.DECIMAL],
'NOM_avg': [FieldType.DECIMAL],
'NSF_avg': [FieldType.DECIMAL],
'NSM_max': [FieldType.INTEGER],
'PAR_sum': [FieldType.INTEGER],
'CC_sum': [FieldType.INTEGER],
'pre': [FieldType.INTEGER],
'MLOC_avg': [FieldType.DECIMAL],
'NBD_sum': [FieldType.INTEGER],
'NOI': [FieldType.INTEGER],
'NOM_sum': [FieldType.INTEGER],
'NSF_sum': [FieldType.INTEGER],
'PAR_avg': [FieldType.DECIMAL],
'CC_avg': [FieldType.DECIMAL],
'ACD': [FieldType.INTEGER],
'FOUT_sum': [FieldType.INTEGER],
'NBD_avg': [FieldType.DECIMAL],
'NOF_max': [FieldType.INTEGER],
'NOM_max': [FieldType.INTEGER],
'NSF_max': [FieldType.INTEGER],
'NSM_sum': [Field

In [9]:

# Apply the inferred column types and profile the converted dataflow.
type_converted_df = type_infer.to_dataflow()
type_converted_df.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent Missing,Error Count,Empty Count,Unique Values,0.1% Quantile (est.),1% Quantile (est.),5% Quantile (est.),25% Quantile (est.),50% Quantile (est.),75% Quantile (est.),95% Quantile (est.),99% Quantile (est.),99.9% Quantile (est.),Mean,Standard Deviation,Variance,Skewness,Kurtosis
pre,FieldType.INTEGER,0,69,6729.0,0.0,6729.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,13.3831,31.8219,1.13464,2.88915,8.34721,7.23264,91.2244
ACD,FieldType.INTEGER,0,24,6729.0,0.0,6729.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,7.10614,14.3086,0.42354,1.44164,2.07832,5.68882,45.0129
FOUT_avg,FieldType.DECIMAL,0,51.619,6729.0,0.0,6729.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,1.98097,4.25431,9.0399,14.6089,27.9123,2.8618,3.39534,11.5283,2.64308,15.2843
FOUT_max,FieldType.INTEGER,0,310,6729.0,0.0,6729.0,0.0,0.0,0.0,108.0,0.0,0.0,0.0,0.0,5.98651,14.4389,36.7944,68.2688,160.139,10.5305,15.5555,241.975,4.93657,53.7618
FOUT_sum,FieldType.INTEGER,0,2168,6729.0,0.0,6729.0,0.0,0.0,0.0,378.0,0.0,0.0,0.0,0.0,12.4743,43.0892,176.002,396.811,944.145,40.7071,87.0531,7578.24,7.43083,106.795
MLOC_avg,FieldType.DECIMAL,0,96.1667,6729.0,0.0,6729.0,0.0,0.0,0.0,,0.0,0.0,0.0,1.0,4.03638,8.04828,16.7199,30.0965,66.0828,5.60118,6.57138,43.1831,3.45249,25.6768
MLOC_max,FieldType.INTEGER,0,994,6729.0,0.0,6729.0,0.0,0.0,0.0,178.0,0.0,0.0,0.0,1.0,10.9743,26.4869,70.6123,131.667,424.06,20.2847,33.7051,1136.04,8.86746,168.153
MLOC_sum,FieldType.INTEGER,0,4559,6729.0,0.0,6729.0,0.0,0.0,0.0,563.0,0.0,0.0,0.0,2.90316,25.5726,81.6852,327.83,744.9,2190.58,79.6597,179.78,32320.9,8.71841,135.981
NBD_avg,FieldType.DECIMAL,0,7,6729.0,0.0,6729.0,0.0,0.0,0.0,694.0,0.0,0.0,0.0,1.0,1.25117,1.67615,2.46051,3.33383,4.7639,1.22939,0.789853,0.623867,0.27986,1.17763
NBD_max,FieldType.INTEGER,0,17,6729.0,0.0,6729.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,1.0,2.0,3.95463,6.0,7.47882,10.0,2.33006,1.88373,3.54844,0.792091,0.842097


# Data preprocessed
# ===============================
# AutoML start

In [10]:
import azureml.core
import pandas as pd
from azureml.core.workspace import Workspace
import logging
import os

In [11]:
# Reload the workspace from the config file written by ws.write_config().
ws = Workspace.from_config()
# choose a name for the run history container in the workspace
experiment_name = 'automated-ml-classification'
# project folder
project_folder = './automated-ml-classification/eclipse_2.0/selected_features'

# Collect the environment details shown in the summary table below.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
}

# Show full cell contents instead of truncating long values.
# A negative width was deprecated in pandas 1.0 and is rejected by newer
# releases, where None means "no limit"; older pandas only accepts an
# integer, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# One-row frame, transposed so each setting is displayed on its own line.
outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Unnamed: 0,Unnamed: 1
Location,canadacentral
Project Directory,./automated-ml-classification/eclipse_2.0/selected_features
Resource Group,ICSE-tutorial
SDK version,1.0.39
Subscription ID,942813ed-26a2-4790-9097-d4be7560fce0
Workspace,workspace_fresher


In [12]:
import azureml.dataprep as dprep
# Alias the type-converted dataflow under the name used by the AutoML cells,
# then profile it again.
dflow_prepared=type_converted_df
dflow_prepared.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent Missing,Error Count,Empty Count,Unique Values,0.1% Quantile (est.),1% Quantile (est.),5% Quantile (est.),25% Quantile (est.),50% Quantile (est.),75% Quantile (est.),95% Quantile (est.),99% Quantile (est.),99.9% Quantile (est.),Mean,Standard Deviation,Variance,Skewness,Kurtosis
pre,FieldType.INTEGER,0,69,6729.0,0.0,6729.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,13.3831,31.8219,1.13464,2.88915,8.34721,7.23264,91.2244
ACD,FieldType.INTEGER,0,24,6729.0,0.0,6729.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,7.10614,14.3086,0.42354,1.44164,2.07832,5.68882,45.0129
FOUT_avg,FieldType.DECIMAL,0,51.619,6729.0,0.0,6729.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,1.98097,4.25431,9.0399,14.6089,27.9123,2.8618,3.39534,11.5283,2.64308,15.2843
FOUT_max,FieldType.INTEGER,0,310,6729.0,0.0,6729.0,0.0,0.0,0.0,108.0,0.0,0.0,0.0,0.0,5.98651,14.4389,36.7944,68.2688,160.139,10.5305,15.5555,241.975,4.93657,53.7618
FOUT_sum,FieldType.INTEGER,0,2168,6729.0,0.0,6729.0,0.0,0.0,0.0,378.0,0.0,0.0,0.0,0.0,12.4743,43.0892,176.002,396.811,944.145,40.7071,87.0531,7578.24,7.43083,106.795
MLOC_avg,FieldType.DECIMAL,0,96.1667,6729.0,0.0,6729.0,0.0,0.0,0.0,,0.0,0.0,0.0,1.0,4.03638,8.04828,16.7199,30.0965,66.0828,5.60118,6.57138,43.1831,3.45249,25.6768
MLOC_max,FieldType.INTEGER,0,994,6729.0,0.0,6729.0,0.0,0.0,0.0,178.0,0.0,0.0,0.0,1.0,10.9743,26.4869,70.6123,131.667,424.06,20.2847,33.7051,1136.04,8.86746,168.153
MLOC_sum,FieldType.INTEGER,0,4559,6729.0,0.0,6729.0,0.0,0.0,0.0,563.0,0.0,0.0,0.0,2.90316,25.5726,81.6852,327.83,744.9,2190.58,79.6597,179.78,32320.9,8.71841,135.981
NBD_avg,FieldType.DECIMAL,0,7,6729.0,0.0,6729.0,0.0,0.0,0.0,694.0,0.0,0.0,0.0,1.0,1.25117,1.67615,2.46051,3.33383,4.7639,1.22939,0.789853,0.623867,0.27986,1.17763
NBD_max,FieldType.INTEGER,0,17,6729.0,0.0,6729.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,1.0,2.0,3.95463,6.0,7.47882,10.0,2.33006,1.88373,3.54844,0.792091,0.842097


In [13]:

# Disabled alternative: keep only a hand-picked subset of feature columns.
#dflow_X = dflow_prepared.keep_columns(['pre','ACD','NBD_avg','NOF_avg','NOM_avg','NOT','NSF_avg','NSM_avg','PAR_avg'])
# Features: every column except 'post'.
dflow_X = dflow_prepared.drop_columns('post')

# Label: the 'post' column only.
dflow_y = dflow_prepared.keep_columns('post')

In [14]:
from sklearn.model_selection import train_test_split


# Materialize the feature and label dataflows as pandas DataFrames.
x_df = dflow_X.to_pandas_dataframe()
y_df = dflow_y.to_pandas_dataframe()

# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223)
# Preview y_train as a flattened array. The result is not assigned, so
# y_train itself is unchanged; the AutoMLConfig cell flattens it again
# when it passes the labels.
y_train.values.flatten()

array([False, False, False, ..., False, False, False])

In [16]:
# Keyword settings that are unpacked into AutoMLConfig.
# NOTE(review): n_cross_validations=100 combined with validation_size=0.67
# looks expensive and leaves a small training share per split — confirm
# these values are intended.
automl_settings = dict(
    iteration_timeout_minutes=60,
    iterations=100,
    primary_metric='AUC_weighted',
    preprocess=True,
    verbosity=logging.INFO,
    n_cross_validations=100,
    validation_size=0.67,
    # Both ensemble iterations are switched off.
    enable_voting_ensemble=False,
    enable_stack_ensemble=False
)

In [17]:
from azureml.train.automl import AutoMLConfig

# local compute 
# local compute
# Hand the training split to AutoML as plain arrays; the labels are
# flattened from the single-column frame.
train_features = x_train.values
train_labels = y_train.values.flatten()

automated_ml_config = AutoMLConfig(
    task='classification',
    debug_log='automated_ml_errors.log',
    path=project_folder,
    X=train_features,
    y=train_labels,
    **automl_settings
)

In [18]:
from azureml.core.experiment import Experiment
# Open the experiment in the workspace, submit the AutoML configuration
# and stream the progress output into the cell.
experiment = Experiment(workspace=ws, name=experiment_name)
local_run = experiment.submit(automated_ml_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_40519728-13ac-45f9-8190-5fe4bab075de
Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION 

        80   StandardScalerWrapper XGBoostClassifier        0:00:59       0.8237    0.8328
        81   MaxAbsScaler LightGBM                          0:00:48       0.8280    0.8328
        82   MinMaxScaler LightGBM                          0:00:21       0.8306    0.8328
        83   StandardScalerWrapper LightGBM                 0:01:18       0.7993    0.8328
        84   TruncatedSVDWrapper XGBoostClassifier          0:03:11       0.8209    0.8328
        85   StandardScalerWrapper XGBoostClassifier        0:00:28       0.8329    0.8329
        86   StandardScalerWrapper XGBoostClassifier        0:01:02       0.8291    0.8329
        87   MaxAbsScaler LightGBM                          0:00:45       0.8233    0.8329
        88   MinMaxScaler LightGBM                          0:01:04       0.8230    0.8329
        89   StandardScalerWrapper LightGBM                 0:00:38       0.8192    0.8329
        90   StandardScalerWrapper XGBoostClassifier        0:01:59       0.8118    0.8329

In [None]:
from azureml.widgets import RunDetails
# Render the run-details widget for the local AutoML run.
RunDetails(local_run).show()

In [None]:


from azureml.widgets import RunDetails
from azureml.train.automl.run import AutoMLRun

# Attach to an existing AutoML parent run by ID and render its details widget.
# NOTE(review): this run ID is hard-coded and differs from the parent run ID
# printed by the local submission cell — confirm it is the intended run.
remote_run_id = "AutoML_0aa2549d-3478-4b7a-a196-1dc3e0be864b"

experiment = Experiment(ws, experiment_name)
remote_run = AutoMLRun(experiment=experiment, run_id=remote_run_id)

RunDetails(remote_run).show()

In [None]:
# Fetch the run and fitted model returned by the remote run's get_output(),
# then print both.
# Note: the voting ensemble is disabled in automl_settings.
best_run, fitted_model = remote_run.get_output()
print(best_run)

print(fitted_model)


In [None]:
# Fetch the run and fitted model for one specific iteration of the remote run.
iteration = 91
b_run, b_model = remote_run.get_output(iteration=iteration)
print(b_run)
print(b_model)

In [None]:
# Inspect the Python type of the fitted model.
type(fitted_model)

In [None]:
# Fetch the output of the local run. This rebinds best_run and fitted_model,
# replacing the values previously obtained from remote_run.
best_run, fitted_model = local_run.get_output()
print(best_run)
print(fitted_model)


In [None]:
# Inspection only: displays the predict attribute without calling it.
fitted_model.predict

In [None]:

# Predict labels for the held-out test features and show the first ten predictions.
y_predict = fitted_model.predict(x_test.values) 
print(y_predict[:10])

In [None]:
# Flatten the held-out labels into a plain Python list and show the first ten.
y_actual = y_test.values.flatten().tolist()
y_actual[:10]

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of actual versus predicted labels on the test set.
confusion_matrix(y_actual, y_predict)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
# Precision, recall, F-score and support on the test set with binary averaging;
# only precision and recall are printed.
precision, recall, fbeta_score, support = precision_recall_fscore_support(
    y_actual,
    y_predict,
    average='binary',
)
print('precision:', precision, '\nrecall:', recall, sep='')