### Final: Azure Platform
#### Author: Yifan Gao
#### Group:  7
#### Date:  Dec 8, 2021

## Create a workspace on Microsoft Azure

In [26]:
# !pip install azureml-core==1.35.0 --user

^C


In [2]:
from azureml.core import Workspace
import warnings
warnings.filterwarnings("ignore")
subscription_id = 'd3805069-b2bd-4ab6-ac52-e7fabe76f043' # Copy this from Azure portal
resource_group  = 'Final_Pipeline_1' # Arbitrary name
workspace_name  = 'FinalAzurePipeline' # Arbitrary name

# Create new workspace
# ws = Workspace.create(name=workspace_name,
#                       subscription_id=subscription_id,
#                       resource_group=resource_group,
#                       create_resource_group=True,
#                       location='eastus2'
#                      )

# Load existing workspace
ws = Workspace.get(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group
                     )

WorkspaceException: WorkspaceException:
	Message: Workspace with name 'FinalAzurePipeline' already exists under resource group with name 'Final_Pipeline_1'.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Workspace with name 'FinalAzurePipeline' already exists under resource group with name 'Final_Pipeline_1'."
    }
}

## Create workspace configuration file to authenticate

In [3]:
try:
    wsc = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    wsc.write_config()
    print('Library configuration succeeded')
except:
    print('Workspace not found')

Library configuration succeeded


## Attach the workspace

In [4]:
import azureml.core
from azureml.core import Datastore

ws = Workspace.from_config()

## Set up machine learning resources

In [5]:
# Default datastore 
def_data_store = ws.get_default_datastore()

# # Get the blob storage associated with the workspace
# def_blob_store = Datastore(ws, "workspaceblobstore")

# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")

### Upload data to cloud for future use

In [6]:
import os
from azureml.core import Dataset
import pandas as pd
data_folder = os.getcwd() + '/Data'
# # Add a new feature
# train_df = pd.read_csv(data_folder + "/train.csv")
# test_df = pd.read_csv(data_folder + "/test.csv")
# train_df['Net Delay in Minutes'] = train_df['Arrival Delay in Minutes'] - train_df['Departure Delay in Minutes']
# test_df['Net Delay in Minutes'] = test_df['Arrival Delay in Minutes'] - test_df['Departure Delay in Minutes']
# train_df.to_csv(data_folder + "/train.csv")
# test_df.to_csv(data_folder + "/test.csv")

data_reference_train = def_data_store.upload_files(files=[data_folder + "/train.csv"], target_path='assignment4_train', overwrite=True, show_progress=True)
data_reference_test = def_data_store.upload_files(files=[data_folder + "/test.csv"], target_path='assignment4_test', overwrite=True, show_progress=True)
trainset = Dataset.Tabular.from_delimited_files(path=data_reference_train)
testset = Dataset.Tabular.from_delimited_files(path=data_reference_test)
trainset.register(workspace=ws, name="Assignment4_Train", create_new_version=True) # dataset will get a new version whenever it's registered
testset.register(workspace=ws, name="Assignment4_Test", create_new_version=True)

Uploading an estimated of 1 files
Uploading C:\Users\mars_\Documents\Personal Document\MSCA classes\MSCA 32021 1 Machine Learning Operations\Week 8/Data/train.csv
Uploaded C:\Users\mars_\Documents\Personal Document\MSCA classes\MSCA 32021 1 Machine Learning Operations\Week 8/Data/train.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Uploading an estimated of 1 files
Uploading C:\Users\mars_\Documents\Personal Document\MSCA classes\MSCA 32021 1 Machine Learning Operations\Week 8/Data/test.csv
Uploaded C:\Users\mars_\Documents\Personal Document\MSCA classes\MSCA 32021 1 Machine Learning Operations\Week 8/Data/test.csv, 1 files out of an estimated total of 1
Uploaded 1 files


{
  "source": [
    "('workspaceblobstore', 'assignment4_test')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "69885d54-54a4-445c-bda9-2fccadc5fb46",
    "name": "Assignment4_Test",
    "version": 1,
    "workspace": "Workspace.create(name='Assignment4AzurePipeline', subscription_id='d3805069-b2bd-4ab6-ac52-e7fabe76f043', resource_group='Assignment4_Pipeline_1')"
  }
}

In [7]:
# !pip install azureml-dataset-runtime --upgrade

In [8]:
# !pip install cloudpickle~=1.2.0

## Set up a compute target

In [9]:
from azureml.core.compute import ComputeTarget, AmlCompute

compute_name = "aml-compute"
vm_size = "Standard_DS3_v2"
# vm_size = "Standard_D11"
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('Found compute target: ' + compute_name)
else:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,  # STANDARD_NC6 is GPU-enabled
                                                                min_nodes=0,
                                                                max_nodes=4)
    # create the compute target
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())

Found compute target: aml-compute


## Configure automl

In [10]:
import logging

automl_settings = {
    "iteration_timeout_minutes": 10,
    "experiment_timeout_hours": 0.3,
    "enable_early_stopping": True,
    "primary_metric": 'accuracy',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations": 5
}

In [11]:
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task='classification',
                             debug_log='automated_ml_errors.log',
                             training_data=trainset,
                             test_data=testset,
# for some reason XGBoost has a deep dependency on other packages and it causes error
# so I have to block it. The metrics do not impact as much when excluding XGBoost.
                             blocked_models = ['XGBoostClassifier'], 
                             label_column_name="satisfaction",
                             compute_target = compute_target,
                             **automl_settings)

In [12]:
from azureml.core.experiment import Experiment
experiment = Experiment(ws, "Experiment14")
local_run = experiment.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on aml-compute with default configuration
Running on remote compute: aml-compute


Experiment,Id,Type,Status,Details Page,Docs Page
Experiment14,AutoML_15d6cd88-71b1-4b36-b937-2f80dabb78bc,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to customize the handling 

### Explore the results

When sorting by validation score (interation metric), the top 3 models are: StackEnsemble, MaxAbsScaler/LightGBM and VotingEnsemble. \
When sorting by speed (duration), the top 3 models are: SparseNormalizer/LightGBM, MaxAbsScaler/SGD and MaxAbsScaler/LightGBM.

In [13]:
warnings.filterwarnings("ignore")
from azureml.widgets import RunDetails
RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [21]:
!pip3 install azureml-train-automl-runtime==1.36.1

Collecting azureml-train-automl-runtime==1.36.1
  Using cached azureml_train_automl_runtime-1.36.1-py3-none-any.whl (274 kB)
Collecting azureml-automl-core~=1.36.1
  Using cached azureml_automl_core-1.36.1-py3-none-any.whl (221 kB)
Collecting scikit-learn<0.23.0,>=0.19.0
  Using cached scikit_learn-0.22.2.post1-cp37-cp37m-win_amd64.whl (6.5 MB)
Collecting azureml-automl-runtime~=1.36.1
  Using cached azureml_automl_runtime-1.36.1-py3-none-any.whl (2.1 MB)
Collecting pyarrow<4.0.0,>=0.17.0
  Using cached pyarrow-3.0.0-cp37-cp37m-win_amd64.whl (12.6 MB)
Collecting cloudpickle<2.0.0,>=1.1.0
  Using cached cloudpickle-1.6.0-py3-none-any.whl (23 kB)
Collecting numba<0.54.0
  Using cached numba-0.53.1-cp37-cp37m-win_amd64.whl (2.3 MB)
Collecting llvmlite<0.37,>=0.36.0rc1
  Using cached llvmlite-0.36.0-cp37-cp37m-win_amd64.whl (16.0 MB)
Installing collected packages: llvmlite, cloudpickle, scikit-learn, pyarrow, numba, azureml-automl-core, azureml-automl-runtime, azureml-train-automl-runtime


ERROR: Cannot uninstall 'llvmlite'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.


### Retrieve the best model

In [17]:
azureml.train.automl.__version__

'1.35.0'

In [16]:
best_run, fitted_model = local_run.get_output()
print(best_run)
print(fitted_model)



Run(Experiment: Experiment14,
Id: AutoML_15d6cd88-71b1-4b36-b937-2f80dabb78bc_14,
Type: azureml.scriptrun,
Status: Completed)
None


In [16]:
# !pip install azureml.interpret
# !pip install raiwidgets
# !pip install azureml-train-automl-runtime --user
# !pip install pyparsing==2.4.2

In [18]:
import joblib

joblib.dump(fitted_model, '123.pkl')

['123.pkl']

### Visualize top features

In [19]:
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations
# from azureml.train.automl.automl_explain_utilities import automl_setup_model_explanations
from automl.client.core.common.constants import MODEL_PATH
import joblib
from azureml.interpret import ExplanationClient, MimicWrapper
from raiwidgets import ExplanationDashboard

# # Download the best model from the artifact store
# best_run.download_file(name=MODEL_PATH, output_file_path='model.pkl')

# # Load the AutoML model into memory
# fitted_model = joblib.load('model.pkl')

# Drop the lablled column to get the training set.
X_train = trainset.drop_columns(columns=['satisfaction'])
y_train = trainset.keep_columns(columns=['satisfaction'], validate=True)
# Drop the lablled column to get the testing set.
X_test = testset.drop_columns(columns=['satisfaction'])

automl_explainer_setup_obj = automl_setup_model_explanations(fitted_model, X=X_train, 
                                                             X_test=X_test, y=y_train, 
                                                             task='classification')
client = ExplanationClient.from_run(best_run)
raw_explanations = client.download_model_explanation(raw=True)
# Initialize the Mimic Explainer
# explainer = MimicWrapper(ws, automl_explainer_setup_obj.automl_estimator,
#                          explainable_model=automl_explainer_setup_obj.surrogate_model, 
#                          init_dataset=automl_explainer_setup_obj.X_transform, run=best_run,
#                          features=automl_explainer_setup_obj.engineered_feature_names, 
#                          feature_maps=[automl_explainer_setup_obj.feature_map],
#                          classes=automl_explainer_setup_obj.classes,
#                          explainer_kwargs=automl_explainer_setup_obj.surrogate_model_params)
# raw_explanations = explainer.explain(['local', 'global'], get_raw=True,
#                                      raw_feature_names=automl_explainer_setup_obj.raw_feature_names,
#                                      eval_dataset=automl_explainer_setup_obj.X_test_transform)
# print(raw_explanations.get_feature_importance_dict())

ExplanationDashboard(raw_explanations, automl_explainer_setup_obj.automl_pipeline, dataset=automl_explainer_setup_obj.X_test_raw)

ModuleNotFoundError: No module named 'azureml.train.automl.runtime'

Note the feature important bar chart above only shows raw features without transformation. To show transformed features, simply set raw=False. The top 5 features are: 'Inflight wifi service', 'Class', 'Type of Travel', 'Online boarding' and 'Customer Type'.

## Automl with top 3 features

In [19]:
# # Add a new feature
# train_df = pd.read_csv(data_folder + "/train.csv")
# test_df = pd.read_csv(data_folder + "/test.csv")
# train_df['Net Delay in Minutes'] = train_df['Arrival Delay in Minutes'] - train_df['Departure Delay in Minutes']
# test_df['Net Delay in Minutes'] = test_df['Arrival Delay in Minutes'] - test_df['Departure Delay in Minutes']
# train_df.to_csv(data_folder + "/train.csv")
# test_df.to_csv(data_folder + "/test.csv")

trainset1 = trainset.keep_columns(columns=['Inflight wifi service','Class','Type of Travel','satisfaction'])
testset1 = testset.keep_columns(columns=['Inflight wifi service','Class','Type of Travel','satisfaction'])
trainset1.register(workspace=ws, name="Assignment3_Train1", create_new_version=True)
testset1.register(workspace=ws, name="Assignment3_Test1", create_new_version=True)

{
  "source": [
    "('workspaceblobstore', 'assignment3_test')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes",
    "KeepColumns"
  ],
  "registration": {
    "id": "d387dc10-0cf1-4d35-96d3-cea6f2af3c43",
    "name": "Assignment3_Test1",
    "version": 1,
    "workspace": "Workspace.create(name='Assignment3AzurePipeline', subscription_id='d3805069-b2bd-4ab6-ac52-e7fabe76f043', resource_group='Assignment3_Pipeline_1')"
  }
}

In [20]:
automl_config1 = AutoMLConfig(task='classification',
                             debug_log='automated_ml_errors.log',
                             training_data=trainset1,
                             test_data=testset1,
# for some reason XGBoost has a deep dependency on other packages and it causes error
# so I have to block it. The metrics do not impact as much when excluding XGBoost.
                             blocked_models = ['XGBoostClassifier'], 
                             label_column_name="satisfaction",
                             compute_target = compute_target,
                             **automl_settings)

In [21]:
warnings.filterwarnings("ignore")
experiment1 = Experiment(ws, "Experiment11")
local_run1 = experiment1.submit(automl_config1, show_output=True)

Submitting remote run.
No run_configuration provided, running on aml-compute with default configuration
Running on remote compute: aml-compute


Experiment,Id,Type,Status,Details Page,Docs Page
Experiment11,AutoML_fd09e4c0-7113-4758-9f30-187cfce35465,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation







Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms

In [22]:
RunDetails(local_run1).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

When sorting by validation score (interation metric), the top 3 models are: MaxAbsScaler/LightGBM, TruncatedSVDWrapper/RandomForest and VotingEnsemble. \
When sorting by speed (duration), the top 3 models are: MaxAbsScaler/SGD, StandardScalerWrapper/RandomForest and MaxAbsScaler/LightGBM.

The highest accuracy score from automl is 0.965 and the corresponding runtime is 1 minute 39 seconds. \
The highest accuracy score from assignment 2 is 0.928 and the corresponding runtime is 6 minute 10 seconds. \
The highest accuracy score from assignment 1 is 0.928 and the corresponding runtime is 1 minute 04 seconds.