# Hyperparameter Tuning using HyperDrive

In [None]:
from azureml.core import Workspace, Experiment,Dataset
from azureml.core.compute_target import ComputeTargetException

from azureml.widgets import RunDetails
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice

import os
import joblib

# Check core SDK version number.
# The `from azureml.core import ...` statements above bind only the listed
# names, not the `azureml` package itself, so import the module explicitly
# before reading its VERSION attribute.
import azureml.core

print("SDK version:", azureml.core.VERSION)

## Initialize Workspace and Create an Azure ML experiment

In [None]:
# Load the workspace via Workspace.from_config() and bind the experiment
# that the rest of the notebook submits runs to.
ws = Workspace.from_config()
experiment_name = 'heart-failure-hyperdrive-experiment'

experiment = Experiment(ws, experiment_name)

print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep='\n')

# Start logging on the experiment object defined above; the previous code
# called this on `exp`, a name that is never assigned in this notebook.
run = experiment.start_logging()

## Dataset

In [None]:
# Try to load the dataset from the Workspace. Otherwise, create it from the
# CSV URL and register it under the same key.
dataset_key = "Heart-Failure-Prediction-Dataset"
description_text = "Heart Failure Prediction Dataset"

if dataset_key in ws.datasets.keys():
    dataset = ws.datasets[dataset_key]
else:
    # Create AML Dataset and register it into Workspace
    dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv'
    dataset = Dataset.Tabular.from_delimited_files(dataset_url)
    # Register under `dataset_key` (the name checked above); the previous code
    # passed an undefined variable `key` here.
    dataset = dataset.register(workspace=ws,
                               name=dataset_key,
                               description=description_text)


df = dataset.to_pandas_dataframe()
# Last expression of the cell: summary statistics of the loaded dataframe.
df.describe()

## Create or Attach an AmlCompute cluster

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute

cluster_name = "cpu-cluster"

# Reuse the compute target when it already exists. Only ComputeTargetException
# triggers provisioning, so unrelated failures (and KeyboardInterrupt) are no
# longer silently treated as "cluster missing".
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If not, create it
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2', max_nodes=5)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Runs for both the existing and the newly created cluster.
compute_target.wait_for_completion(show_output=True)

print(compute_target.get_status().serialize())

## Create conda dependencies file 

In [None]:
%%writefile conda_dependencies.yml

# Conda specification written to conda_dependencies.yml by the %%writefile magic.
dependencies:
- python=3.6.2
- scikit-learn
- numpy
# Entries nested under "pip" are pip packages rather than conda packages.
- pip:
  - azureml-defaults

## Create sklearn environment

In [None]:
from azureml.core import Environment

# Build the 'sklearn-env' environment from the conda specification file.
sklearn_env = Environment.from_conda_specification(
    file_path='conda_dependencies.yml',
    name='sklearn-env',
)

## Hyperdrive Configuration

TODO: Explain the model you are using and the reasons for choosing the different hyperparameters, termination policy, and config settings.

In [None]:
# Create an early termination policy. This is not required if you are using Bayesian sampling.
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

# Hyperparameter search space: each run draws one value per argument at random.
ps = RandomParameterSampling(
    {
        "--C": choice(0.001, 0.01, 0.1, 0.5, 1, 1.5, 10, 20, 50, 100, 200, 500, 1000),
        "--max_iter": choice(25, 50, 75, 100, 200, 300),
    }
)

# Script run configuration for train.py. The compute target must be
# `compute_target`, the cluster object created earlier in the notebook; the
# previous code referenced `compute_aml_cluster`, which is never defined.
src = ScriptRunConfig(source_directory='.',
                      script='train.py',
                      compute_target=compute_target,
                      environment=sklearn_env)

# HyperDrive configuration: maximize the 'Accuracy' metric over at most
# 100 runs, 4 of them concurrently.
hyperdrive_run_config = HyperDriveConfig(
    hyperparameter_sampling=ps,
    policy=early_termination_policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=100,
    max_concurrent_runs=4,
    run_config=src
)

In [None]:
# Submit the HyperDrive configuration to the experiment.
# Uses `experiment` and `hyperdrive_run_config`, the names assigned in the
# earlier cells; `exp` and `hyperdrive_config` are never defined in this notebook.
hyperdrive_run = experiment.submit(hyperdrive_run_config, show_output=True)

## Run Details

In [None]:
# Build the RunDetails widget for the HyperDrive run, then display it.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

In [None]:
# Query the current status of the HyperDrive run (bare last expression of the cell).
hyperdrive_run.get_status()

In [None]:
# Wait for the HyperDrive run to complete, with show_output enabled.
hyperdrive_run.wait_for_completion(show_output=True)

## Best Model

In [None]:
# Fetch the best run according to the primary metric and report its details.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

print('Best Run Id:', best_run.id, sep='\n')

best_run_metrics = best_run.get_metrics()
print('Best Run Metrics:', best_run_metrics, sep='\n')

best_run_properties = best_run.get_properties()
print('Best Run Properties:', best_run_properties, sep='\n')

# The arguments passed to the run are stored under its run definition.
best_run_definition = best_run.get_details()['runDefinition']
print('Best Run Parameters:', best_run_definition['arguments'], sep='\n')

best_run_files = best_run.get_file_names()
print('Best Run File names:', best_run_files, sep='\n')

In [None]:
# Register the model file produced by the best run under the name 'hyperdrive_model'.
hyperdrive_model = best_run.register_model(
    model_name='hyperdrive_model',
    model_path='./outputs/model.joblib',
)
print(best_run)

In [None]:
# Download the best run's model file to ./outputs/model.joblib.
# The artifact is addressed by its relative name, without the leading slash
# the previous call used (compare the names printed by best_run.get_file_names()).
os.makedirs("outputs", exist_ok=True)  # ensure the local target directory exists
best_run.download_file("outputs/model.joblib", "./outputs/model.joblib")

## Cleanup

In [None]:
# Cleanup: delete the compute target that was created or attached earlier in the notebook.
compute_target.delete()