# Automated ML

In [1]:
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.automl.core.forecasting_parameters import ForecastingParameters

## Dataset

In [2]:
# Get workspace
ws = Workspace.from_config()

# Create experiment
experiment = Experiment(ws, 'automl-energyforecast')
experiment

Name,Workspace,Report Page,Docs Page
automl-energyforecast,quick-starts-ws-143075,Link to Azure Machine Learning studio,Link to Documentation


In [3]:
# Verify that the dataset is imported successfully
for x in ws.datasets.keys():
    print(x)

energy-forecast-data-validation
energy-forecast-data-training


In [4]:
# Get data
data_valid = ws.datasets['energy-forecast-data-validation']
data_train = ws.datasets['energy-forecast-data-training']

## AutoML Configuration

In [5]:
# Create compute cluster
amlcompute_cluster_name = 'aml-compute'

try:
    compute_target = ComputeTarget(
        workspace=ws,
        name=amlcompute_cluster_name
    )
    print('Existing cluster found and is now selected')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=6
    )
    compute_target = ComputeTarget.create(
        ws,
        amlcompute_cluster_name,
        compute_config
    )
    compute_target.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


Explanation of the configuration:
- `task='forecasting'`: This is because we're interested in predicting future electricity load, instead of simply inferencing electricity load based on the available variables like humidity, temperature.
- `enable_early_stopping=True`: This is to reduce the likelihood of overfitting.
- `featurization='auto'`: This is the main strength of AutoML. We'll let it carry out feature engineering automatically
- `experiment_timeout_minutes=15`: Mainly because I was pressed on time hence can't afford for the experiment to run too long. This is added on top of the preparation steps which already took ~45 minutes, increasing the experiment run time to ~1hr.
- `primary_metric='r2_score'`: While `RMSE` would do just fine as a primary metric, R2 score is just easier to analyse by seeing how close it is to 1.
- `forecast_horizon=48, target_lags=48`: We're interested in predicting the load for the next day.


In [6]:
# Specify settings for AutoML runs
automl_config = AutoMLConfig(
    compute_target=compute_target,
    task='forecasting',
    training_data=data_train,
    validation_data=data_valid,
    label_column_name='TOTAL Load',
    time_column_name='DATE',
    # path=project_folder,
    enable_early_stopping=True,
    featurization='auto',
    debug_log = 'automl_errors.log',
    experiment_timeout_minutes=15,
    max_concurrent_iterations=10,
    primary_metric='r2_score',
    forecast_horizon=48,
    target_lags=48
)

In [7]:
# Submit AML experiment
remote_run = experiment.submit(automl_config, show_output=True)
remote_run.wait_for_completion()

Running on remote.
No run_configuration provided, running on aml-compute with default configuration
Running on remote compute: aml-compute
Parent Run ID: AutoML_860b8b2a-c6bc-4a92-a016-a5cf99c17075

Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to customize the handling of missing feature values that may be more appropriate based on the data type and business requirement.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization
DETAILS:      
+---------------------------------

{'runId': 'AutoML_860b8b2a-c6bc-4a92-a016-a5cf99c17075',
 'target': 'aml-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-04-18T18:35:43.01294Z',
 'endTimeUtc': '2021-04-18T19:00:11.713309Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'r2_score',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'aml-compute',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"f3f4bfa9-1446-44f4-9b99-5dbbd2aca2a3\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/training.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-143075\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"cdbe0b43-92a0-4715-838a-f2648cc7ad21\\\\\\", \\\\\\"worksp

## Run Details

In [8]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Displaying the best model

The best model generated by AutoML is a SoftVottingRegressor, which is an ensemble of 4 models: 3 `XGBoostRegressor`s, and 1 `ElasticNet`. It managed to achieve an R2 score of 0.75, which was inferior to what achieved by the HyperDrive run. 

I believe the main cause was that it couldn't make sense of the `SUNRISE` and `SUNSET` columns in the dataset, therefore they weren't utilised (or not utilised correctly.)

In [9]:
# Retrieve and save the best automl model.
aml_best_run, aml_model = remote_run.get_output()

aml_best_run.download_file('outputs/model.pkl', 'outputs/aml_model.pkl')

Package:azureml-automl-runtime, training version:1.25.0, current version:1.19.0
Package:azureml-core, training version:1.25.0, current version:1.19.0
Package:azureml-dataprep, training version:2.11.2, current version:2.6.3
Package:azureml-dataprep-native, training version:30.0.0, current version:26.0.0
Package:azureml-dataprep-rslex, training version:1.9.1, current version:1.4.0
Package:azureml-dataset-runtime, training version:1.25.0, current version:1.19.0.post1
Package:azureml-defaults, training version:1.25.0, current version:1.19.0
Package:azureml-interpret, training version:1.25.0, current version:1.19.0
Package:azureml-telemetry, training version:1.25.0, current version:1.19.0
Package:azureml-train-automl-client, training version:1.25.0, current version:1.19.0
Package:azureml-train-automl-runtime, training version:1.25.0, current version:1.19.0
Package:azureml-pipeline-core, training version:1.25.0


In [16]:
# Show performance
print(aml_best_run)

Run(Experiment: automl-energyforecast,
Id: AutoML_860b8b2a-c6bc-4a92-a016-a5cf99c17075_16,
Type: azureml.scriptrun,
Status: Completed)


In [15]:
print(aml_best_run.get_metrics())

{'root_mean_squared_error': 43921.24875527575, 'median_absolute_error': 21911.096004516352, 'normalized_root_mean_squared_log_error': 0.05339412816608649, 'explained_variance': 0.7510450991692319, 'mean_absolute_percentage_error': 12.011923539030041, 'normalized_median_absolute_error': 0.047515062680566315, 'normalized_root_mean_squared_error': 0.09524493376257916, 'r2_score': 0.7509110923727431, 'normalized_mean_absolute_error': 0.06597407954390291, 'mean_absolute_error': 30423.28704087539, 'spearman_correlation': 0.864532904131909, 'root_mean_squared_log_error': 0.17254911921393584, 'residuals': 'aml://artifactId/ExperimentRun/dcid.AutoML_860b8b2a-c6bc-4a92-a016-a5cf99c17075_16/residuals', 'predicted_true': 'aml://artifactId/ExperimentRun/dcid.AutoML_860b8b2a-c6bc-4a92-a016-a5cf99c17075_16/predicted_true'}


In [11]:
print(aml_model)

ForecastingPipelineWrapper(pipeline=Pipeline(memory=None,
                                             steps=[('timeseriestransformer',
                                                     TimeSeriesTransformer(featurization_config=None,
                                                                           pipeline_type=<TimeSeriesPipelineType.FULL: 1>)),
                                                    ('prefittedsoftvotingregressor',
                                                     PreFittedSoftVotingRegressor(estimators=[('0',
                                                                                               Pipeline(memory=None,
                                                                                                        steps=[('sparsenormalizer',
                                                                                                                <azureml.automl.runtime.shared.model_wr...
                                                

In [12]:
aml_model.steps[1][1]

PreFittedSoftVotingRegressor(estimators=[('0',
                                          Pipeline(memory=None,
                                                   steps=[('sparsenormalizer',
                                                           <azureml.automl.runtime.shared.model_wrappers.SparseNormalizer object at 0x000001A2C2D61F48>),
                                                          ('xgboostregressor',
                                                           XGBoostRegressor(base_score=0.5,
                                                                            booster='gbtree',
                                                                            colsample_bylevel=1,
                                                                            colsample_bynode=1,
                                                                            colsample_bytree=0.7,
                                                                            eta=0.3,
               

In [13]:
aml_best_run.get_tags()

{'_aml_system_azureml.automlComponent': 'AutoML',
 '_aml_system_ComputeTargetStatus': '{"AllocationState":"steady","PreparingNodeCount":0,"RunningNodeCount":6,"CurrentNodeCount":6}',
 'mlflow.source.type': 'JOB',
 'mlflow.source.name': 'automl_driver.py',
 'ensembled_iterations': '[0, 6, 5, 7]',
 'ensembled_algorithms': "['XGBoostRegressor', 'ExtremeRandomTrees', 'XGBoostRegressor', 'ElasticNet']",
 'ensemble_weights': '[0.7142857142857143, 0.14285714285714285, 0.07142857142857142, 0.07142857142857142]',
 'best_individual_pipeline_score': '0.7336744027226058',
 'best_individual_iteration': '0',
 '_aml_system_automl_is_child_run_end_telemetry_event_logged': 'True'}