In [1]:
import logging
import json

In [2]:
pip install azureml-core

Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install cryptography 



In [2]:
from azureml.core import Workspace, Experiment


In [3]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')



Workspace name: mlcloud
Azure region: eastus2
Subscription id: d990bb6c-7849-4109-9dd7-6cafa051c8ae
Resource group: mlcloud


In [4]:
import os

script_folder = './scripts'
os.makedirs(script_folder, exist_ok=True)



In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.exceptions import ComputeTargetException


### Create a compute cluster using the SDK.

In [6]:
# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
cluster_name = "c003"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target {}.'.format(cluster_name))
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2",
                                                               max_nodes=4)

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)

print("Azure Machine Learning Compute attached")



Found existing compute target c003.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
Azure Machine Learning Compute attached


### Use HyperDrive to automatically find optimal parameters.

##

In [9]:
pip install azureml-widgets

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install --upgrade jinja2


Collecting jinja2
  Using cached jinja2-3.1.6-py3-none-any.whl (134 kB)
Installing collected packages: jinja2
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 3.1.4
    Uninstalling Jinja2-3.1.4:
      Successfully uninstalled Jinja2-3.1.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyterlab-topbar 0.6.1 requires jupyterlab~=3.0, but you have jupyterlab 4.0.11 which is incompatible.
jupyterlab-system-monitor 0.8.0 requires jupyterlab~=3.0, but you have jupyterlab 4.0.11 which is incompatible.
dask-sql 2024.5.0 requires dask[dataframe]>=2024.4.1, but you have dask 2023.2.0 which is incompatible.
dask-sql 2024.5.0 requires distributed>=2024.4.1, but you have distributed 2023.2.0 which is incompatible.
dask-sql 2024.5.0 requires pandas>=1.4.0, but you have pandas 1.3.5 which is incompatible.
azureml-defaults 1.57.0 requires azureml

In [11]:
pip install azureml-train

Note: you may need to restart the kernel to use updated packages.


In [7]:
from azureml.widgets import RunDetails


In [8]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os


In [9]:
# Specify parameter sampler
#ps = ### YOUR CODE HERE ###

ps = RandomParameterSampling(
    {
        '--C': uniform(0.01, 0.1),
        '--max_iter': choice([50, 100, 200])

    }
)


# Specify a Policy
# policy = ### YOUR CODE HERE ###
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
# src = ### YOUR CODE HERE ###
# passing parameters to the training script

src = ScriptRunConfig(
    source_directory=script_folder,
    script="train.py",
    compute_target="c003",  # Replace with your compute target
    environment=sklearn_env,
    arguments=[
        "--C", 0.1,
        "--max_iter", 200,
        "--test_train_ratio", 0.25
    ]
)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
# hyperdrive_config = ### YOUR CODE HERE ###

hyperdrive_config =HyperDriveConfig(run_config=src, 
                             hyperparameter_sampling=ps,
                             policy=policy,
                             primary_metric_name='Accuracy', 
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                             max_total_runs=10,
                             max_concurrent_runs=2)

In [10]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###

hyperdrive_run = exp.submit(hyperdrive_config)
RunDetails(hyperdrive_run).show()
print("Submitted HyperDrive run id:", hyperdrive_run.id)
hyperdrive_run.wait_for_completion(show_output=True)
assert(hyperdrive_run.get_status() == "Completed")

2025-03-26 21:26:15.080788: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-26 21:26:15.234672: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-26 21:26:15.281353: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-26 21:26:15.599906: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

Submitted HyperDrive run id: HD_eea178ad-0681-4753-a6db-b384727204e5
RunId: HD_eea178ad-0681-4753-a6db-b384727204e5
Web View: https://ml.azure.com/runs/HD_eea178ad-0681-4753-a6db-b384727204e5?wsid=/subscriptions/d990bb6c-7849-4109-9dd7-6cafa051c8ae/resourcegroups/mlcloud/workspaces/mlcloud&tid=4c460090-f86f-4a64-9b48-ed4b89bbef7b

Streaming azureml-logs/hyperdrive.txt

[2025-03-26T21:26:07.8789891Z][GENERATOR][DEBUG]Sampled 2 jobs from search space 
[2025-03-26T21:26:08.2236687Z][SCHEDULER][INFO]Scheduling job, id='HD_eea178ad-0681-4753-a6db-b384727204e5_0' 
[2025-03-26T21:26:08.3054536Z][SCHEDULER][INFO]Scheduling job, id='HD_eea178ad-0681-4753-a6db-b384727204e5_1' 
[2025-03-26T21:26:08.6658152Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_eea178ad-0681-4753-a6db-b384727204e5_1' 
[2025-03-26T21:26:08.7636397Z][SCHEDULER][INFO]Successfully scheduled a job. Id='HD_eea178ad-0681-4753-a6db-b384727204e5_0' 
[2025-03-26T21:28:43.8815001Z][GENERATOR][DEBUG]Sampled 2 jobs from search

In [11]:
  !pip show azureml-widgets
  !pip show azureml-core

Name: azureml-widgets
Version: 1.57.0
Summary: Provides fully supported, with interactivity, async auto-updates, and non-blocking cell execution.
Home-page: https://docs.microsoft.com/python/api/overview/azure/ml/?view=azure-ml-py
Author: Microsoft Corp
Author-email: 
License: https://aka.ms/azureml-sdk-license
Location: /anaconda/envs/azureml_py38/lib/python3.10/site-packages
Requires: azure-storage-blob, azureml-core, azureml-telemetry, ipywidgets, jinja2
Required-by: 
Name: azureml-core
Version: 1.57.0
Summary: Azure Machine Learning core packages, modules, and classes
Home-page: https://docs.microsoft.com/python/api/overview/azure/ml/?view=azure-ml-py
Author: Microsoft Corp
Author-email: 
License: https://aka.ms/azureml-sdk-license
Location: /anaconda/envs/azureml_py38/lib/python3.10/site-packages
Requires: adal, argcomplete, azure-common, azure-core, azure-graphrbac, azure-mgmt-authorization, azure-mgmt-containerregistry, azure-mgmt-keyvault, azure-mgmt-network, azure-mgmt-resourc

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



### 

### Retrieve the best run

In [220]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###

best_run = hyperdrive_run.get_best_run_by_primary_metric()
print("Best Run ID:", best_run.id)
print("Best Run Metrics:", best_run.get_metrics())

Best Run ID: HD_06e9be6d-7868-4a9e-9c4a-fb708722726f_0
Best Run Metrics: {'Regularization Strength:': 0.0220308670897337, 'Max iterations:': 100, 'Accuracy': 0.9093226511289147}


In [221]:
# Download the model files
from azureml.core import Model

best_run.download_file(name="outputs/model.pkl", output_file_path="best_model.pkl") #adjust path if needed.

# Register the model
model = Model.register(
    workspace=ws,
    model_path="best_model.pkl",  # Local path to the downloaded model
    model_name="best-logistic-regression-model",
    tags={"Training context": "HyperDrive"},
    description="Best Logistic Regression Model from Optimizing an ML Pipeline in Azure",
)

print("Registered Model ID:", model.id)

Registering model best-logistic-regression-model
Registered Model ID: best-logistic-regression-model:2


### Import data to a Dataset using the SDK.

In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###

data_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv' 
dataset = TabularDatasetFactory.from_delimited_files(data_url)
df = dataset.to_pandas_dataframe()
print(df.head())  # Display the first few rows of the dataframe

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
   age          job  marital    education  default housing loan    contact  \
0   57   technician  married  high.school       no      no  yes   cellular   
1   55      unknown  married      unknown  unknown     yes   no  telephone   
2   33  blue-collar  married     basic.9y       no      no   no   cellular   
3   36       admin.  married  high.school       no      no   no  telephone   
4   27    housemaid  married  high.school       no     yes   no   cellular   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         1      failure         -1.8   
1   may         thu  ...         2    999         0  nonexistent          1.1   
2   may         fri  ...         1    999         1      failure         -1.8   
3   jun         fri  ...         4    99

In [9]:
from scripts.train import clean_data
# Use the clean_data function to clean your data.
X,y = clean_data(dataset)

{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 0 to 32949
Data columns (total 39 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            32950 non-null  int64  
 1   marital                        32950 non-null  int64  
 2   default                        32950 non-null  int64  
 3   housing                        32950 non-null  int64  
 4   loan                           32950 non-null  int64  
 5   month                          32950 non-null  int64  
 6   day_of_week                    32950 non-null  int64  
 7   duration                       32950 non-null  int64  
 8   campaign                       32950 non-null  int64  
 9   pdays                          32950 non-null  int64  
 10  previous                       32950 non-null  int64  
 11  poutcome                       32950 non-null  int64  
 12  emp.var.rate                   32950 non-null 

In [9]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [10]:
import pandas as pd
from azureml.core.dataset import Dataset


In [11]:
combined_df = pd.concat([X, y], axis=1)



In [12]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 0 to 32949
Data columns (total 40 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            32950 non-null  int64  
 1   marital                        32950 non-null  int64  
 2   default                        32950 non-null  int64  
 3   housing                        32950 non-null  int64  
 4   loan                           32950 non-null  int64  
 5   month                          32950 non-null  int64  
 6   day_of_week                    32950 non-null  int64  
 7   duration                       32950 non-null  int64  
 8   campaign                       32950 non-null  int64  
 9   pdays                          32950 non-null  int64  
 10  previous                       32950 non-null  int64  
 11  poutcome                       32950 non-null  int64  
 12  emp.var.rate                   32950 non-null 

In [13]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(combined_df, test_size=0.25, random_state=42)


In [14]:
print(train_df.shape, test_df.shape)

(24712, 40) (8238, 40)


In [15]:
training_dataset = Dataset.Tabular.register_pandas_dataframe(
    dataframe=train_df,
    target=ws.get_default_datastore(),
    name='training_data',
    show_progress=True
)

Validating arguments.
Arguments validated.
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'emp.var.rate' -> 'emp_var_rate'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.price.idx' -> 'cons_price_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.conf.idx' -> 'cons_conf_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'nr.employed' -> 'nr_employed'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'job_admin.' -> 'job_admin_'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'education_basic.4y' -> 'education_basic_4y'
Column header contains '.' This period will be translated to '_' as we write the data out to parque

In [16]:
val_dataset = Dataset.Tabular.register_pandas_dataframe(
    dataframe=test_df,
    target=ws.get_default_datastore(),
    name='val_data',
    show_progress=True
)

Validating arguments.
Arguments validated.
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'emp.var.rate' -> 'emp_var_rate'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.price.idx' -> 'cons_price_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.conf.idx' -> 'cons_conf_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'nr.employed' -> 'nr_employed'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'job_admin.' -> 'job_admin_'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'education_basic.4y' -> 'education_basic_4y'
Column header contains '.' This period will be translated to '_' as we write the data out to parque

### Create an AutoMLConfig for training.

In [17]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

automl_settings = {
    "experiment_timeout_minutes": 30,
    "enable_early_stopping": True,
    "iteration_timeout_minutes": 5,
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,
    # "n_cross_validations": 2,
    "primary_metric": "accuracy",
    "featurization": "auto",
    "verbosity": logging.INFO,
    "enable_code_generation": True,
}

automl_config = AutoMLConfig(
    task="classification",
    debug_log="automl_errors.log",
    compute_target="c003",
    experiment_exit_score=0.9984,
    blocked_models=["KNN", "LinearSVM"],
    enable_onnx_compatible_models=True,
    training_data=training_dataset,
    label_column_name='y',
    validation_data=val_dataset,
    **automl_settings,
)

In [18]:
# Submit your automl run

### YOUR CODE HERE ###

remote_run = exp.submit(automl_config, show_output=False)


Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_e2c203f5-5606-4267-aa45-fc72d144b495,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


In [None]:
from azureml.widgets import RunDetails

RunDetails(remote_run).show()

In [202]:
# Wait for the remote run to complete
remote_run.wait_for_completion()

{'runId': 'AutoML_17a5bbfd-6834-40c0-9050-e180d725bdc9',
 'target': 'c003',
 'status': 'Completed',
 'startTimeUtc': '2025-03-25T19:55:57.756536Z',
 'endTimeUtc': '2025-03-25T20:26:17.158137Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'c003',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"cf1028a1-aed4-40b5-9d72-7ca15bb3e8fe\\"}, \\"validation_data\\": {\\"datasetId\\": \\"ca56fa1b-9ed1-441a-bf06-fd7b76c6081d\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task

In [203]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###

best_automl_run = remote_run.get_best_child()

In [204]:
best_automl_run.download_file(name="outputs/model.pkl", output_file_path="best_model_automl.pkl") #adjust path if needed.

# Register the model
model = Model.register(
    workspace=ws,
    model_path="best_model_automl.pkl",  # Local path to the downloaded model
    model_name="best-automl-model",
    tags={"Training context": "AutoML"},
    description="Best AutoML Model from Optimizing an ML Pipeline in Azure",
)

print("Registered Model ID:", model.id)

Registering model best-automl-model
Registered Model ID: best-automl-model:1


In [9]:
from azureml.train.automl.run import AutoMLRun


In [10]:
automl_run = AutoMLRun(experiment=exp, run_id="AutoML_17a5bbfd-6834-40c0-9050-e180d725bdc9")

# Get the best model's run
best_run, fitted_model = automl_run.get_output()



In [14]:
from azureml.widgets import RunDetails

RunDetails(best_run).show()


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


In [12]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_17a5bbfd-6834-40c0-9050-e180d725bdc9_45,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [13]:
best_run.properties

{'runTemplate': 'automl_child',
 'pipeline_id': '__AutoML_Ensemble__',
 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'udacity-project\',\'compute_target\':\'c003\',\'subscription_id\':\'d990bb6c-7849-4109-9dd7-6cafa051c8ae\',\'region\':\'eastus2\',\'spark_service\':None}","ensemble_run_id":"AutoML_17a5bbfd-6834-40c0-9050-e180d725bdc9_45","experiment_name":"udacity-project","workspace_name":"mlcloud","subscription_id":"d990bb6c-7849-4109-9dd7-6cafa051c8ae","resource_group_name":"mlcloud"}}]}',
 'training_percent': '100',
 'predicted_cost': None,
 'iteration': '45',
 '_aml_system_scenario_identification': 'Remote.Child',
 '_azureml.ComputeTargetType': 'amlctrain',
 '_azureml.ClusterName':

### Clean up deployed resources.



In [222]:
# Delete the compute target
compute_target.delete()