In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-177963",
resource_group="aml-quickstarts-177963",
subscription_id="d4ad7261-832d-46b2-b093-22156001df5b")
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-177963
Azure region: northcentralus
Subscription id: d4ad7261-832d-46b2-b093-22156001df5b
Resource group: aml-quickstarts-177963


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.
cluster_name='valeriy-cluster'

from azureml.core.compute_target import ComputeTargetException
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',max_nodes=4)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler
ps = RandomParameterSampling({"C":uniform(0.1, 1)})

# Specify a Policy
policy = BanditPolicy(slack_factor=0.25)
'VC:the distance is 80% then 1/(1+0.25)'

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn("./training",compute_target= cluster, entry_script='./train.py')

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                    primary_metric_name='Accuracy',
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=100)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.
'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
from azureml.widgets import RunDetails
run=exp.submit(hyperdrive_config,show_output=True)



In [5]:
RunDetails(run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [7]:
import joblib
# Get your best run and save the model from that run.
from sklearn.linear_model import LogisticRegression
best_run=run.get_best_run_by_primary_metric()
hyperparam_best_run=float(best_run.get_details()['runDefinition']['arguments'][1])
print('Best hyperparameter for logit',hyperparam_best_run)
model_to_save_logit=LogisticRegression(C=hyperparam_best_run)
joblib.dump(model_to_save_logit, 'valeriy_best_model_logit.pkl')

Best hyperparameter for logit 0.2289377431776476


['valeriy_best_model_logit.pkl']

In [9]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")


In [10]:
from train import clean_data
from sklearn.model_selection import train_test_split
# Use the clean_data function to clean your data.
import pandas as pd
x, y = clean_data(ds)
'VC: Split the data on training and testing. Note that this is not in the notebook, but it is in the' 
'checklist'
xy_merged=pd.concat([x,y],axis=1)
xy_merged_train,xy_merged_test=train_test_split(xy_merged,test_size=0.30,random_state=10)


In [11]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='regression',
    primary_metric='r2_score',
    training_data=xy_merged_train,
    label_column_name='y',
    n_cross_validations=2)

In [12]:
# Submit your automl run
automl_run=exp.submit(automl_config)
automl_run.wait_for_completion()



Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_a104eea0-d9ae-424d-a394-c8bce96abcec,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


{'runId': 'AutoML_a104eea0-d9ae-424d-a394-c8bce96abcec',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2022-01-23T14:45:38.814351Z',
 'endTimeUtc': '2022-01-23T15:11:36.215532Z',
 'services': {},
   'message': 'No scores improved over last 20 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'r2_score',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'regression',
  'dependencies_versions': '{"azureml-widgets": "1.37.0", "azureml-train": "1.37.0", "azureml-train-restclients-hyperdrive": "1.37.0", "azureml-train-core": "1.37.0

In [13]:
# Retrieve and save your best automl model.
best_run_automl, fitted_model_automl=automl_run.get_output()
joblib.dump(fitted_model_automl, 'valeriy_best_fitted_model_automl.pkl')

['valeriy_best_fitted_model_automl.pkl']

In [14]:
#Compare the two models
from  sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import math
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=10)

model_to_save_logit.fit(X=x_train, y=y_train)
y_pred_regression = model_to_save_logit.predict(X=x_test)
rmse_regression = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_regression))

y_pred_automl = fitted_model_automl.predict(x_test)
rmse_automl = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_automl))
print('rmse, regression:',rmse_regression,'automl:',rmse_automl)

rmse, regression: 0.2983685194420339 automl: 0.23504610042520246


In [18]:
#Delete cluster
AmlCompute.delete(cluster)