In [None]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="udacity-project")
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

In [8]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.
cluster_name='valeriy-cluster'

from azureml.core.compute_target import ComputeTargetException
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',max_nodes=4)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)


In [18]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Specify parameter sampler
ps = RandomParameterSampling({"C":uniform(0.1, 1)})

# Specify a Policy
policy = BanditPolicy(slack_factor=0.25)
'VC:the distance is 80% then 1/(1+0.25)'

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn("./training",compute_target= cluster, entry_script='./train.py')

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator=est,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                    primary_metric_name='Accuracy',
                                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=100)



In [19]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
from azureml.widgets import RunDetails
run=exp.submit(hyperdrive_config,show_output=True)

2021-12-30:20:19:49,486 INFO     [log.py:114] 52126bd3-e0f5-4bf0-8f91-5e92acddd98f - CacheDriver:Cached token is expired at 2021-12-30 20:24:40.019257.  Refreshing
2021-12-30:20:19:49,487 INFO     [log.py:114] 52126bd3-e0f5-4bf0-8f91-5e92acddd98f - TokenRequest:Getting a new token from a refresh token
2021-12-30:20:19:49,598 INFO     [log.py:114] 52126bd3-e0f5-4bf0-8f91-5e92acddd98f - CacheDriver:Returning token refreshed after expiry.
2021-12-30:21:23:56,810 INFO     [log.py:114] acdaf483-8643-4e12-9c69-0a4241a9ddbc - CacheDriver:Cached token is expired at 2021-12-30 21:28:23.598718.  Refreshing
2021-12-30:21:23:56,811 INFO     [log.py:114] acdaf483-8643-4e12-9c69-0a4241a9ddbc - TokenRequest:Getting a new token from a refresh token
2021-12-30:21:23:56,989 INFO     [log.py:114] acdaf483-8643-4e12-9c69-0a4241a9ddbc - CacheDriver:Returning token refreshed after expiry.


In [28]:
RunDetails(run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [27]:
import joblib
# Get your best run and save the model from that run.
from sklearn.linear_model import LogisticRegression
best_run=run.get_best_run_by_primary_metric()
hyperparam_best_run=float(best_run.get_details()['runDefinition']['arguments'][1])
print('Best hyperparameter for logit',hyperparam_best_run)
model_to_save_logit=LogisticRegression(C=hyperparam_best_run)
joblib.dump(model_to_save_logit, 'valeriy_best_model_logit.pkl')

Best hyperparameter for logit 0.8019898401823904


['valeriy_best_model_logit.pkl']

In [1]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files(path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")


In [2]:
from train import clean_data
from sklearn.model_selection import train_test_split
# Use the clean_data function to clean your data.
import pandas as pd
x, y = clean_data(ds)
'VC: Split the data on training and testing. Note that this is not in the notebook, but it is in the' 
'checklist'
xy_merged=pd.concat([x,y],axis=1)
xy_merged_train,xy_merged_test=train_test_split(xy_merged,test_size=0.30,random_state=10)


In [10]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='regression',
    primary_metric='r2_score',
    training_data=xy_merged_train,
    label_column_name='y',
    n_cross_validations=2)

In [11]:
# Submit your automl run
automl_run=exp.submit(automl_config)
automl_run.wait_for_completion()

2021-12-30:19:04:05,516 INFO     [modeling_bert.py:226] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2021-12-30:19:04:05,529 INFO     [modeling_xlnet.py:339] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2021-12-30:19:04:27,263 INFO     [utils.py:157] NumExpr defaulting to 4 threads.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_0ca18977-1c6b-480f-bdb3-8885948766f0,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


2021-12-30:19:25:31,366 INFO     [explanation_client.py:332] Using default datastore for uploads


{'runId': 'AutoML_0ca18977-1c6b-480f-bdb3-8885948766f0',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-12-30T19:04:28.545448Z',
 'endTimeUtc': '2021-12-30T19:24:14.285113Z',
 'services': {},
   'message': 'No scores improved over last 20 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'r2_score',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'regression',
  'dependencies_versions': '{"azureml-widgets": "1.36.0", "azureml-train": "1.36.0", "azureml-train-restclients-hyperdrive": "1.36.0", "azureml-train-core": "1.36.0

In [17]:
# Retrieve and save your best automl model.
best_run_automl, fitted_model_automl=automl_run.get_output()
joblib.dump(fitted_model_automl, 'valeriy_best_fitted_model_automl.pkl')

['valeriy_best_fitted_model_automl.pkl']

In [26]:
#Compare the two models
from  sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import math
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=10)

model_to_save_logit.fit(X=x_train, y=y_train)
y_pred_regression = model_to_save_logit.predict(X=x_test)
rmse_regression = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_regression))

y_pred_automl = fitted_model_automl.predict(x_test)
rmse_automl = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_automl))
print('rmse, regression:',rmse_regression,'automl:',rmse_automl)

rmse, regression: 0.29683885019370304 automl: 0.23516642976204288


In [None]:
#Delete cluster
ComputeTarget.delete()