In [None]:
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cpu-cluster"

# Look the cluster up by name in the workspace first. A ComputeTargetException
# from that lookup is treated as "not found", in which case a new AmlCompute
# cluster is provisioned under the same name.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the compute target reports completion, streaming its output.
cpu_cluster.wait_for_completion(show_output=True)

In [None]:
# Hyperparameter search space: each trial draws one value per script argument
# from the discrete choices listed below.
ps = RandomParameterSampling(
    {
        "--C": choice(0.01, 0.1, 1, 10, 100),
        "--max_iter": choice(50, 100, 150, 200),
    }
)

# Bandit policy handed to HyperDrive as its `policy`.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Script run configuration: runs 'train.py' from the current directory on the
# CPU cluster inside `sklearn_env`.
# Note: 'train.py' must exist in the source directory.
src = ScriptRunConfig(
    source_directory='.',
    script='train.py',
    environment=sklearn_env,
    compute_target=cpu_cluster,
)

# HyperDrive sweep: maximise the 'Accuracy' metric over at most 20 runs,
# with up to 4 running concurrently (matches max_nodes of the cluster cell
# only if that cell is unchanged).
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)

In [None]:
# Submit the HyperDrive configuration to the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Show the run-details widget, then block until the sweep finishes while
# streaming its output into the cell.
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)

In [None]:
# Select the child run of the sweep that did best on the primary metric and
# fetch the metrics it logged.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['Accuracy'])

# Register the model file produced by that run.
# NOTE(review): assumes train.py writes 'outputs/model.joblib' — confirm.
model = best_run.register_model(
    model_path='outputs/model.joblib',
    model_name='hyperdrive_best_model',
)

In [None]:
import pandas as pd

# Bank-marketing training CSV, read as a tabular dataset.
path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path=path)

# Clean the data with the helper from train.py. The result is unpacked into
# two values here, so clean_data must return a (features, label) pair.
from train import clean_data
df = ds.to_pandas_dataframe()
x, y = clean_data(df)

# AutoML takes features and label together, so join them column-wise into a
# single frame.
train_df = pd.concat([x, y], axis="columns")

In [None]:
# AutoML runs on a remote compute target here (compute_target=cpu_cluster),
# and remote AutoML runs need training data that the cluster can reach: a
# TabularDataset, not an in-memory pandas DataFrame. Upload the cleaned frame
# to the workspace's default datastore and register it as a TabularDataset.
# NOTE(review): register_pandas_dataframe needs a reasonably recent
# azureml-core — confirm the installed SDK version provides it.
train_ds = TabularDatasetFactory.register_pandas_dataframe(
    dataframe=train_df,
    target=ws.get_default_datastore(),
    name='bankmarketing_train_clean',
)

# AutoML classification settings: 30-minute experiment budget, accuracy as the
# primary metric, 5-fold cross-validation, early stopping enabled.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=train_ds,
    label_column_name='y', # Adjust this to your actual label column name
    n_cross_validations=5,
    compute_target=cpu_cluster,
    enable_early_stopping=True
)

# Submit the AutoML run and stream its progress into the cell.
remote_run = exp.submit(automl_config, show_output=True)

In [None]:
import os

# Retrieve the best AutoML child run together with its fitted model.
# NOTE: this rebinds `best_run`, which an earlier cell used for the best
# HyperDrive run.
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)

# Save the best model locally. joblib.dump does not create missing parent
# directories, so make sure 'outputs/' exists first.
os.makedirs('outputs', exist_ok=True)
joblib.dump(fitted_model, 'outputs/automl_model.pkl')