In [1]:
from azureml.core import Workspace, Experiment

# Configure experiment
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="titanic-lgbm")

In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

def get_aml_cluster(ws, cluster_name, vm_size='STANDARD_D2_V2', max_nodes=4):
    try:
        cluster = ComputeTarget(workspace=ws, name=cluster_name)
    except ComputeTargetException:
        compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=max_nodes)
        cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    cluster.wait_for_completion(show_output=True)    
    return cluster

In [3]:
aml_cluster = get_aml_cluster(ws, cluster_name="amldemocompute")

Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('data/train.csv')
df.drop(['PassengerId'], axis=1, inplace=True)

# 'Embarked' is stored as letters, so fit a label encoder to the train set to use in the loop
embarked_encoder = LabelEncoder()
embarked_encoder.fit(df['Embarked'].fillna('Null'))
 
# Record anyone travelling alone
df['Alone'] = (df['SibSp'] == 0) & (df['Parch'] == 0)

# Transform 'Embarked'
df['Embarked'].fillna('Null', inplace=True)
df['Embarked'] = embarked_encoder.transform(df['Embarked'])

# Transform 'Sex'
df.loc[df['Sex'] == 'female','Sex'] = 0
df.loc[df['Sex'] == 'male','Sex'] = 1
df['Sex'] = df['Sex'].astype('int8')

# Drop features that seem unusable. Save passenger ids if test
df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [5]:
import os
from azureml.core import Dataset

def df_to_dataset(ws, df, name, data_dir='./data'):
    data_path = os.path.join(data_dir, "%s.csv" % name)
    
    # save data to disk
    df.to_csv(data_path)
    
    # get the default datastore
    datastore = ws.get_default_datastore()

    # upload the data to the datastore
    datastore.upload(src_dir=data_dir, target_path=data_dir)
    
    # create a dataset
    dataset = Dataset.Tabular.from_delimited_files(datastore.path(data_path))
    
    # register the dataset
    dataset.register(workspace=ws, name=name, create_new_version=True)
    return dataset

In [6]:
df_to_dataset(ws, df, 'titanic_cleaned')

Uploading an estimated of 2 files
Target already exists. Skipping upload for data/titanic_cleaned.csv
Target already exists. Skipping upload for data/train.csv
Uploaded 0 files


{
  "source": [
    "('workspaceblobstore', './data/titanic_cleaned.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [7]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
 
def run_config(target, packages=None):
    packages = packages or []
    config = RunConfiguration()

    config.target = target
    config.environment.docker.enabled = True
    config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    
    azureml_pip_packages = [
        'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',
        'azureml-interpret', 'sklearn-pandas', 'azureml-dataprep'
    ]
    
    config.auto_prepare_environment = True
    config.environment.python.user_managed_dependencies = False
    config.environment.python.conda_dependencies = CondaDependencies.create(pip_packages=azureml_pip_packages + packages)
    
    return config

In [12]:
# Create a remote run configuration
run_amlcompute = run_config(aml_cluster, [
    'numpy', 'pandas', 'matplotlib', 'seaborn', 'scikit-learn', 'lightgbm', 'umap-learn'
])



In [13]:
script_params = [
    '--boosting', 'dart',
    '--learning-rate', '0.05',
    '--drop-rate', '0.15',
]

In [14]:
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

script = 'train_lightgbm.py'
script_folder = os.getcwd()

src = ScriptRunConfig(
  source_directory=script_folder,
  script=script,
  run_config=run_amlcompute,
  arguments=script_params)

run = exp.submit(src)

RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [15]:
print(run.get_portal_url())

https://ml.azure.com/experiments/titanic-lgbm/runs/titanic-lgbm_1587937268_952a5078?wsid=/subscriptions/21dc412b-d9eb-42e7-8317-55bc8eb10cf5/resourcegroups/packt-mastering-azure-machine-learning/workspaces/packt
