In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()

# Report the SDK version and the name of the workspace we connected to
sdk_version, workspace_name = azureml.core.VERSION, ws.name
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, workspace_name))

Ready to use Azure ML 1.16.0 to work with agogemls


In [2]:
import os

# Folder that will hold the script files for the pipeline steps.
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
experiment_folder = 'titanic_pipeline'
os.makedirs(name=experiment_folder, exist_ok=True)

In [3]:
%%writefile $experiment_folder/titanic_data_preparation.py

from azureml.core import Run
import pandas
import argparse
import numpy as np
import os

# Data preparation step: load the 'Titanic' input dataset, clean it, engineer
# features, and save the result as prepared_data.csv inside --output_folder.

# Get the experiment run context
run = Run.get_context()

# Get parameters
# --output_folder is required: without it the script would only fail at the
# very end, when it tries to create the output directory.
parser = argparse.ArgumentParser()
parser.add_argument('--output_folder', type=str, dest='output_folder', required=True, help='output folder')
args = parser.parse_args()
output_folder = args.output_folder

# load the titanic dataset
print("Loading Data...")
train_df = run.input_datasets['Titanic'].to_pandas_dataframe()

# Data cleansing & Feature engineering
# Drop columns that are not used as features
train_df = train_df.drop(['Ticket', 'Cabin', 'PassengerId'], axis=1)

# Title: the word directly before a '.' in the passenger name.
# Raw string so '\.' reaches the regex engine without an invalid-escape warning.
train_df['Title'] = train_df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse uncommon titles into 'Rare'. The label must match the key used in
# title_mapping below; otherwise these rows map to NaN and end up as 0.
train_df['Title'] = train_df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Normalise spelling variants of Miss / Mrs
train_df['Title'] = train_df['Title'].replace('Mlle', 'Miss')
train_df['Title'] = train_df['Title'].replace('Ms', 'Miss')
train_df['Title'] = train_df['Title'].replace('Mme', 'Mrs')

# Encode titles as integers; anything not in the mapping becomes 0
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train_df['Title'] = train_df['Title'].map(title_mapping)
train_df['Title'] = train_df['Title'].fillna(0).astype(int)
train_df = train_df.drop(['Name'], axis=1)

# IsAlone: 1 when the passenger has no siblings/spouse/parents/children aboard
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
train_df['IsAlone'] = 0
train_df.loc[train_df['FamilySize'] == 1, 'IsAlone'] = 1
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)

# Encode sex as an integer
train_df['Sex'] = train_df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

# Fill missing embarkation ports with the most frequent one, then encode
freq_port = train_df.Embarked.dropna().mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(freq_port)
train_df['Embarked'] = train_df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

# Median age for every (Sex, Pclass) combination, used to fill missing ages
guess_ages = np.zeros((2,3))
for i in range(0, 2):
    for j in range(0, 3):
        guess_df = train_df[(train_df['Sex'] == i) & (train_df['Pclass'] == j + 1)]['Age'].dropna()
        age_guess = guess_df.median()
        # Round the median age to the nearest 0.5
        guess_ages[i, j] = int(age_guess / 0.5 + 0.5) * 0.5

# Fill each missing age with the guess for the passenger's (Sex, Pclass) group
for i in range(0, 2):
    for j in range(0, 3):
        train_df.loc[(train_df.Age.isnull()) & (train_df.Sex == i) & (train_df.Pclass == j + 1),
                     'Age'] = guess_ages[i, j]

train_df['Age'] = train_df['Age'].astype(int)

# Bin fares into four ordinal bands (0-3). Each later band's lower bound is
# above the codes already written, so rows binned earlier are not re-binned.
train_df.loc[ train_df['Fare'] <= 7.91, 'Fare'] = 0
train_df.loc[(train_df['Fare'] > 7.91) & (train_df['Fare'] <= 14.454), 'Fare'] = 1
train_df.loc[(train_df['Fare'] > 14.454) & (train_df['Fare'] <= 31), 'Fare']   = 2
train_df.loc[ train_df['Fare'] > 31, 'Fare'] = 3
train_df['Fare'] = train_df['Fare'].astype(int)

# Save prepared data
# os.path.join uses the separator of the platform the step runs on, so the
# file lands inside output_folder, where the training step looks for it.
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, 'prepared_data.csv')
train_df.to_csv(output_file, index = False)

run.complete()

Overwriting titanic_pipeline/titanic_data_preparation.py


In [4]:
%%writefile $experiment_folder/titanic_modelling.py

from azureml.core import Run
import pandas
import argparse
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Training step: read prepared_data.csv from --data_folder, train a logistic
# regression model, log its metrics to the run, and save the model as
# titanic_model.pkl inside --output_folder.

# Get the experiment run context
run = Run.get_context()

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.05, help='regularization rate')
parser.add_argument('--data_folder', type=str, dest='data_folder', default="data", help='data location')
parser.add_argument('--output_folder', type=str, dest='output_folder', default="titanic_model", help='output folder')
args = parser.parse_args()
reg = args.reg_rate
data_folder = args.data_folder
output_folder = args.output_folder

# load prepared data
data_file = os.path.join(data_folder, "prepared_data.csv")
print("Loading Data from " + data_file)
train_df = pandas.read_csv(data_file)

# Separate features and labels
X = train_df.drop("Survived", axis=1)
y = train_df["Survived"]

# Split data into training set and test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)

# Train a logistic regression model
# C is the inverse of the regularization rate passed on the command line.
# Metrics are logged as builtin float: np.float was only an alias for it and
# has been removed from newer NumPy releases.
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate', float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# calculate AUC (column 1 of predict_proba is the score passed to roc_auc_score)
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the trained model in the outputs folder
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "titanic_model.pkl")
joblib.dump(value=model, filename=output_path)

# Complete the run
run.complete()

Overwriting titanic_pipeline/titanic_modelling.py


In [5]:
%%writefile $experiment_folder/register_model.py
# Import libraries
import argparse
import joblib
from azureml.core import Workspace, Model, Run

# Registration step: take the model file written by the training step and
# register it in the workspace under the name 'titanic_model'.

# Read the folder that holds the trained model from the command line
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    '--model_folder',
    type=str,
    dest='model_folder',
    default="titanic_model",
    help='model location',
)
args = arg_parser.parse_args()
model_folder = args.model_folder

# Experiment run context; its workspace is the registration target
run = Run.get_context()

# Load the model file. The loaded object is not used afterwards, but loading
# makes the step fail here if the file is missing or cannot be read.
print("Loading model from " + model_folder)
model_file = model_folder + "/titanic_model.pkl"
model = joblib.load(model_file)

# Register the model file, tagged with how it was trained
Model.register(
    workspace=run.experiment.workspace,
    model_path=model_file,
    model_name='titanic_model',
    tags={'Training context':'Pipeline'},
)

run.complete()

Writing titanic_pipeline/register_model.py


In [9]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Name of the remote compute target the pipeline steps run on
cluster_name = "d12compute"

# Conda and pip packages the step scripts need
packages = CondaDependencies.create(
    conda_packages=['scikit-learn','joblib','numpy', 'pandas'],
    pip_packages=['azureml-defaults','azureml-dataprep[pandas]'],
)

# Python environment for the experiment
titanic_env = Environment("titanic-env")
titanic_env.python.user_managed_dependencies = False  # Azure ML manages the dependencies
titanic_env.docker.enabled = True  # run inside a docker container
titanic_env.python.conda_dependencies = packages

# Run configuration for the pipeline: remote compute plus the environment above
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = cluster_name
pipeline_run_config.environment = titanic_env

In [10]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.estimator import Estimator
from azureml.train.sklearn import SKLearn

# Training dataset registered in the workspace under the name "Titanic"
titanic_ds = ws.datasets.get("Titanic")

# Workspace default datastore; it backs the data passed between steps
def_blob_store = ws.get_default_datastore()

# Intermediate data handed from one step to the next:
#   processed_data - output of data preparation, consumed by training
#   model_folder   - output of training, consumed by registration
processed_data = PipelineData("processed_data", datastore=def_blob_store).as_dataset()
model_folder = PipelineData("model_folder", datastore=def_blob_store)

# Step 1: data preparation
dataprep_step = PythonScriptStep(
    name='Data Preparation',
    source_directory=experiment_folder,
    script_name='titanic_data_preparation.py',
    arguments=["--output_folder", processed_data],
    inputs=[titanic_ds.as_named_input('Titanic')],
    outputs=[processed_data],
    compute_target=cluster_name,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

# Step 2: model training, run through a scikit-learn estimator
estimator = SKLearn(
    source_directory=experiment_folder,
    entry_script='titanic_modelling.py',
    compute_target=cluster_name,
)

train_script_args = [
    '--regularization', 0.4,
    '--data_folder', processed_data,
    '--output_folder', model_folder,
]
train_step = EstimatorStep(
    name="Train Model",
    estimator=estimator,
    estimator_entry_script_arguments=train_script_args,
    inputs=[processed_data],
    outputs=[model_folder],
    compute_target=cluster_name,
    allow_reuse=True,
)

# Step 3: model registration
register_step = PythonScriptStep(
    name="Register Model",
    source_directory=experiment_folder,
    script_name="register_model.py",
    arguments=['--model_folder', model_folder],
    inputs=[model_folder],
    compute_target=cluster_name,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

print("Pipeline steps defined")


Pipeline steps defined


In [11]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Assemble the three steps into a pipeline
pipeline_steps = [dataprep_step, train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Experiment that the pipeline run is recorded under
experiment_name = 'Titanic-experiment-4'
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the pipeline
run = experiment.submit(pipeline)
# Alternative submission that passes regenerate_outputs=True:
# run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Wait for the run to finish while streaming its output
run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Data Preparation [3ef0f670][b84022a6-db76-461d-bf57-6cf9dd9b6ec2], (This step is eligible to reuse a previous run's output)
Created step Train Model [5b1aaf62][04fd9883-ec41-464a-a8d4-d51f7a0cd6e4], (This step is eligible to reuse a previous run's output)
Created step Register Model [5e64809d][400dd729-a8d5-4817-9e29-c56bd19352a6], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 305664eb-9776-459f-9c20-25e9d293f028
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Titanic-experiment-4/runs/305664eb-9776-459f-9c20-25e9d293f028?wsid=/subscriptions/da21a094-26a3-472f-991b-e2b11979af40/resourcegroups/agoge/workspaces/agogemls
Pipeline submitted for execution.
PipelineRunId: 305664eb-9776-459f-9c20-25e9d293f028
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Titanic-experiment-4/runs/305664eb-9776-459f-9c20-25e9d293f028?wsid=/subscriptions/da21a094-26a3-472f-991b-e2b11979af40/resourc


mkl-2019.4           | 204.1 MB  | ########## | 100% 

pip-20.2.4           | 2.0 MB    |            |   0% 
pip-20.2.4           | 2.0 MB    | ########## | 100% 

ncurses-6.0          | 907 KB    |            |   0% 
ncurses-6.0          | 907 KB    | ########## | 100% 

scipy-1.5.2          | 18.5 MB   |            |   0% 
scipy-1.5.2          | 18.5 MB   | 5          |   6% 
scipy-1.5.2          | 18.5 MB   | ##8        |  29% 
scipy-1.5.2          | 18.5 MB   | ####5      |  45% 
scipy-1.5.2          | 18.5 MB   | ######4    |  64% 
scipy-1.5.2          | 18.5 MB   | #######8   |  79% 
scipy-1.5.2          | 18.5 MB   | #########1 |  92% 
scipy-1.5.2          | 18.5 MB   | ########## | 100% 

pytz-2020.1          | 239 KB    |            |   0% 
pytz-2020.1          | 239 KB    | ########## | 100% 

six-1.15.0           | 13 KB     |            |   0% 
six-1.15.0           | 13 KB     | ########## | 100% 

tk-8.6.10            | 3.2 MB    |            |   0% 
tk-8.6.10            


#
# To activate this environment, use
#
#     $ conda activate /azureml-envs/azureml_290f1590c72e373fda6f30b3edac22cc
#
# To deactivate an active environment, use
#
#     $ conda deactivate

[91m

  current version: 4.7.12
  latest version: 4.9.0

Please update conda by running

    $ conda update -n base -c defaults conda


Removing intermediate container fc50b3eb8a87
 ---> ca7c43f47e92
Step 9/15 : ENV PATH /azureml-envs/azureml_290f1590c72e373fda6f30b3edac22cc/bin:$PATH
 ---> Running in e79070345f83
Removing intermediate container e79070345f83
 ---> 39cc73cbaa1f
Step 10/15 : ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/azureml_290f1590c72e373fda6f30b3edac22cc
 ---> Running in 61b36814b4a8
Removing intermediate container 61b36814b4a8
 ---> 799a2792bc9c
Step 11/15 : ENV LD_LIBRARY_PATH /azureml-envs/azureml_290f1590c72e373fda6f30b3edac22cc/lib:$LD_LIBRARY_PATH
 ---> Running in 12a8d0edadeb
Removing intermediate container 12a8d0edadeb
 ---> afe4db4b4124
Step 12/15 : COPY azureml-e


Streaming azureml-logs/75_job_post-tvmps_9904d3a246fd52d5d4f216b93123c3f1d1acaa7d21202bc39c916ed3562937b3_d.txt
Entering job release. Current time:2020-10-23T14:04:31.550144
Starting job release. Current time:2020-10-23T14:04:32.771359
Logging experiment finalizing status in history service.
[2020-10-23T14:04:32.772551] job release stage : upload_datastore starting...
[{}] job release stage : start importing azureml.history._tracking in run_history_release.
[2020-10-23T14:04:32.772818] job release stage : execute_job_release starting...
[2020-10-23T14:04:32.773107] job release stage : copy_batchai_cached_logs starting...
[2020-10-23T14:04:32.773157] job release stage : copy_batchai_cached_logs completed...
Starting the daemon thread to refresh tokens in background for process with pid = 346
[2020-10-23T14:04:32.782895] Entering context manager injector.
[2020-10-23T14:04:33.207096] job release stage : upload_datastore completed...
[2020-10-23T14:04:33.298394] job release stage : send_




StepRunId: 440cbf11-4be6-4788-94c5-e9d38e050805
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Titanic-experiment-4/runs/440cbf11-4be6-4788-94c5-e9d38e050805?wsid=/subscriptions/da21a094-26a3-472f-991b-e2b11979af40/resourcegroups/agoge/workspaces/agogemls
StepRun( Train Model ) Status: NotStarted
StepRun( Train Model ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_9904d3a246fd52d5d4f216b93123c3f1d1acaa7d21202bc39c916ed3562937b3_d.txt
2020-10-23T14:05:06Z Starting output-watcher...
2020-10-23T14:05:06Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2020-10-23T14:05:07Z Executing 'Copy ACR Details file' on 10.0.0.5
2020-10-23T14:05:07Z Copy ACR Details file succeeded on 10.0.0.5. Output: 
>>>   
>>>   
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_2a0e99d2b6c0b56ef3ce1012e5647b1d
Digest: sha256:f5f59812a786bbc7bb4729dd515b55c1b35d2af44ff2883115bca712e218a96c
Status: Image is up to date for 8b


Streaming azureml-logs/70_driver_log.txt
2020/10/23 14:05:17 logger.go:297: Attempt 1 of http call to http://10.0.0.5:16384/sendlogstoartifacts/info
2020/10/23 14:05:17 logger.go:297: Attempt 1 of http call to http://10.0.0.5:16384/sendlogstoartifacts/status
[2020-10-23T14:05:19.045691] Entering context manager injector.
[context_manager_injector.py] Command line Options: Namespace(inject=['ProjectPythonPath:context_managers.ProjectPythonPath', 'Dataset:context_managers.Datasets', 'RunHistory:context_managers.RunHistory', 'TrackUserError:context_managers.TrackUserError'], invocation=['titanic_modelling.py', '--regularization', '0.4', '--data_folder', 'DatasetConsumptionConfig:processed_data', '--output_folder', '/mnt/batch/tasks/shared/LS_root/jobs/agogemls/azureml/440cbf11-4be6-4788-94c5-e9d38e050805/mounts/workspaceblobstore/azureml/440cbf11-4be6-4788-94c5-e9d38e050805/model_folder'])
Initialize DatasetContextManager.
Starting the daemon thread to refresh tokens in background for pr




StepRunId: 52622574-0af2-4af5-be2d-f9c87c3f3625
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/Titanic-experiment-4/runs/52622574-0af2-4af5-be2d-f9c87c3f3625?wsid=/subscriptions/da21a094-26a3-472f-991b-e2b11979af40/resourcegroups/agoge/workspaces/agogemls
StepRun( Register Model ) Status: NotStarted

Streaming azureml-logs/55_azureml-execution-tvmps_9904d3a246fd52d5d4f216b93123c3f1d1acaa7d21202bc39c916ed3562937b3_d.txt
2020-10-23T14:07:05Z Starting output-watcher...
2020-10-23T14:07:05Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
StepRun( Register Model ) Status: Running
2020-10-23T14:07:06Z Executing 'Copy ACR Details file' on 10.0.0.5
2020-10-23T14:07:06Z Copy ACR Details file succeeded on 10.0.0.5. Output: 
>>>   
>>>   
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_e67ca57afeabe2cc88e95355880aeb71
Digest: sha256:ec65e5ec37187f87a84bd2a736c8dadfb45578762775dfbebf98bbe7e97875dd
Status: Image is up to date 


Streaming azureml-logs/75_job_post-tvmps_9904d3a246fd52d5d4f216b93123c3f1d1acaa7d21202bc39c916ed3562937b3_d.txt
Entering job release. Current time:2020-10-23T14:07:27.525958
Starting job release. Current time:2020-10-23T14:07:28.617604
Logging experiment finalizing status in history service.
[2020-10-23T14:07:28.633511] job release stage : upload_datastore starting...
[{}] job release stage : start importing azureml.history._tracking in run_history_release.
[2020-10-23T14:07:28.633848] job release stage : execute_job_release starting...
[2020-10-23T14:07:28.634187] job release stage : copy_batchai_cached_logs starting...
Starting the daemon thread to refresh tokens in background for process with pid = 157
[2020-10-23T14:07:28.634551] job release stage : copy_batchai_cached_logs completed...
[2020-10-23T14:07:28.644449] Entering context manager injector.
[2020-10-23T14:07:29.050618] job release stage : upload_datastore completed...
[2020-10-23T14:07:29.132291] job release stage : send_



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '305664eb-9776-459f-9c20-25e9d293f028', 'status': 'Completed', 'startTimeUtc': '2020-10-23T13:56:05.084878Z', 'endTimeUtc': '2020-10-23T14:08:01.21074Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://agogemls1641700925.blob.core.windows.net/azureml/ExperimentRun/dcid.305664eb-9776-459f-9c20-25e9d293f028/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=7gO1gw8A6COdnG4gNdjEJMxVUS2w96yDxFsJZa5I4EI%3D&st=2020-10-23T13%3A58%3A03Z&se=2020-10-23T22%3A08%3A03Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://agogemls1641700925.blob.core.windows.net/azureml/ExperimentRun/dcid.305664eb-9776-459f-9c20-25e9d293f028/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=XzOVKMrUd8axsiZr4ANLktcWmUzTdmU5cpppr7Yk%2B9I%3D&st=2020-10-23T13%3A58%3A03Z&se=2020-

'Finished'

In [12]:
from azureml.core import Model

# Print every model registered in the workspace: name and version first,
# then one indented line per tag and per property, then a blank separator.
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')

titanic_model version: 1
	 Training context : Pipeline


Titanic version: 5
	 Training context : Remote Estimator
	 Regularization Rate : 0.45
	 AUC : 0.8230025183150182
	 Accuracy : 0.7985074626865671


Titanic version: 4
	 Training context : Remote Estimator
	 Regularization Rate : 0.4
	 AUC : 0.822945283882784
	 Accuracy : 0.7947761194029851


Titanic version: 3
	 Training context : Estimator
	 Regularization Rate : 0.4
	 AUC : 0.822945283882784
	 Accuracy : 0.7947761194029851


Titanic version: 2
	 Training context : Estimator
	 AUC : 0.823345924908425
	 Accuracy : 0.7798507462686567


Titanic version: 1
	 Training context : Local
	 AUC : 0.823345924908425
	 Accuracy : 0.7798507462686567


