# Training Models in Azure 

You can create scripts to train models using the workspace, which supports frameworks such as scikit-learn, TensorFlow, PyTorch, SparkML, etc. Training scripts can then be run and tracked as experiments.

In [None]:
# Connect to the workspace using the Azure ML SDK
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()

# Report the SDK version and the workspace name. The format string needs one
# placeholder per argument; with a single '{}' the workspace name was dropped.
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

In [None]:
# Prepare the experiment folder for the diabetes training script.
# The folder holds both the training script and the data file it reads.
import os
import shutil

# Folder for the experiment files (no error if it already exists)
training_folder = 'diabetes-training'
os.makedirs(training_folder, exist_ok=True)

# Put a copy of the data file inside the experiment folder
shutil.copy(src='data/diabetes.csv',
            dst=os.path.join(training_folder, "diabetes.csv"))

In [None]:
# create training script and save in folder
%%writefile $training_folder/diabetes_training.py
# import libraries
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 

# Get the experiment run context
run = Run.get_context()

# Load the diabetes dataset
print("Loading Data ...")
diabetes = pd.read_csv("diabetes.csv")

# Seperate features and labels
# Take subset of important data features
X, y = diabetes[['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 
                 'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree',
                 'Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Set regularization hyperparameter
reg = 0.01

# Train logistic regression model
print('Training a logistric regression model with regularization rate of ', reg)
run.log('Regularization rate: ', np.flot(reg))
model = LogisticRegression(C=1/reg, solver='liblinear').fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print("Accuracy:", acc)
run.log('Accuracy: ', np.float(acc))

# Calculate Area under the curve 
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:,1])
print('AUC:',str(auc))
run.log('AUC', np.float(auc))

# Save the trained model in the outputs folder
os.makedirs('outputs', exisit_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

## Use Estimator to run script as experiment

Experiments can be run in two ways:

1. `RunConfiguration` and `ScriptRunConfig`
2. `Estimator` - abstracts these configurations into a single object

In [None]:
from azureml.core import Experiment
from azureml.train.estimator import Estimator

# Describe how the training script is run: the folder containing it, the
# entry script, the compute target and the conda packages to install
estimator = Estimator(
    source_directory=training_folder,
    entry_script='diabetes_training.py',
    compute_target='local',
    conda_packages=['scikit-learn'],
)

# Experiment in the workspace that the run is submitted to
experiment_name = 'diabetes-training'
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the estimator as a run and block until it finishes, echoing its output
run = experiment.submit(estimator)
run.wait_for_completion(show_output=True)


In [None]:
# Display the details of the experiment run in the notebook widget
from azureml.widgets import RunDetails

# The widget output includes a link to Azure Machine Learning studio
details_widget = RunDetails(run)
details_widget.show()

In [None]:
# Alternatively, retrieve the metrics and outputs from the Run object.
# Get the logged metrics and print each name with its value
# (a dict has no .key() method, so iterate over its items instead)
metrics = run.get_metrics()
for key, value in metrics.items():
    print(key, value)
print('\n')
# Print the names of the files associated with the run (includes the outputs)
for file in run.get_file_names():
    print(file)

### Registering trained models

Register trained models to keep track of the different versions of the output models.

In [None]:
from azureml.core import Model

# Read the logged metrics once and reuse them for both properties.
# NOTE: the keys below must match the metric names the training script
# passes to run.log().
run_metrics = run.get_metrics()

# Register the model
# (tags must be a name -> value dict; the original '{"a: b"}' was a set literal)
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context': 'Estimator'},
                   properties={'AUC': run_metrics['AUC'],
                               'Accuracy': run_metrics['Accuracy']})

# List registered models with their tags and properties
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print('\t', tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print('\t', prop_name, ':', prop)
    print('\n')

## Create a Parameterized Training Script

Add parameters to the script so that the same experiment can be repeated with different settings.

In [None]:
# Prepare a separate experiment folder for the parameterized training script
# and the data file it reads.
import os
import shutil

# Folder for the experiment files (no error if it already exists)
training_folder = 'diabetes-training-params'
os.makedirs(training_folder, exist_ok=True)

# Put a copy of the data file inside the experiment folder
shutil.copy(src='data/diabetes.csv',
            dst=os.path.join(training_folder, 'diabetes.csv'))

In [None]:
# Create script containing parameter for the regularization rate hyperparameter
%%writefile $training_folder/diabetes_training.py
# Import libraries
from azureml.core import Run
import pandas as pd
import numpy as np
import joblib
import argparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Get the experiment run context
run = Run.get_context()

# Set regularization hyperparameter
parser = argparse.ArgumentParser()
parser.add_argument('--reg_rate', type=float, dest='reg', default=0.1)
args = parser.parse_args()
reg = args.reg

# Load the diabetes dataset 
print("Loading Data...")
# load the diabetes dataset
diabetes = pd.read_csv('diabetes.csv')

# Separate features and labels
X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate',  np.float(reg))
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

run.complete()

## Running Framework specific estimator

Combining the script below with the parameterized training script above and with model registration lets you keep track of the models produced by different values of the regularization rate parameter (`--reg_rate`).

In [None]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

# Scikit-learn estimator for the parameterized script; script_params supplies
# the --reg_rate argument that the training script parses
estimator = SKLearn(
    source_directory=training_folder,
    entry_script='diabetes_training.py',
    script_params={'--reg_rate': 0.1},
    compute_target='local',
)

# Experiment in the workspace that the run is submitted to
experiment_name = 'diabetes-training'
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the estimator as a new run
run = experiment.submit(estimator)

# Show the run details widget, then wait for the run to finish
RunDetails(run).show()
run.wait_for_completion()

In [None]:
# Print every metric logged by the run, then the names of the run's files
metrics = run.get_metrics()
for metric_name, metric_value in metrics.items():
    print(metric_name, metric_value)
print('\n')
for file_name in run.get_file_names():
    print(file_name)

In [None]:
# Register the model from the parameterized run in the workspace,
# then list every registered model
from azureml.core import Model

# Register the model, tagging it with its training context and storing the
# run's AUC and accuracy metrics as properties
auc_metric = run.get_metrics()['AUC']
accuracy_metric = run.get_metrics()['Accuracy']
run.register_model(
    model_path='outputs/diabetes_model.pkl',
    model_name='diabetes_model',
    tags={'Training context': 'Parameterized SKLearn Estimator'},
    properties={'AUC': auc_metric, 'Accuracy': accuracy_metric},
)

# Print each registered model's name and version with its tags and properties
for registered in Model.list(ws):
    print(registered.name, 'version:', registered.version)
    for tag_name, tag in registered.tags.items():
        print ('\t',tag_name, ':', tag)
    for prop_name, prop in registered.properties.items():
        print ('\t',prop_name, ':', prop)
    print('\n')