Load workspace

In [1]:
import azureml.core
from azureml.core import Workspace

ws = Workspace.from_config()
print(ws.name, "loaded")


mldevelopment loaded


Get Datastore and Dataset

In [2]:
from azureml.core import Datastore, Dataset

dstore = Datastore.get(ws, datastore_name='patient_training')

dset = Dataset.get_by_name(ws, 'PatientTrainingModelingData')

Create folder for experiment files

In [3]:
import os

experiment_folder = 'IPVisits_Experiment'
os.makedirs(experiment_folder, exist_ok=True)
print(experiment_folder, 'folder created')

IPVisits_Experiment folder created


Write Model Training Script

In [4]:
%%writefile $experiment_folder/IPVisits_Training.py
import os
import pandas as pd
import numpy as np
import joblib
import argparse
from azureml.core import Run
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, f1_score
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.calibration import calibration_curve
from sklearn.feature_selection import mutual_info_classif

# Fairlearn packages
from fairlearn.metrics import selection_rate, MetricFrame
from raiwidgets import FairnessDashboard
from fairlearn.metrics._group_metric_set import _create_group_metric_set
from azureml.contrib.fairness import upload_dashboard_dictionary, download_dashboard_by_upload_id

# Import libraries for model explanation
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer

# Get the experiment run context
run = Run.get_context()

# Get the script argument
parser = argparse.ArgumentParser()
#parser.add_argument('--ntrees', type=int, dest='ntrees', default=10, help='number of trees')
parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')
parser.add_argument("--cost_fp", type=int, dest='cost_fp', default=1, help='relative cost of a false positive')
parser.add_argument("--cost_fn", type=int, dest='cost_fn', default=1, help='relative cost of a false negative')
args = parser.parse_args()
#ntrees = args.ntrees
cost_fp = args.cost_fp
cost_fn = args.cost_fn

# Load the Data
print("Loading Data...")
dset = run.input_datasets['training_data'].to_pandas_dataframe()
#dset=dset.head(10000) #to decrease wait time

# Log raw counts
run.log('raw row count', len(dset))
run.log('raw column count', len(dset.columns))

# Fill in missing values
dfwithmissing = dset[dset.columns[dset.isna().any()]].fillna(dset[dset.columns[dset.isna().any()]].mean()).fillna('Missing') #(df[colswithmissing].mode().iloc[0])
dfnotmissing = dset.drop(dset.columns[dset.isna().any()], axis=1)
dset = pd.concat([dfnotmissing, dfwithmissing], axis=1)
#dset.fillna(dset.mean(), inplace=True)
#dset.fillna('Missing', inplace=True)
print('Imputed missing values...')

# Get sensitive features
S = dset[['FeatureRaceDSC', 'FeatureSexDSC', 'FeatureReligionDSC', 'FeatureLanguageDSC']]
print('Retrieved sensitive features...')

# One hot encode 
features = [col for col in dset.columns if "Feature" in col]
features_to_encode = [col for col in dset.columns[dset.dtypes==object].tolist() if col in features]
dset = pd.get_dummies(dset, columns=features_to_encode)
print('Columns after one hot encoding:', len(dset.columns))

final_features = ['FeatureEmergencyEncountersInPastYearNBR','FeatureInpatientEncountersInPastYearNBR',
 'FeatureHypertensionFLG','FeatureMedicationAnalgesicsNonNarcoticCNT',
 'FeatureMedicationAntiasthmaticCNT','FeatureMedicationMultivitaminsCNT',
 'FeatureAgeNBR','FeatureLabLastAlbuminVAL','FeatureLabLastBloodUreaNitrogenVAL',
 'FeatureLabLastGlucoseVAL','FeatureLabLastHDLVAL','FeatureLabLastLDLVAL',
 'FeatureLabLastPlateletVAL','FeatureLabLastSodiumVAL','FeatureLabLastWhiteBloodCellsVAL',
 'FeatureLabLastHematocritVAL','FeatureLabLastChlorideVAL','FeatureLabLastAlkalinePhosphataseVAL',
 'FeatureLabLastASTVAL','FeatureLabLastALTVAL','FeatureVitalLastDiastolicBloodPressureVAL',
 'FeatureVitalLastSystolicBloodPressureVAL','FeatureVitalLastPulseVAL','FeatureVitalLastRespirationsVAL',
 'FeatureVitalLastWeightVAL','FeatureAlcoholUseDSC_No','FeatureAlcoholUseDSC_Not Asked',
 'FeatureSexuallyActiveDSC_Not Asked','FeatureEmploymentDSC_Full Time','FeatureReligionDSC_Missing']
print('Created final feature set...')

# Separate features and target
X, y = dset[final_features].values, dset['IPVisitNext12MonthsFLG'].values

# Split data into training, validating and testing sets
X_train, X_rest, y_train, y_rest, S_train, S_rest = train_test_split(X, y, S, test_size=0.30, random_state=3456, stratify=y)
X_valid, X_test, y_valid, y_test, S_valid, S_test = train_test_split(X_rest, y_rest, S_rest, test_size=0.50, random_state=123, stratify=y_rest)

# Train logistic regression model
print("Training a logistic regression model...")
model = LogisticRegression(penalty='l1',C=0.00178, solver='liblinear').fit(X_train, y_train)

# Get predicted values
print("Predicting on test set...")
y_scores = model.predict_proba(X_test) # probability predictions
fpr, tpr, threshold = roc_curve(y_test, y_scores[:,1])
cost = (fpr*cost_fp*len(y_test[y_test==0])) + ((1-tpr)*cost_fn*len(y_test[y_test==1]))
best_index = np.argmin(cost) # find index where cost is minimized

best_tpr = tpr[best_index]
best_fpr = fpr[best_index]
best_cost = cost[best_index]
best_cutoff = threshold[best_index] # get optimal threshold based on minimizing cost

y_hat = np.where(y_scores[:,1] > best_cutoff, 1, 0) # class predictions based on optimal threshold

# calculate model evaluation metrics
print("Evaluating overall model performance...")
acc = np.average(y_hat == y_test)
auc = roc_auc_score(y_test,y_scores[:,1]) 
brier = sum((y_scores[:,1] - y_test)**2)/len(y_test) 
confMatrix = confusion_matrix(y_test, y_hat)
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
f1 = f1_score(y_test, y_hat)
precision = precision_score(y_test, y_hat)
recall = recall_score(y_test, y_hat)
bal_acc = balanced_accuracy_score(y_test, y_hat)
prevalence = y_test.sum()/len(y_test)
detection = sum(np.where((y_test ==1) & (y_hat ==1), 1, 0))/len(y_test)
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
ppv = tp/(tp+fp)
npv = tn/(tn+fn)
kappa = cohen_kappa_score(y_test, y_hat)
selection = selection_rate(y_test, y_hat)

# Log metric results
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))
print('Brier Score: ', brier)
run.log('Brier Score', np.float(brier))
print('Confusion Matrix:', confMatrix)
run.log('Confusion Matrix', confMatrix)
print('F1-Score: ', f1)
run.log('F1-Score', np.float(f1))
print('Precision', precision)
run.log('Precision', np.float(precision))
print('Recall', recall)
run.log('Recall', np.float(recall))
print('Balanced Accuracy', bal_acc)
run.log('Balanced Accuracy', np.float(bal_acc))
print('Prevalence', prevalence)
run.log('Prevalence', np.float(prevalence))
print('Detection Rate', detection)
run.log('Detection Rate', np.float(detection))
print('Sensitivity', sensitivity)
run.log('Sensitivity', np.float(sensitivity))
print('Specificity', specificity)
run.log('Specificity', np.float(specificity))
print('Positive Predictive Value', ppv)
run.log('Positive Predictive Value', np.float(ppv))
print('Negative Predictive Value', np.float(npv))
run.log('Negative Predictive Value',np.float(npv))
print('Selection Rate', np.float(selection))
run.log('Selection Rate', selection)

# ROC curve and log it
plt.title("ROC Curve (AUC: " + str(round(auc, 2)) + ")")
plt.plot(fpr, tpr, label='AUC: ' + str(round(auc, 2)))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
run.log_image('ROC Curve', plot=plt)
plt.clf()

# Calibration curve and log it
prob_true, prob_pred = calibration_curve(y_test, y_scores[:,1], n_bins=10)
plt.title("Calibration Curve")
plt.plot(prob_pred, prob_true, marker='o', linewidth=1)
plt.xlabel('Predicted Probability')
plt.ylabel('True Probability')
run.log_image('Calibration Curve', plot=plt)

# Save the trained model in the outputs folder
os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/IPVisit_Model.pkl')

# Get fairness in dashboard
sf = {'FeatureRaceDSC': S_test.FeatureRaceDSC, 'FeatureSexDSC':S_test.FeatureSexDSC, 'FeatureReligionDSC':S_test.FeatureReligionDSC, 'FeatureLanguageDSC':S_test.FeatureLanguageDSC}
model_id = 'ipvisits_model:1'
ys_pred = { model_id:y_hat }
dash_dict = _create_group_metric_set(y_true=y_test,
                                    predictions=ys_pred,
                                    sensitive_features=sf,
                                    prediction_type='binary_classification')


dashboard_title = "Fairness insights of IP Visit Classifier"
upload_id = upload_dashboard_dictionary(run,
                                        dash_dict,
                                        dashboard_name=dashboard_title)


# Get explanation in dashboard
labels = ['No ED Visit', 'ED Visit']
#features = [col for col in dset.columns if "Feature" in col]
explainer = TabularExplainer(model, X_train, features=final_features, classes=labels)
explanation = explainer.explain_global(X_test)

# Get an Explanation Client and upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')

# Complete the run
run.complete()

Overwriting IPVisits_Experiment/IPVisits_Training.py


Define Environment

In [5]:
%%writefile $experiment_folder/experiment_env.yml
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- numpy
- pip
- pip:
  - azureml-defaults
  - pyarrow
  - azureml-interpret
  - raiwidgets
  - fairlearn
  - azureml-contrib-fairness

Overwriting IPVisits_Experiment/experiment_env.yml


Run the training script as an experiment

In [8]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.widgets import RunDetails

# Create a Python environment for the experiment (from a .yml file)
env = Environment.from_conda_specification("experiment_env", experiment_folder + "/experiment_env.yml")
print("Environment Created...")

# Get training dataset
ip_ds = ws.datasets.get("PatientTrainingModelingData")
print("Got Training Data...")

# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                                script='IPVisits_Training.py',
                                arguments = [#'--ntrees', 500,
                                            '--input-data', ip_ds.as_named_input('training_data'),
                                             '--cost_fp',1,
                                             '--cost_fn',6],
                                environment=env) 
print("Created script configuration....")

# submit the experiment run
experiment_name = 'train-ipvisits'
experiment = Experiment(workspace=ws, name=experiment_name)
print("Made experiment...")
run = experiment.submit(config=script_config)
print("Experiment submitted...")

# Show the running experiment run in the notebook widget
RunDetails(run).show()

# Block until the experiment run has completed
run.wait_for_completion()

Environment Created...
Got Training Data...
Created script configuration....
Made experiment...
Experiment submitted...


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'train-ipvisits_1636556939_68dc3e16',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-11-10T15:09:01.811736Z',
 'endTimeUtc': '2021-11-10T15:34:39.836859Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'acc4a493-e4f4-4c2e-b20d-66a1128dad2b'},
 'inputDatasets': [{'dataset': {'id': '0e8005de-54f0-42b7-aa63-d6d50d8bc65e'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'training_data', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'IPVisits_Training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--input-data',
   'DatasetConsumptionConfig:training_data',
   '--cost_fp',
   '1',
   '--cost_fn',
   '6'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'training_data': {'dataLocation': {'dataset': {'id': '0e8005de-54f0-42b7-aa63-d6d50d8bc65e',
      'name': 'Pa

In [40]:
# Get logged metrics and files
metrics = run.get_metrics()
for key in metrics.keys():
        print(key, metrics.get(key))
print('\n')
for file in run.get_file_names():
    print(file)

raw_rows 10000
raw_cols 202
processed_rows 10000
processed_cols 338
Accuracy 0.7326666666666667
AUC 0.5925805547991266
Brier Score 0.15483333333333332
Confusion Matrix array([[2041,  566],
       [ 236,  157]])
F1-Score 0.28136200716845877
Precision 0.21715076071922546
Recall 0.3994910941475827
Balanced Accuracy 0.5911916537097714
Prevalence 0.131
Detection Rate 0.052333333333333336
Sensitivity 0.3994910941475827
Specificity 0.7828922132719601
Positive Predictive Value 0.21715076071922546
Negative Predictive Value 0.896354852876592
Selection Rate 0.241
ROC Curve aml://artifactId/ExperimentRun/dcid.train-edvisits_1635438326_3a574ba9/ROC Curve_1635438470.png
Calibration Curve aml://artifactId/ExperimentRun/dcid.train-edvisits_1635438326_3a574ba9/Calibration Curve_1635438470.png
Metrics by group:                      selection_rate    recall precision
FeatureRaceDSC_White                                   
Not White                  0.261759  0.479452  0.273438
White                      

Register the trained model

In [14]:
from azureml.core import Model

# Register the model
run.register_model(model_path='outputs/IPVisit_Model.pkl', model_name='ipvisits_model',
                   tags={'Training context':'Script'},
                   properties={'AUC': run.get_metrics()['AUC'], 'Accuracy': run.get_metrics()['Accuracy'],
                              'Brier Score': run.get_metrics()['Brier Score'], 
                               'Confusion Matrix': run.get_metrics()['Confusion Matrix'],
                               'F1-Score': run.get_metrics()['F1-Score'], 'Precision': run.get_metrics()['Precision'],
                               'Recall': run.get_metrics()['Recall'], 
                               'Balanced Accuracy': run.get_metrics()['Balanced Accuracy'],
                               'Prevalence': run.get_metrics()['Prevalence'],
                               'Detection Rate': run.get_metrics()['Detection Rate'],
                               'Sensitivity': run.get_metrics()['Sensitivity'],
                               'Specificity': run.get_metrics()['Specificity'],
                               'Positive Predictive Value': run.get_metrics()['Positive Predictive Value'],
                               'Negative Predictive Value': run.get_metrics()['Negative Predictive Value'],
                               'Selection Rate': run.get_metrics()['Selection Rate']})



# List registered models
#for model in Model.list(ws):
#    print(model.name, 'version:', model.version)
#    for tag_name in model.tags:
#        tag = model.tags[tag_name]
#        print ('\t',tag_name, ':', tag)
#    for prop_name in model.properties:
#        prop = model.properties[prop_name]
#        print ('\t',prop_name, ':', prop)
#    print('\n')

Model(workspace=Workspace.create(name='mldevelopment', subscription_id='c99970ac-0655-46fb-9058-4407cd79c5cb', resource_group='unitypointhealthdatascience'), name=ipvisits_model, id=ipvisits_model:1, version=1, tags={'Training context': 'Script'}, properties={'AUC': '0.7476514164326192', 'Accuracy': '0.8654174449197646', 'Brier Score': '0.04756087778335743', 'Confusion Matrix': 'array([[167778,  21032],\n       [  5772,   4582]])', 'F1-Score': '0.25478202846975084', 'Precision': '0.1788865464199266', 'Recall': '0.4425342862661773', 'Balanced Accuracy': '0.6655709406014431', 'Prevalence': '0.05198730694302183', 'Detection Rate': '0.02300616577293085', 'Sensitivity': '0.4425342862661773', 'Specificity': '0.8886075949367088', 'Positive Predictive Value': '0.1788865464199266', 'Negative Predictive Value': '0.9667415730337079', 'Selection Rate': '0.12860757968307526'})