## The below script conducts hyperparameter tuning, model deployment, and inference predictions on the holdout test data for both a XGBoost and Linear-Learner model respectively.

### Outline of Notebook:

#### 3.1 Establish environment and define variables
&nbsp;&nbsp;&nbsp;&nbsp;Function: <u>CSV_Reader()</u>
#### 3.2 Perform K-Fold split and convert train datasets to protobuf format
&nbsp;&nbsp;&nbsp;&nbsp;Function: <u>Export_Processed_Protobuf()</u>
#### 3.3 Tune Hyperparameters
#### 3.4 Build Models
#### 3.5 Deploy Models
#### 3.6 Make Batch Predictions on Test Data
#### 3.7 Export test data with predictions
&nbsp;&nbsp;&nbsp;&nbsp;Function: <u>Export_Processed_CSV()</u>

### 3.1 Establish Environment and Define Variables

In [None]:
#pip install --upgrade numexpr

In [None]:
#pip install --upgrade s3fs

#### Importing Libraries

In [None]:
import datetime

In [18]:
#Data Manipulation Libraries
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
current_date = datetime.now()

#Sagemaker/related Libraries
import boto3
import sagemaker
smclient = boto3.Session().client('sagemaker')
s3 = boto3.client('s3')
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac
from sagemaker.session import Session

#Machine Learning Libraries
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.inputs import TrainingInput
from sklearn.model_selection import KFold
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import image_uris

#Protobuf Libraries
import io
from io import StringIO
from io import BytesIO

#### Here we define variables that will be used later

In [2]:
role = get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = 'diabetes-directory' #main directory
input_prefix = 'diabetes_processed_data'  #sub-directory
k_folder = 'k' #subdirectory for k-fold datasets

test_dataset = 'reduced_dimensions_diabetes_test.csv' #holdout test set
train_dataset = "reduced_dimensions_diabetes_train.csv" #set to be converted to protobuf and trained upon
xgb_full_train_proto_filename = "xgb_full_train_proto.data" #final training dataset to train XGBoost model on optimized hyperparameters
train_proto_filename = "train_proto.data" #final training dataset to train a Linear Learner model on optimized hyperparameters
validation_proto_filename = "validation_proto.data" #final validation dataset to train a Linear Learner model on optimized hyperparameters
test_with_predictions = "test_with_predictions.csv" #exporting our test dataset once all our predictions have been made

feature_dim = 61 #The number of features we will use to train our linear learner model
linear_job_name = "diabetes-job-linear" #job name for the linear learner model

#### Defining filepaths that we will use later

In [3]:
s3_train_proto_filepath = 's3://{}/{}/{}/{}'.format(bucket, input_prefix, k_folder, train_proto_filename)
hyperparam_output_filepath = "s3://{}/{}/hyperparam_output".format(bucket, input_prefix)
XGB_model_output_filepath = "s3://{}/{}/xgb_output".format(bucket, input_prefix)
Linear_model_output_filepath = "s3://{}/{}/linear_output".format(bucket, input_prefix)
linear_training_data_location = 's3://{}/{}/{}/{}'.format(bucket, input_prefix, k_folder, train_proto_filename)
linear_validation_data_location = 's3://{}/{}/{}/{}'.format(bucket, input_prefix, k_folder, validation_proto_filename)

In [4]:
def CSV_Reader(bucket, subfolder, source_file_name):
    
    '''The CSV_Reader() function takes in the names of the bucket, subfolder, and source file name, and desired dataframe name. 
    It first constructs the filepath, and then imports the file residing at this filepath, based on the title fed to the function.
    
    Arguments 
    --------- 
    bucket: Head S3 repository bucket
    subfolder: Subfolder containing the source data
    source_file_name: Name of source CSV data file 
    
    Return
    ---------
    Returns the source data in a pandas dataframe '''
    
    data_location = 's3://{}/{}/{}'.format(bucket, subfolder, source_file_name)  
    dataset = pd.read_csv(data_location, low_memory=False, header='infer')
    return dataset

#### Reading in the train dataset

In [5]:
diabetes = CSV_Reader(bucket, input_prefix, train_dataset)

In [6]:
print(diabetes.shape)
diabetes.sample()

(85096, 62)


Unnamed: 0,readmitted,num_lab_procedures,num_medications,time_in_hospital,number_inpatient,num_procedures,number_diagnoses,number_outpatient,number_emergency,gender_Male,...,discharge_disposition_hospice,diag_2_blooddis,diag_1_mentaldis,diag_2_infection,diag_1_skin,race_Hispanic,diag_3_neoplasm,age_3,max_glu_serum_>300,max_glu_serum_>200
79364,0,49,28,3,0,3,9,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### 3.2 Perform K-Fold split and convert train datasets to protobuf format

Here we take the data-preparation-steps required to train our models and optimize our hyperparameters

#### Preparing a function that will convert a dataset into protobuf format and export to S3

In [7]:
def Export_Processed_Protobuf(bucket, processed_data_folder, k_folder, local_file, S3_file_name):
    
    '''Exports a dataframe in protobuf format, and sends it to a specified S3 bucket location
    
    Arguments 
    --------- 
    bucket: A list of the columns (i.e. the 3 diagnosis columns) to be updated
    processed_data_folder: the relevant subfolder within the main bucket
    local_file_name: The name of the dataframe within the notebook
    S3_file_name: The name of the file upon export (with .data extension included)
    
    Return
    ---------
    Exports a protobuf file to a specified S3 location'''
    
#Here we seperate out the input and output values
    X_values = local_file.drop(columns='readmitted').values
    y_values = local_file['readmitted'].values
    
#Here we set up our code to transform the data    
    f = io.BytesIO()
    smac.write_numpy_to_dense_tensor(f, X_values.astype('float32'), y_values.astype('float32'))
    f.seek(0)
    
#Here we upload the data    
    boto3.Session().resource('s3').Bucket(bucket).Object('{}/{}/{}'.format(processed_data_folder, k_folder, S3_file_name)).upload_fileobj(f)
    training_recordIO_protobuf_location = 's3://{}/{}/{}/{}'.format(bucket, processed_data_folder, k_folder, S3_file_name)
    
    print('The Pipe mode recordIO protobuf training data: {}'.format(training_recordIO_protobuf_location))

Protobuf data format provides a more computationally-effecient means of processing data for our multiple training jobs (relative to .csv format). 

Therefore, it is good operational practice to convert our finalized training data into protobuf format, as it will be computed upon extensively during the training process. Once a finalized model has been trained off of this data, our holdout test dataset (still in csv format) can be run through the model, providing easily interpretable inference on our final test dataset.

#### Exporting the full train dataset to S3 in protobuf. This will be later used to train our final XGBoost model.

In [8]:
Export_Processed_Protobuf(bucket, input_prefix, k_folder, diabetes, xgb_full_train_proto_filename)

The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/xgb_full_train_proto.data


#### Here, we create an 80/20 split on our training data for our final Linear Learner model. 

While our final XGBoost model can be trained off of a single comprehensive train dataset, our final Linear Learner model requires a seperate training and validation dataset (separate from our K-fold datasets) for its training. We provide this below.

In [9]:
total_rows = len(diabetes)

# Set the seed for reproducibility
np.random.seed(42)

# Generate random indices for the train and test sets
indices = np.random.permutation(total_rows)
train_size = int(0.8 * total_rows)

# Use the first 80% of indices for training, the rest for testing
train_indices, test_indices = indices[:train_size], indices[train_size:]

# Create training and test sets using the selected indices
optimized_linear_train_set = diabetes.iloc[train_indices]  # For pandas DataFrame
optimized_linear_test_set = diabetes.iloc[test_indices]

In [10]:
print(optimized_linear_train_set.shape)
print(optimized_linear_test_set.shape)

(68076, 62)
(17020, 62)


#### Here we export our final Linear Learner Train/Validation datasets

In [11]:
Export_Processed_Protobuf(bucket, input_prefix, k_folder, optimized_linear_train_set, train_proto_filename)
Export_Processed_Protobuf(bucket, input_prefix, k_folder, optimized_linear_test_set, validation_proto_filename)

The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/train_proto.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/validation_proto.data


#### Here we specify indicies for our different 5-K-Folds. These 5 folds will create 5 distinct permeations of train/validation datasets for our model to train on

In [12]:
# Specifies the number of folds for k-fold cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Use globals() to create variables dynamically
for fold, (train_index, valid_index) in enumerate(kf.split(diabetes), 1):
    # Creates variables for train and valid datasets dynamically
    globals()[f'train_data_{fold}'] = diabetes.iloc[train_index].copy()
    globals()[f'validation_data_{fold}'] = diabetes.iloc[valid_index].copy()

In [13]:
train_data_1.sample()

Unnamed: 0,readmitted,num_lab_procedures,num_medications,time_in_hospital,number_inpatient,num_procedures,number_diagnoses,number_outpatient,number_emergency,gender_Male,...,discharge_disposition_hospice,diag_2_blooddis,diag_1_mentaldis,diag_2_infection,diag_1_skin,race_Hispanic,diag_3_neoplasm,age_3,max_glu_serum_>300,max_glu_serum_>200
1917,0,25,12,3,0,3,6,2,0,0,...,0,0,0,0,0,1,0,0,0,0


#### The below code uses our code indecies to export our 10 different datasets to be used in our hyperparameter tuning (5 training and 5 validation)

In [14]:
for i in range(1, 6):
    train_dataset_name = f'train_data_{i}'
    train_dataset = globals()[train_dataset_name]
    train_proto_filename = f'{train_dataset_name}.data'
    Export_Processed_Protobuf(bucket, input_prefix, k_folder, train_dataset, train_proto_filename)
    
    validation_dataset_name = f'validation_data_{i}'
    validation_dataset = globals()[validation_dataset_name]
    validation_csv_filename = f'{validation_dataset_name}.data'
    Export_Processed_Protobuf(bucket, input_prefix, k_folder, validation_dataset, validation_csv_filename)

The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/train_data_1.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/validation_data_1.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/train_data_2.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/validation_data_2.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/train_data_3.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/validation_data_3.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/train_data_4.data
The Pipe mode recordIO protobuf training data: s3://diabetes-directory/diabetes_processed_data/k/validation_data_4.data
The Pipe mode recordIO protobuf training data: s3://diabetes

#### The below code makes a list of the different train/validation filepath pairings to be referenced in our hyperparameter tuning jobs

In [15]:
train_filepath = []
validation_filepath = []

for i in range(1, 6):
    train_filepath.append(f's3://{bucket}/{input_prefix}/{k_folder}/train_data_{i}.data')
    validation_filepath.append(f's3://{bucket}/{input_prefix}/{k_folder}/validation_data_{i}.data')

filepaths_list = list(zip(train_filepath, validation_filepath))
print(filepaths_list)

[('s3://diabetes-directory/diabetes_processed_data/k/train_data_1.data', 's3://diabetes-directory/diabetes_processed_data/k/validation_data_1.data'), ('s3://diabetes-directory/diabetes_processed_data/k/train_data_2.data', 's3://diabetes-directory/diabetes_processed_data/k/validation_data_2.data'), ('s3://diabetes-directory/diabetes_processed_data/k/train_data_3.data', 's3://diabetes-directory/diabetes_processed_data/k/validation_data_3.data'), ('s3://diabetes-directory/diabetes_processed_data/k/train_data_4.data', 's3://diabetes-directory/diabetes_processed_data/k/validation_data_4.data'), ('s3://diabetes-directory/diabetes_processed_data/k/train_data_5.data', 's3://diabetes-directory/diabetes_processed_data/k/validation_data_5.data')]


## 3.3 Tuning Hyperparameters

Here we determine the optimal hyperparameters for both our XGBoost and Linear Learner models.

### 3.3.1 XGBoost Hyperparameters

#### Below we configure a range of hyperparameters, and related variables, for use in our  XGBoost model

In [20]:
XGB_tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "eta"
        },
        {
          "MaxValue": "2",
          "MinValue": "0",
          "Name": "alpha"
        },
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "min_child_weight"
        },

      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "max_depth"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 50,
      "MaxParallelTrainingJobs": 5
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:auc",
      "Type": "Maximize"
    },
    "RandomSeed" : 123
  }

#### Here we define the 5 sequential hyperparameter tuning jobs which we will run on our XGBoost model, with our 5 K-Folds

In [30]:
XGB_tuning_job_names = []

for i in range(0, 5):
    # sagemaker.image_uris.retrieve
    training_image = sagemaker.image_uris.retrieve(framework='xgboost', region='us-east-1', version='1.0-1')

    # Identifying the optimal hyperparameters, and specifying input/output file paths
    XGB_training_job_definition = {
        "AlgorithmSpecification": {
            "TrainingImage": training_image,
            "TrainingInputMode": "Pipe"
        },
        "InputDataConfig": [
            {
                "ChannelName": "train",
                "CompressionType": "None",
                "ContentType": "application/x-recordio-protobuf",  # Change content type to protobuf
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri": filepaths_list[i][0]
                    }
                }
            },
            {
                "ChannelName": "validation",
                "CompressionType": "None",
                "ContentType": "application/x-recordio-protobuf",  # Change content type to protobuf
                "DataSource": {
                    "S3DataSource": {
                        "S3DataDistributionType": "FullyReplicated",
                        "S3DataType": "S3Prefix",
                        "S3Uri": filepaths_list[i][1]
                    }
                }
            }
        ],
        "OutputDataConfig": {
            "S3OutputPath": hyperparam_output_filepath
        },
        "ResourceConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.c4.2xlarge",
            "VolumeSizeInGB": 10
        },
        "RoleArn": role,
        "StaticHyperParameters": {
            "eval_metric": "auc",
            "num_round": "100",
            "objective": "binary:logistic",
            "rate_drop": "0.3",
            "tweedie_variance_power": "1.4"
        },
        "StoppingCondition": {
            "MaxRuntimeInSeconds": 43200
        }
    }

    # Tuning and training
    XGB_tuning_job_name = f'dia-extrgboo-{i+1}'
    XGB_tuning_job_names.append(XGB_tuning_job_name)

    smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName=XGB_tuning_job_name,
                                               HyperParameterTuningJobConfig=XGB_tuning_job_config,
                                               TrainingJobDefinition=XGB_training_job_definition)


<b>Important</b> It should be noted that the above cell takes a few minutes to run, as it launches multiple hyperparameter tuning jobs. If running this script in your own environment, please wait for these to finish before continuing to run the remainder of the script.

#### Here we will go through the results from our training jobs, and select those with the best ROC AUC score. These jobs, and their corresponding hyperparameters, will be recorded and applied to our final trained model.

In [31]:
XGB_best_job_names = []
XGB_best_job_hyperparameters = []
XGB_auc_scores = []

for i in range(0, 5):
    tuner = sagemaker.HyperparameterTuningJobAnalytics(XGB_tuning_job_names[i])
    XGB_best_training_job = tuner.dataframe().sort_values('FinalObjectiveValue', ascending=False).iloc[0]
    XGB_best_training_job_name = XGB_best_training_job['TrainingJobName']

    XGB_response = smclient.describe_training_job(TrainingJobName=XGB_best_training_job_name)
    XGB_best_hyperparameters = XGB_response['HyperParameters']
    
    # Append values to the lists
    XGB_best_job_names.append(XGB_best_training_job_name)
    XGB_best_job_hyperparameters.append(XGB_best_hyperparameters)

    # Extract AUC from training job metrics
    XGB_training_job_metrics = smclient.describe_training_job(TrainingJobName=XGB_best_training_job_name)['FinalMetricDataList']
    for metric in XGB_training_job_metrics:
        if metric['MetricName'] == 'validation:auc':
            auc_value = metric['Value']
            break
    
    XGB_auc_scores.append(auc_value)

Aggregating results

In [32]:
XGB_top_jobs = pd.DataFrame(list(zip(XGB_best_job_names, XGB_auc_scores)), columns = ['job_name', 'auc_scores']).sort_values(by='auc_scores', ascending = False)

Assessing best individual jobs

In [33]:
XGB_top_jobs

Unnamed: 0,job_name,auc_scores
4,dia-extrgboo-5-040-84b04cad,0.67265
0,dia-extrgboo-1-026-6d4cf70d,0.66995
1,dia-extrgboo-2-027-010f3947,0.66844
3,dia-extrgboo-4-020-96151d6b,0.66478
2,dia-extrgboo-3-039-bb7f03f3,0.65979


In [34]:
XGB_top_jobs.iloc[0,0]

'dia-extrgboo-5-040-84b04cad'

#### Identifying the best hyperparameters for the model

In [35]:
XGB_response = smclient.describe_training_job(TrainingJobName=XGB_top_jobs.iloc[0,0])
XGB_best_hyperparameters = XGB_response['HyperParameters']

Identifying the best set of hyperparameters- which we will feed into our optimized model

### 3.3.2 Linear Learner Hyperparameters

#### Below we configure a range of hyperparameters, and related variables, for use in our Linear Learner model

In [16]:
LL_tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [
            {
                "Name": "use_bias",
                "Values": ["true", "false"]
            }
        ],
        "ContinuousParameterRanges": [
            {
                "Name": "l1",
                "MaxValue": "1.0",
                "MinValue": "0.0001",
                "ScalingType": "Logarithmic"
            },
            {
                "Name": "learning_rate",
                "MaxValue": "1.0",
                "MinValue": "0.0001",
                "ScalingType": "Logarithmic"
            },
            {
                "Name": "wd",
                "MaxValue": "1.0",
                "MinValue": "0.0001",
                "ScalingType": "Logarithmic"
            }
        ],
        "IntegerParameterRanges": [
            {
                "Name": "mini_batch_size",
                "MaxValue": "5000",
                "MinValue": "500",
                "ScalingType": "Linear"
            }
        ]
    },
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 50,
        "MaxParallelTrainingJobs": 5
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:roc_auc_score",
        "Type": "Maximize"
    }
}

#### Here we define the 5 sequential hyperparameter tuning jobs which we will run on our Linear Learner model, with our 5 K-Folds

In [19]:
# Specify SageMaker estimator
linear_learner_image = sagemaker.image_uris.retrieve("linear-learner", region='us-east-1', version='1')

LL_tuning_job_names = []

for i in range(0, 5):
    # Identifying the optimal hyperparameters
    LL_training_job_definition = {
        "AlgorithmSpecification": {
            "TrainingImage": linear_learner_image,
            "TrainingInputMode": "Pipe",
        },
        "RoleArn": role,
        "InputDataConfig": [
            {
                "ChannelName": "train",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataType": "S3Prefix",
                        "S3Uri": filepaths_list[i][0],
                        "S3DataDistributionType": "FullyReplicated",
                    }
                },
                "CompressionType": "None",
                "ContentType": "application/x-recordio-protobuf",  # Set content type to protobuf
            },
            {
                "ChannelName": "validation",
                "DataSource": {
                    "S3DataSource": {
                        "S3DataType": "S3Prefix",
                        "S3Uri": filepaths_list[i][1],
                        "S3DataDistributionType": "FullyReplicated",
                    }
                },
                "CompressionType": "None",
                "ContentType": "application/x-recordio-protobuf",  # Set content type to protobuf
            },
        ],
        "OutputDataConfig": {
            "S3OutputPath": hyperparam_output_filepath,
        },
        "ResourceConfig": {
            "InstanceCount": 1,
            "InstanceType": "ml.c4.2xlarge",
            "VolumeSizeInGB": 10,
        },
        "StaticHyperParameters": {
            "predictor_type": "binary_classifier",
            "feature_dim": str(feature_dim),
            #"mini_batch_size": "500",
            "epochs": "15",
            "loss": "auto",
            "normalize_data": "true",
            "normalize_label": "auto",
            #"wd": "auto",
            "optimizer": "auto",
        },
        "StoppingCondition": {
            "MaxRuntimeInSeconds": 43200,
        },
    }

    # Tuning and training
    LL_tuning_job_name = f'dll-{i+1}-{datetime.now().strftime("%Y%m%d%H%M%S")}'
    LL_tuning_job_names.append(LL_tuning_job_name)

    smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName=LL_tuning_job_name,
                                               HyperParameterTuningJobConfig=LL_tuning_job_config,
                                               TrainingJobDefinition=LL_training_job_definition)

<b>Important</b> It should be noted that the above cell takes a few minutes to run, as it launches multiple hyperparameter tuning jobs. If running this script in your own environment, please wait for these to finish before continuing to run the remainder of the script.

#### Here we will go through the results from our training jobs, and select those with the best ROC AUC score. These jobs, and their corresponding hyperparameters, will be recorded and applied to our final trained model.

In [36]:
LL_best_job_names = []
LL_best_job_hyperparameters = []
LL_auc_scores = []

for i in range(0, 5):
    tuner = sagemaker.HyperparameterTuningJobAnalytics(LL_tuning_job_names[i])
    LL_best_training_job = tuner.dataframe().sort_values('FinalObjectiveValue', ascending=False).iloc[0]
    LL_best_training_job_name = LL_best_training_job['TrainingJobName']

    LL_response = smclient.describe_training_job(TrainingJobName=LL_best_training_job_name)
    LL_best_hyperparameters = LL_response['HyperParameters']
    
    # Append values to the lists
    LL_best_job_names.append(LL_best_training_job_name)
    LL_best_job_hyperparameters.append(LL_best_hyperparameters)

    # Extract AUC from training job metrics
    LL_training_job_metrics = smclient.describe_training_job(TrainingJobName=LL_best_training_job_name)['FinalMetricDataList']
    for metric in LL_training_job_metrics:
        if metric['MetricName'] == 'validation:roc_auc_score':
            auc_value = metric['Value']
            break
    
    LL_auc_scores.append(auc_value)

Aggregating results

In [24]:
LL_top_jobs = pd.DataFrame(list(zip(LL_best_job_names, LL_auc_scores)), columns = ['job_name', 'auc_scores']).sort_values(by='auc_scores', ascending = False)

Assessing best individual jobs

In [25]:
LL_top_jobs

Unnamed: 0,job_name,auc_scores
4,dll-5-20231222172130-050-885d7370,0.662167
3,dll-4-20231222172130-044-cd31ef2f,0.660985
0,dll-1-20231222172125-021-403fa0cc,0.65979
1,dll-2-20231222172126-037-def96f72,0.654809
2,dll-3-20231222172128-050-8fb1c734,0.652701


In [26]:
LL_top_jobs.iloc[0,0]

'dll-5-20231222172130-050-885d7370'

#### Identifying the best hyperparameters for the model

In [27]:
LL_response = smclient.describe_training_job(TrainingJobName=LL_top_jobs.iloc[0,0])
LL_best_hyperparameters = LL_response['HyperParameters']

Identifying the best set of hyperparameters- which we will feed into our optimized model

In [37]:
LL_best_hyperparameters

{'_tuning_objective_metric': 'validation:roc_auc_score',
 'epochs': '15',
 'feature_dim': '61',
 'l1': '1.0',
 'learning_rate': '0.0029353939183486017',
 'loss': 'auto',
 'mini_batch_size': '500',
 'normalize_data': 'true',
 'normalize_label': 'auto',
 'optimizer': 'auto',
 'predictor_type': 'binary_classifier',
 'use_bias': 'true',
 'wd': '0.00010000000000000009'}

### 3.4 Training Optimized Models

#### In the below cell, we now train an XGBoost model with the hyperparameters specified in the above training job(s).

In [38]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", 'us-east-1', "1.7-1")

# construct a SageMaker estimator that calls the xgboost-container
xgb_estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          #The optimal hyperparameters from tuning are brought into our new model
                                          hyperparameters=XGB_best_hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=XGB_model_output_filepath)

# define the data type and paths to the training and validation datasets
content_type = "application/x-recordio-protobuf"
train_input = TrainingInput(s3_train_proto_filepath, content_type=content_type)
#validation_input = TrainingInput(s3_test_filepath, content_type=content_type)

# execute the XGBoost training job
xgb_estimator.fit({'train': train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-12-22-18-02-40-470


2023-12-22 18:02:40 Starting - Starting the training job...
2023-12-22 18:02:56 Starting - Preparing the instances for training.........
2023-12-22 18:04:20 Downloading - Downloading input data...
2023-12-22 18:04:50 Downloading - Downloading the training image......
2023-12-22 18:05:45 Training - Training image download completed. Training in progress..[34m[2023-12-22 18:06:02.338 ip-10-2-67-26.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-12-22 18:06:02.361 ip-10-2-67-26.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-12-22:18:06:02:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-12-22:18:06:02:INFO] Failed to parse hyperparameter _tuning_objective_metric value validation:auc to Json.[0m
[34mReturning the value itself[0m
[34m[2023-12-22:18:06:02:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2023-12-22

.706 AUC

#### In the below cell, we now train an Linear Learner model with the hyperparameters specified in the above training job(s).

In [None]:
%%time
sess = sagemaker.Session()

linear_container = image_uris.retrieve('linear-learner', boto3.Session().region_name, '1')

# Setup the LinearLeaner algorithm from the ECR container
linear_estimator = sagemaker.estimator.Estimator(linear_container,
                                       role,
                                       hyperparameters=LL_best_hyperparameters,
                                       instance_count=1, 
                                       instance_type='ml.c4.xlarge',
                                       output_path=Linear_model_output_filepath,
                                       sagemaker_session=sess,
                                       input_mode='Pipe')


# Launch a training job. This method calls the CreateTrainingJob API call
data_channels = {
    'train': linear_training_data_location,
    'validation': linear_validation_data_location
}
linear_estimator.fit(data_channels, job_name=linear_job_name)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: diabetes-job-linear


2023-12-22 18:06:53 Starting - Starting the training job...
2023-12-22 18:07:09 Starting - Preparing the instances for training.........
2023-12-22 18:08:37 Downloading - Downloading input data...
2023-12-22 18:09:22 Downloading - Downloading the training image........

The optimal AUC 0.6590648260645642

### 3.5 Deploy Optimized Model

#### Below we deploy our models for inference. The type and instance count of underlying cumpute resources are specified.

In [None]:
xgb_deployed_predictor = xgb_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
linear_deployed_predictor = linear_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-

### 3.6 Making predictions

#### Now that our models are deployed, we can run our previously unseen test dataset through the model and record the predictions.

 We can then ammend these predictions back to the test dataset as additional columns, and conduct statistical/operational assessments on each model's performance.

#### Reading in the holdout test dataset

In [42]:
diabetes_test = CSV_Reader(bucket, input_prefix, test_dataset)

In [43]:
diabetes_test.shape

(15018, 62)

In [44]:
diabetes_test.columns

Index(['readmitted', 'num_lab_procedures', 'num_medications',
       'time_in_hospital', 'number_inpatient', 'num_procedures',
       'number_diagnoses', 'number_outpatient', 'number_emergency',
       'gender_Male', 'admission_source_id_1', 'change', 'diag_3_Nothing',
       'age_9', 'diag_2_Nothing', 'medication_insulin', 'diag_2_respiratory',
       'admission_type_id_3', 'diag_2_urogenital', 'age_6',
       'medication_glyburide', 'diag_3_respiratory', 'diag_3_metabolic',
       'medication_metformin', 'discharge_disposition_hhealth',
       'diag_1_digestive', 'diag_1_respiratory', 'diag_1_injury', 'age_5',
       'discharge_disposition_outpatient', 'diag_3_urogenital',
       'discharge_disposition_nursing', 'diag_3_injury', 'diag_1_Nothing',
       'any_medication', 'diag_1_urogenital', 'A1Cresult_Norm',
       'diag_3_digestive', 'discharge_disposition_unknown', 'A1Cresult_>7',
       'diag_2_skin', 'admission_source_id_8', 'diag_3_other',
       'discharge_disposition_hospital

#### Seperating out the dependent and independent variables of the test dataset

In [45]:
test_X = diabetes_test.drop(columns='readmitted').values
test_y = diabetes_test['readmitted'].values

### Predictions on the XGBoost model

#### Serializing/deserializing model

In [46]:
xgb_deployed_predictor.serializer = sagemaker.serializers.CSVSerializer()
xgb_deployed_predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

#### Creating a list of prediction values

In [47]:
xgb_predictions = []
xgb_results = xgb_deployed_predictor.predict(test_X)
xgb_predictions += [r['score'] for r in xgb_results['predictions']]

print(len(xgb_predictions))

15018


#### Adding these predictions back to the dataset as a new column

In [48]:
diabetes_test['xgb_predictions'] = xgb_predictions

In [49]:
diabetes_test.shape

(15018, 63)

In [50]:
diabetes_test.sample()

Unnamed: 0,readmitted,num_lab_procedures,num_medications,time_in_hospital,number_inpatient,num_procedures,number_diagnoses,number_outpatient,number_emergency,gender_Male,...,diag_2_blooddis,diag_1_mentaldis,diag_2_infection,diag_1_skin,race_Hispanic,diag_3_neoplasm,age_3,max_glu_serum_>300,max_glu_serum_>200,xgb_predictions
4294,0,36,11,1,0,0,3,0,0,0,...,0,0,0,0,0,0,1,0,0,0.051485


### Predictions on the Linear Learner model

#### Serializing/deserializing model

In [51]:
linear_deployed_predictor.serializer = sagemaker.serializers.CSVSerializer()
linear_deployed_predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

#### Creating a list of prediction values

In [52]:
linear_predictions = []
linear_results = linear_deployed_predictor.predict(test_X)
linear_predictions += [r['score'] for r in linear_results['predictions']]

print(len(linear_predictions))

15018


#### Adding these predictions back to the dataset as a new column

In [53]:
diabetes_test['linear_predictions'] = linear_predictions

In [54]:
diabetes_test.shape

(15018, 64)

In [55]:
diabetes_test.columns

Index(['readmitted', 'num_lab_procedures', 'num_medications',
       'time_in_hospital', 'number_inpatient', 'num_procedures',
       'number_diagnoses', 'number_outpatient', 'number_emergency',
       'gender_Male', 'admission_source_id_1', 'change', 'diag_3_Nothing',
       'age_9', 'diag_2_Nothing', 'medication_insulin', 'diag_2_respiratory',
       'admission_type_id_3', 'diag_2_urogenital', 'age_6',
       'medication_glyburide', 'diag_3_respiratory', 'diag_3_metabolic',
       'medication_metformin', 'discharge_disposition_hhealth',
       'diag_1_digestive', 'diag_1_respiratory', 'diag_1_injury', 'age_5',
       'discharge_disposition_outpatient', 'diag_3_urogenital',
       'discharge_disposition_nursing', 'diag_3_injury', 'diag_1_Nothing',
       'any_medication', 'diag_1_urogenital', 'A1Cresult_Norm',
       'diag_3_digestive', 'discharge_disposition_unknown', 'A1Cresult_>7',
       'diag_2_skin', 'admission_source_id_8', 'diag_3_other',
       'discharge_disposition_hospital

### Stacked Model Predictions

#### Lastly, we will create a List of "Stacked" predictions that incorporate both of our models at the same time.

This is accomplished by simply scaling the prediction of each model as desired (in our case, 50-50) so that each prediction is comprised of the relationships produced by both models. This may enable relationships caputured by one model, but not the other, to be expressed within the final outcome.

In [56]:
diabetes_test["stacked_predictions"] = (diabetes_test["xgb_predictions"]+diabetes_test["linear_predictions"])/2

In [57]:
print(diabetes_test.shape)
diabetes_test.columns

(15018, 65)


Index(['readmitted', 'num_lab_procedures', 'num_medications',
       'time_in_hospital', 'number_inpatient', 'num_procedures',
       'number_diagnoses', 'number_outpatient', 'number_emergency',
       'gender_Male', 'admission_source_id_1', 'change', 'diag_3_Nothing',
       'age_9', 'diag_2_Nothing', 'medication_insulin', 'diag_2_respiratory',
       'admission_type_id_3', 'diag_2_urogenital', 'age_6',
       'medication_glyburide', 'diag_3_respiratory', 'diag_3_metabolic',
       'medication_metformin', 'discharge_disposition_hhealth',
       'diag_1_digestive', 'diag_1_respiratory', 'diag_1_injury', 'age_5',
       'discharge_disposition_outpatient', 'diag_3_urogenital',
       'discharge_disposition_nursing', 'diag_3_injury', 'diag_1_Nothing',
       'any_medication', 'diag_1_urogenital', 'A1Cresult_Norm',
       'diag_3_digestive', 'discharge_disposition_unknown', 'A1Cresult_>7',
       'diag_2_skin', 'admission_source_id_8', 'diag_3_other',
       'discharge_disposition_hospital

### 3.6 Export test data with predictions

#### We will now export the finished dataset, complete with XGBoost, Linear Learner, and Stacked model predictions, for further analysis in the next script

In [58]:
def Export_Processed_CSV(bucket, processed_data_folder, local_file_name, S3_file_name, header_presence):
        
    '''Exports a dataframe into CSV format, and sends to a specified S3 bucket location
    
    Arguments 
    --------- 
    bucket: A list of the columns (i.e. the 3 diagnosis columns) to be updated
    processed_data_folder: the relevant subfolder within the main bucket
    local_file_name: The name of the dataframe within the notebook
    S3_file_name: The name of the file uppn export (with .csv extension included)
    header_presence: whether or not a header will be present within the exported csv
    
    Return
    ---------
    Exports a csv file to a specified S3 location'''
    
    local_file_name.to_csv(S3_file_name, index=False, header=header_presence)
    boto3.Session().resource('s3').Bucket(bucket).Object('{}/{}'.format(processed_data_folder, S3_file_name)).upload_file(S3_file_name)

In [59]:
diabetes_test.columns

Index(['readmitted', 'num_lab_procedures', 'num_medications',
       'time_in_hospital', 'number_inpatient', 'num_procedures',
       'number_diagnoses', 'number_outpatient', 'number_emergency',
       'gender_Male', 'admission_source_id_1', 'change', 'diag_3_Nothing',
       'age_9', 'diag_2_Nothing', 'medication_insulin', 'diag_2_respiratory',
       'admission_type_id_3', 'diag_2_urogenital', 'age_6',
       'medication_glyburide', 'diag_3_respiratory', 'diag_3_metabolic',
       'medication_metformin', 'discharge_disposition_hhealth',
       'diag_1_digestive', 'diag_1_respiratory', 'diag_1_injury', 'age_5',
       'discharge_disposition_outpatient', 'diag_3_urogenital',
       'discharge_disposition_nursing', 'diag_3_injury', 'diag_1_Nothing',
       'any_medication', 'diag_1_urogenital', 'A1Cresult_Norm',
       'diag_3_digestive', 'discharge_disposition_unknown', 'A1Cresult_>7',
       'diag_2_skin', 'admission_source_id_8', 'diag_3_other',
       'discharge_disposition_hospital

In [60]:
test_with_predictions = "test_with_predictions.csv"

In [61]:
Export_Processed_CSV(bucket, input_prefix, diabetes_test, test_with_predictions, True)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Please continue to the fourth script in this repository: 4. Evaluation_on_Test_Data