## Create an Estimator and Start a Training Job

In [1]:
import numpy as np
import pandas as pd
import json
import s3fs
import boto3
import io
import gc
import tarfile
import time
import pickle as pkl
import tarfile
import os

import sagemaker 
from sagemaker.serializers import CSVSerializer
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.xgboost.model import XGBoostModel

## Set Up Model Parameters and Train the Model
### Skip these steps and go straight to loading/deploying the model if you already have a trained model

In [3]:
# Print the training entry-point script with syntax highlighting (shell escape).
!pygmentize ./scripts/xgboost_train.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mpickle[39;49;00m [34mas[39;49;00m [04m[36mpkl[39;49;00m
[34mimport[39;49;00m [04m[36mxgboost[39;49;00m [34mas[39;49;00m [04m[36mxgb[39;49;00m
[34mimport[39;49;00m [04m[36ms3fs[39;49;00m


[34mif[39;49;00m [31m__name__[39;49;00m == [33m'[39;49;00m[33m__main__[39;49;00m[33m'[39;49;00m:
    parser = argparse.ArgumentParser()

    [37m# Hyperparameters are described here[39;49;00m
    parser.add_argument([33m'[39;49;00m[33m--num_round[39;49;00m[33m'[39;49;00m, [36mtype[39;49;00m=[36mint[39;49;00m)
    parser.add_argument([33m'[39;49;00m[33m--max_depth[39;49;00m[33m'[39;49;00m, [36mtype[39;49;00m=[36min

In [4]:
# Hyperparameters handed to the training script; every value is passed as a string.
hyperparameters = {
    'num_round': '50',
    'max_depth': '5',
    'eta': '0.2',
    'gamma': '4',
    'min_child_weight': '6',
    'subsample': '0.7',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'verbosity': '1',
}

# Script-mode XGBoost estimator: the entry point lives in ./scripts and runs on
# a single spot instance.
# NOTE(review): max_run / max_wait are presumably seconds — confirm against the SDK docs.
estimator_settings = dict(
    entry_point='xgboost_train.py',
    framework_version='1.2-2',
    source_dir='./scripts',
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    requirements_file='requirements.txt',
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)
xgb_estimator = XGBoost(**estimator_settings)

In [5]:
# Launch the training job; the job log is streamed into this cell's output.
# NOTE(review): no input channels are passed, so the training script presumably
# locates its own data — confirm against scripts/xgboost_train.py.
xgb_estimator.fit()

2021-11-08 16:14:43 Starting - Starting the training job...
2021-11-08 16:15:12 Starting - Launching requested ML instancesProfilerReport-1636388083: InProgress
......
2021-11-08 16:16:13 Starting - Preparing the instances for training......
2021-11-08 16:17:13 Downloading - Downloading input data...
2021-11-08 16:17:33 Training - Downloading the training image...
2021-11-08 16:18:13 Training - Training image download completed. Training in progress.[34m[2021-11-08 16:17:59.616 ip-10-0-136-231.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-11-08:16:17:59:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-11-08:16:17:59:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-11-08:16:17:59:INFO] Invoking user training script.[0m
[34m[2021-11-08:16:18:00:INFO] Module xgboost_train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2021-11-08:16:18:00:INFO] Generating setup.cfg[0m
[34m[2021-1

[34m[0]#011train-auc:0.99693#011validation-auc:0.99657[0m
[34m[1]#011train-auc:0.99756#011validation-auc:0.99841[0m
[34m[2]#011train-auc:0.99796#011validation-auc:0.99847[0m
[34m[3]#011train-auc:0.99811#011validation-auc:0.99842[0m
[34m[4]#011train-auc:0.99816#011validation-auc:0.99847[0m
[34m[5]#011train-auc:0.99827#011validation-auc:0.99845[0m
[34m[6]#011train-auc:0.99833#011validation-auc:0.99845[0m
[34m[7]#011train-auc:0.99839#011validation-auc:0.99844[0m
[34m[8]#011train-auc:0.99840#011validation-auc:0.99846[0m
[34m[9]#011train-auc:0.99862#011validation-auc:0.99841[0m
[34m[10]#011train-auc:0.99864#011validation-auc:0.99838[0m
[34m[11]#011train-auc:0.99885#011validation-auc:0.99927[0m
[34m[12]#011train-auc:0.99894#011validation-auc:0.99939[0m
[34m[13]#011train-auc:0.99903#011validation-auc:0.99937[0m
[34m[14]#011train-auc:0.99906#011validation-auc:0.99935[0m
[34m[15]#011train-auc:0.99908#011validation-auc:0.99936[0m
[34m[16]#011train-auc:0.99910#011

In [7]:
# Report the S3 location of the trained model artifact.
model_artifact_uri = xgb_estimator.model_data
print(f"s3 uri where the trained model is located: \n{model_artifact_uri}\n")

# Report the name of the most recent training job run by this estimator
# (needed later to re-attach without retraining).
training_job_name = xgb_estimator.latest_training_job.name
print(f"latest training job name for this estimator: \n{training_job_name}\n")

s3 uri where the trained model is located: 
s3://sagemaker-us-east-1-943601785668/sagemaker-xgboost-2021-11-08-16-14-43-294/output/model.tar.gz

latest training job name for this estimator: 
sagemaker-xgboost-2021-11-08-16-14-43-294



## Load a Trained Job
### Attach directly to an existing training job if you already have a trained model

In [2]:
# Rebuild the estimator from an existing training job instead of retraining.
# NOTE(review): the job name is hard-coded; replace it with the name of your own
# training job when reusing this notebook.
xgb_estimator = sagemaker.estimator.Estimator.attach('sagemaker-xgboost-2021-11-08-16-14-43-294')


2021-11-08 16:20:54 Starting - Preparing the instances for training
2021-11-08 16:20:54 Downloading - Downloading input data
2021-11-08 16:20:54 Training - Training image download completed. Training in progress.
2021-11-08 16:20:54 Uploading - Uploading generated training model
2021-11-08 16:20:54 Completed - Training job completed


## Deploy Models

In [3]:
# Deploy the trained model to an endpoint backed by one ml.m4.2xlarge instance.
# The returned predictor is what the evaluation cells below call.
xgb_predictor = xgb_estimator.deploy(initial_instance_count=1,instance_type='ml.m4.2xlarge')

--------!

## View Evaluation Details on Train

In [4]:
start_time = time.time()

# Load in data (since it's small). Column 0 holds the label; the rest are features.
train = pd.read_csv('s3://compressed-data-sample/train_embedding_yfirst.csv', header=None)

# Predict by batch to prevent request timeout.
# TIMEOUT_LIMIT is the number of rows sent per request (also reused by the test cells).
TIMEOUT_LIMIT = 400
train_predictions_array = []
for batch_start in range(0, len(train), TIMEOUT_LIMIT):
    # Get batch and get rid of label column.
    batch_features = train.iloc[batch_start:batch_start + TIMEOUT_LIMIT, 1:]
    # header=False: send only data rows. Previously the CSV header line was sent,
    # scored as if it were a sample, and its prediction then thrown away with [1:].
    train_byte = batch_features.to_csv(index=False, header=False).encode('utf-8')
    train_predictions = xgb_predictor.predict(train_byte, initial_args={'ContentType': 'text/csv'}).decode('utf-8') # predict!
    # The response is comma-separated scores, one per row sent.
    train_predictions_array.append(np.fromstring(train_predictions, sep=','))

train_predictions_array = np.concatenate(train_predictions_array)
end_time = time.time()

# Fail loudly if the endpoint did not return exactly one score per input row.
assert len(train_predictions_array) == len(train), (
    f'expected {len(train)} predictions, got {len(train_predictions_array)}'
)

print(f'Train prediction shape: {train_predictions_array.shape}')
print(f'Prediction time for {len(train)} samples: {end_time - start_time} seconds')

Train prediction shape: (80000,)
Prediction time for 80000 samples: 291.56421756744385 seconds


In [5]:
# Confusion matrix on the training set: rows are observed labels, columns are
# the predicted scores rounded to 0/1.
cm = pd.crosstab(
    index=train[0],
    columns=np.round(train_predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)
tn, fp = cm.iat[0, 0], cm.iat[0, 1]
fn, tp = cm.iat[1, 0], cm.iat[1, 1]
p = (tp+tn)/(tp+tn+fp+fn)*100  # overall accuracy in percent

# Percentages in the table are normalised per *predicted* column, so each
# column (not each row) sums to 100%.
nonpeace_row = ("NonPeace", tn/(tn+fn)*100, tn, fp/(tp+fp)*100, fp)
peace_row = ("Peace", fn/(tn+fn)*100, fn, tp/(tp+fp)*100, tp)

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Train Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "NonPeace", "Peace"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(*nonpeace_row))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(*peace_row))


Overall Train Classification Rate: 99.1%

Predicted      NonPeace          Peace
Observed
NonPeace       99% (39558)     1% (357)
Peace           1% (359)     99% (39726) 



In [6]:
# Clean-up Memory: drop the training frame, then force a garbage-collection pass.
# gc.collect() is the last expression, so its return value is shown as the cell output.
del train
gc.collect()

49

## View Evaluation Details on Test (Full Set)

In [7]:
start_time = time.time()
# Column 0 holds the label and the last column is excluded from the features
# (a later cell filters on it by country name).
test = pd.read_csv('s3://compressed-data-sample/test_embedding.csv', header=None)
test_predictions_array = []

# Score in batches of TIMEOUT_LIMIT rows (defined in the train-evaluation cell).
for batch_start in range(0, len(test), TIMEOUT_LIMIT):
    batch_features = test.iloc[batch_start:batch_start + TIMEOUT_LIMIT, 1:-1]
    # header=False: send only data rows. Previously the CSV header line was sent,
    # scored as if it were a sample, and its prediction then thrown away with [1:].
    test_byte = batch_features.to_csv(index=False, header=False).encode('utf-8')
    test_predictions = xgb_predictor.predict(test_byte, initial_args={'ContentType': 'text/csv'}).decode('utf-8') # predict!
    # The response is comma-separated scores, one per row sent.
    test_predictions_array.append(np.fromstring(test_predictions, sep=','))

test_predictions_array = np.concatenate(test_predictions_array)
end_time = time.time()

# Fail loudly if the endpoint did not return exactly one score per input row.
assert len(test_predictions_array) == len(test), (
    f'expected {len(test)} predictions, got {len(test_predictions_array)}'
)

print(f'Test prediction shape: {test_predictions_array.shape}')
print(f'Prediction time for {len(test)} samples: {end_time - start_time} seconds')

Test prediction shape: (10000,)
Prediction time for 10000 samples: 34.61984467506409 seconds


In [8]:
# Confusion matrix on the full test set: rows are observed labels, columns are
# the predicted scores rounded to 0/1.
cm = pd.crosstab(
    index=test[0],
    columns=np.round(test_predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)
tn, fp = cm.iat[0, 0], cm.iat[0, 1]
fn, tp = cm.iat[1, 0], cm.iat[1, 1]
p = (tp+tn)/(tp+tn+fp+fn)*100  # overall accuracy in percent

# Percentages in the table are normalised per *predicted* column, so each
# column (not each row) sums to 100%.
nonpeace_row = ("NonPeace", tn/(tn+fn)*100, tn, fp/(tp+fp)*100, fp)
peace_row = ("Peace", fn/(tn+fn)*100, fn, tp/(tp+fp)*100, tp)

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Test Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "NonPeace", "Peace"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(*nonpeace_row))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(*peace_row))


Overall Test Classification Rate: 98.2%

Predicted      NonPeace          Peace
Observed
NonPeace       98% (4995)     2% (83)
Peace           2% (95)     98% (4827) 



## View Evaluation Details on Test (Remove India & Australia)

In [12]:
start_time = time.time()
# Keep only rows whose last column is neither 'India' nor 'Australia'.
# Note: this rebinds `test`, so the full test frame is no longer available afterwards.
test = test[~test.iloc[:, -1].isin(['India', 'Australia'])]
print(f'Test Size : {len(test)}')

test_predictions_array = []

# Score in batches of TIMEOUT_LIMIT rows (defined in the train-evaluation cell).
for batch_start in range(0, len(test), TIMEOUT_LIMIT):
    batch_features = test.iloc[batch_start:batch_start + TIMEOUT_LIMIT, 1:-1]
    # header=False: send only data rows. Previously the CSV header line was sent,
    # scored as if it were a sample, and its prediction then thrown away with [1:].
    test_byte = batch_features.to_csv(index=False, header=False).encode('utf-8')
    test_predictions = xgb_predictor.predict(test_byte, initial_args={'ContentType': 'text/csv'}).decode('utf-8') # predict!
    # The response is comma-separated scores, one per row sent.
    test_predictions_array.append(np.fromstring(test_predictions, sep=','))

test_predictions_array = np.concatenate(test_predictions_array)
end_time = time.time()

# Fail loudly if the endpoint did not return exactly one score per input row.
assert len(test_predictions_array) == len(test), (
    f'expected {len(test)} predictions, got {len(test_predictions_array)}'
)

print(f'Test prediction shape: {test_predictions_array.shape}')
print(f'Prediction time for {len(test)} samples: {end_time - start_time} seconds')

Test Size : 1633
Test prediction shape: (1633,)
Prediction time for 1633 samples: 4.291927337646484 seconds


In [13]:
# Confusion matrix on the filtered test subset: rows are observed labels,
# columns are the predicted scores rounded to 0/1.
cm = pd.crosstab(
    index=test[0],
    columns=np.round(test_predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)
tn, fp = cm.iat[0, 0], cm.iat[0, 1]
fn, tp = cm.iat[1, 0], cm.iat[1, 1]
p = (tp+tn)/(tp+tn+fp+fn)*100  # overall accuracy in percent

# Percentages in the table are normalised per *predicted* column, so each
# column (not each row) sums to 100%.
nonpeace_row = ("NonPeace", tn/(tn+fn)*100, tn, fp/(tp+fp)*100, fp)
peace_row = ("Peace", fn/(tn+fn)*100, fn, tp/(tp+fp)*100, tp)

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Test (Minority Group) Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "NonPeace", "Peace"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(*nonpeace_row))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(*peace_row))


Overall Test (Minority Group) Classification Rate: 95.9%

Predicted      NonPeace          Peace
Observed
NonPeace       90% (420)     2% (20)
Peace           10% (47)     98% (1146) 



In [16]:
# Precision, recall and F1 from whichever tp / fp / fn counts were computed by
# the most recently executed confusion-matrix cell.
precision = tp / (tp + fp)
recall = tp / (tp + fn)

precision_pct = 100*precision
recall_pct = 100*recall
f1_pct = 100*2*precision*recall/(precision+recall)

print(f'Precision: {precision_pct:.4f} %')
print(f'Recall: {recall_pct:.4f} %')
print(f'F1: {f1_pct:.4f} %')

Precision: 98.2847 %
Recall: 96.0604 %
F1: 97.1598 %


In [14]:
# Drop the test frame, then force a garbage-collection pass.
# gc.collect() is the last expression, so its return value is shown as the cell output.
del test
gc.collect()

45

## Delete the deployed Model, Configuration and Endpoints

In [15]:
# Tear down everything created for the deployed predictor: the endpoint, the
# model(s) behind it, and the endpoint configuration.
deployment_name = xgb_predictor.endpoint_name
client = boto3.client('sagemaker')

# Ask the endpoint which configuration it uses instead of assuming the
# configuration shares the endpoint's name.
endpoint_config_name = client.describe_endpoint(EndpointName=deployment_name)['EndpointConfigName']
response = client.describe_endpoint_config(EndpointConfigName=endpoint_config_name)

# Delete the endpoint first so that a failure in a later cleanup call cannot
# leave it running.
client.delete_endpoint(EndpointName=deployment_name)

# Remove the model behind every production variant, not just the first one.
model_names = [variant['ModelName'] for variant in response['ProductionVariants']]
model_name = model_names[0]  # kept for backward compatibility with the original cell
for variant_model_name in dict.fromkeys(model_names):  # de-duplicate, preserve order
    client.delete_model(ModelName=variant_model_name)

# Last expression: its response is displayed as the cell output.
client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)

{'ResponseMetadata': {'RequestId': '72549625-13c5-4fa8-b851-332f141445f4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '72549625-13c5-4fa8-b851-332f141445f4',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 09 Nov 2021 14:33:49 GMT'},
  'RetryAttempts': 0}}