## Final Project XGBoost Model

### 0.a) Import packages

In [1]:
import boto3, os, sagemaker
from sagemaker import get_execution_role
import warnings
import numpy as np
warnings.filterwarnings("ignore")

### 0.b) Get Amazon IAM execution role and instance region

In [2]:
# Define IAM role- this will be necessary when defining your model
iam_role = get_execution_role()

# set the region of the instance and get a reference to the client
my_region = boto3.Session().region_name

# sagemaker session handle
sm_session = sagemaker.session.Session(boto3.Session())

print("Success - the SageMaker instance is in the " + my_region + " region")

Success - the SageMaker instance is in the us-west-2 region


## 1) Load and examine data


### 1.a) Load CSV data from s3, and the dictionary pickle from local


In [3]:
import pandas as pd
import pickle

bucket = "bdproject2"
prefix = "data"

df = pd.read_csv("s3://{}/{}/PS_20174392719_1491204439457_log.csv".format(bucket,prefix))

### 1.b) Examine data schema and counts

In [4]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


### 1.c) Create new features & change data type


In [6]:
df["OrigDiff"] = df['newbalanceOrig'] - df['oldbalanceOrg']
df["DestDiff"] = df['newbalanceDest'] - df['oldbalanceDest']
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,OrigDiff,DestDiff
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,-9839.64,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1864.28,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,-181.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,-181.0,-21182.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,-11668.14,0.0


In [7]:
# one-hot coding
df['payment'] = (df["type"] == "PAYMENT").astype(int)
df['transfer'] = (df["type"] == "TRANSFER").astype(int)
df['cash-out'] = (df["type"] == "CASH_OUT").astype(int)
df['debit'] = (df["type"] == "DEBIT").astype(int)
df['cash-in'] = (df["type"] == "CASH_IN").astype(int)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,OrigDiff,DestDiff,payment,transfer,cash-out,debit,cash-in
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,-9839.64,0.0,1,0,0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1864.28,0.0,1,0,0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,-181.0,0.0,0,1,0,0,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,-181.0,-21182.0,0,0,1,0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,-11668.14,0.0,1,0,0,0,0


In [8]:
# create a column showing whether amount equals to oldbalanceOrg
df['AmountEqOrig'] = (df['amount'] == df['oldbalanceOrg']).astype(int)


In [9]:
# create a column showing whether the transaction is from customer to customer
df['CusToCus'] = np.where(df['nameOrig'].str[0] == df['nameDest'].str[0], 1, 0)

In [10]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,OrigDiff,DestDiff,payment,transfer,cash-out,debit,cash-in,AmountEqOrig,CusToCus
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,-9839.64,0.0,1,0,0,0,0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1864.28,0.0,1,0,0,0,0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,-181.0,0.0,0,1,0,0,0,1,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,-181.0,-21182.0,0,0,1,0,0,1,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,-11668.14,0.0,1,0,0,0,0,0,0


### 1.d) select columns


In [11]:
colNames = ['amount', 'payment', 'transfer', 'cash-out','debit','cash-in',
                 'OrigDiff', 'DestDiff','AmountEqOrig','CusToCus','isFraud']

In [12]:
# select columns
df_selected = df[colNames]
df_selected.head()

Unnamed: 0,amount,payment,transfer,cash-out,debit,cash-in,OrigDiff,DestDiff,AmountEqOrig,CusToCus,isFraud
0,9839.64,1,0,0,0,0,-9839.64,0.0,0,0,0
1,1864.28,1,0,0,0,0,-1864.28,0.0,0,0,0
2,181.0,0,1,0,0,0,-181.0,0.0,1,1,1
3,181.0,0,0,1,0,0,-181.0,-21182.0,1,1,1
4,11668.14,1,0,0,0,0,-11668.14,0.0,0,0,0


## 2) Prepare data

In [13]:
featureCols =  ['amount', 'payment', 'transfer', 'cash-out','debit','cash-in',
                 'OrigDiff', 'DestDiff','AmountEqOrig','CusToCus','isFraud']
labelCol = 'isFraud'

df_cleaned = df_selected[featureCols]

df_cleaned[labelCol] = df_selected[labelCol]

df_cleaned = df_cleaned.dropna()

print("Number of features is ",len(featureCols))

print("Number of rows in cleaned DataFrame",df_cleaned.shape[0])

Number of features is  11
Number of rows in cleaned DataFrame 6362620


In [14]:
# Make sure label columns is the first column, and drop WAGP (will make problem trivial otherwise)
# XGBoost expects labels to be in the first column
cols = df_cleaned.columns.tolist()
cols.insert(0, cols.pop(cols.index('isFraud')))
df_cleaned2 = df_cleaned.reindex(columns = cols)

In [15]:
df_cleaned2.head()

Unnamed: 0,isFraud,amount,payment,transfer,cash-out,debit,cash-in,OrigDiff,DestDiff,AmountEqOrig,CusToCus
0,0,9839.64,1,0,0,0,0,-9839.64,0.0,0,0
1,0,1864.28,1,0,0,0,0,-1864.28,0.0,0,0
2,1,181.0,0,1,0,0,0,-181.0,0.0,1,1
3,1,181.0,0,0,1,0,0,-181.0,-21182.0,1,1
4,0,11668.14,1,0,0,0,0,-11668.14,0.0,0,0


### 2.d) Save preprocessed data to s3

This step is needed for using XGBoost with Amazon Sagemaker. We have provided this code for you, but you should look through them to see what they do. 

Expected outputs: None

In [16]:
# shuffle data and train/val/test split, at 6:2:2

import numpy as np

x_train_df, x_val_df, x_test_df = \
    np.split(df_cleaned2.sample(frac=1, random_state=1729), 
             [int(0.6 * len(df_cleaned2)), int(0.8*len(df_cleaned2))]) 

x_train = x_train_df.values
x_val = x_val_df.values
x_test = x_test_df.values[:,1:]  # remove the label column

In [17]:
# save preprocessed data to s3 for XGBoost to access

bucket = "bdproject2"       # CHANGE ME: your own bucket
prefix = "data"   # CHANGE ME: your own folder

datasets = [('train', x_train), ('val', x_val), ('test', x_test)]  # Note: also using val as test
data_locations = []
for key, x in datasets:
    file = "{}/{}_data/data.csv".format(prefix,key)
    location = "s3://{}/{}".format(bucket, file); data_locations += [location]
    print('{} data will be uploaded to: {}'.format(key, location))
    np.savetxt("{}.csv".format(key), x, fmt = '%s')      #     delimiter=','
    boto3.Session().resource('s3').Bucket(bucket).Object(file).upload_file("{}.csv".format(key))


train data will be uploaded to: s3://bdproject2/data/train_data/data.csv
val data will be uploaded to: s3://bdproject2/data/val_data/data.csv
test data will be uploaded to: s3://bdproject2/data/test_data/data.csv


In [18]:
# set model output location
output_location = "s3://{}/{}/output".format(bucket,prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

training artifacts will be uploaded to: s3://bdproject2/data/output


## 3) Train a XGBoost model with static hyperparameters

### 3.1) Train the model


In [19]:
# create input channels to stream data
# NOTE: in the case of using CSV files, all data is saved to the worker containers before training starts;
#       only libsvm supports real data streaming

train_channel = sagemaker.inputs.s3_input(data_locations[0], content_type='text/csv')
val_channel = sagemaker.inputs.s3_input(data_locations[1], content_type='text/csv')

data_channels = {'train': train_channel, 'validation': val_channel}

In [20]:
# create the estimator instance

from sagemaker.amazon.amazon_estimator import get_image_uri
training_image = get_image_uri(boto3.Session().region_name, 'xgboost', '0.90-1')
smclient = boto3.Session().client('sagemaker')

xgb_model = sagemaker.estimator.Estimator(training_image,
                                          iam_role, 
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.xlarge',
                                          train_volume_size = 5,
                                          output_path=output_location,
                                          sagemaker_session=sagemaker.Session())

In [24]:
# set estimator hyperparameters, these will be passed to the XGBoost container

xgb_model.set_hyperparameters(max_depth = 5,
                              eta = .2,
                              gamma = 4,
                              min_child_weight = 6,
                              objective = "multi:softmax",
                              num_class = 6,
                              num_round = 10)

In [25]:
# fit using the data channels

xgb_model.fit(data_channels)

2020-03-16 03:00:41 Starting - Starting the training job...
2020-03-16 03:00:42 Starting - Launching requested ML instances......
2020-03-16 03:02:09 Starting - Preparing the instances for training......
2020-03-16 03:02:59 Downloading - Downloading input data
2020-03-16 03:02:59 Training - Downloading the training image...
2020-03-16 03:03:21 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ' '[0m
[34mINFO:root:Determined delimiter of CSV input is ' '[0m
[34mINFO:root:Determined delimiter of CSV input is ' '[0

In [26]:
# print the model's S3 URL
xgb_model.model_data

's3://bdproject2/data/output/sagemaker-xgboost-2020-03-16-03-00-41-529/output/model.tar.gz'

### 3.2) Getting a transformer for batch inferencing

A Transformer spawns a Batch-Transform job using a trained model, and predicts labels for pre-uploaded data on S3. This is mainly for when we have a large amount of data to run inference on, and don't need to get results back in real time. 

To see the Batch-Transform job you have started, go to the SageMaker console's Batch Transform Jobs page, under the Inference menu on the left. Click into the Job to see its details. There is a link to the output S3 location at the bottom of the Job details page, which you can use to retrieve the output file containing predicted labels. 



In [27]:
test_output_location = output_location+"/results"
print('test outputs will be uploaded to: {}'.format(test_output_location))

test outputs will be uploaded to: s3://bdproject2/data/output/results


In [28]:
xgb_transformer = xgb_model.transformer(instance_count=1,
                                        instance_type='ml.m5.xlarge',
                                        output_path=test_output_location)

In [29]:
xgb_transformer.transform(data_locations[2],
                          content_type="text/csv")

### 3.3) Deploy model for interactive inference

Real Time Predictors are live services run on Amazon servers, which accepts streamed data (i.e. from live sensors or user input) and uses a trained model to make predictions in real time, hence the name. These predictors are backed by an entire stack of different services and networks, and thus takes some time to startup. 

Since the predictors work only with byte streams and not standard python objects, we have written a simple utility that takes care of data processing for you. The process is not that complicated, so feel free to experiment with the predictor interface yourself. 

After obtaining the predicted labels from the real time predictor, use the confusion_matrix and classification_report methods from sklearn.metrics to show the model's classification performance.


In [28]:
def predict_file(input_path, predictor, limit=1000):
    """ 
    Utility for piping a local test file to Sagemaker's Real Time Predictor interface 
    Returns a numpy array of integers as the predicted labels
    
    input_path: path to the input CSV file. This should be a local file, you can use "test.csv"
    predictor: the predictor object we created for you
    limit: the max number of data points to send to the predictor. We have set this to 1000 for you. 
    """
    res = []
    with open(input_path, 'r') as f:
        lines = f.readlines()
    for i, l in enumerate(lines):
        res += [predictor.predict(l)]
        print("Progress: {}/{}".format(i, len(lines)), end='\r')
        if limit is not None and i >= limit-1:
            break
    y_pred = np.array([float(e.decode('utf-8')) for e in res]).astype(int)
    return y_pred

In [31]:
# Deploy the real time predictor on Amazon servers. This will take several minutes. 
xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                                 content_type='text/csv',
                                 instance_type='ml.t2.xlarge')

Using already existing model: sagemaker-xgboost-2020-03-16-03-00-41-529


-----------------!

In [32]:
# run inference and get reports
# NOTE: XGBoost's RealTimePredictor accepts only strings in CSV format. 

from sklearn.metrics import confusion_matrix, classification_report

# Hint: Use the predict_file() method we have provided

y_pred = predict_file("test.csv",xgb_predictor,limit = None)

Progress: 1272523/1272524

In [35]:
y_true = x_test_df.iloc[0:,0]

In [44]:
len(y_true)

1272524

In [45]:
confusion_matrix(y_true,y_pred)

array([[1270832,       0],
       [      8,    1684]])

In [46]:
# Confusion Matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)

(1270832, 0, 8, 1684)

In [37]:
# Classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270832
           1       1.00      1.00      1.00      1692

   micro avg       1.00      1.00      1.00   1272524
   macro avg       1.00      1.00      1.00   1272524
weighted avg       1.00      1.00      1.00   1272524



In [47]:
# F1 score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
f1 = metrics.fbeta_score(y_true, y_pred, beta = 1)
f1

0.9976303317535545

In [48]:
fbeta = metrics.fbeta_score(y_true, y_pred, beta = 2)
fbeta

0.9962139138665405

In [40]:
# AUC
roc_auc_score(y_true, y_pred)

0.9976359338061466

In [49]:
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred)
roc_auc = metrics.auc(fpr, tpr) # compute area under the curve

In [None]:
# delete the endpoint after inference to save cost; you may also delete the model.
xgb_predictor.delete_endpoint()
# xgb_predictor.delete_model()

### 4.1) Start a Hyperparameter Tuning Job


In [21]:
xgb_model.set_hyperparameters(max_depth = 5,
                              eta = .2,
                              gamma = 4,
                              min_child_weight = 6,
                              objective = "multi:softmax",
                              num_class = 6,
                              num_round = 10)

In [22]:
# You need to change the job name every time. 
# Amazon likely uses job logs to bill users, so jobs cannot be deleted or overwritten.
tuning_job_name = "MyTuningJob9"   

# tuning configs: parameter ranges
tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {                    
          "MaxValue": "0.5",
          "MinValue": "0.1",
          "Name": "eta"
        },
        {                    
          "MaxValue": "1",
          "MinValue": "0.5",
          "Name": "subsample"
        },
        {                    
          "MaxValue": "10",
          "MinValue": "0",
          "Name": "min_child_weight"
        },
          
       
      ],
      "IntegerParameterRanges": [
        {"MaxValue": "10",
         "MinValue": "1",
         "Name": "max_depth" }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 20,    # Set the number of iterations here
      "MaxParallelTrainingJobs": 3      # Set the max concurrent training jobs
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:f1",
      "Type": "Maximize"
    }
  }

# Training job definition: data sources, instance config, static hyperparams
training_job_definition = {
    "AlgorithmSpecification": {
      "TrainingImage": training_image,
      "TrainingInputMode": "File"
    },
    "InputDataConfig": [
      {
        "ChannelName": "train",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": data_locations[0]
          }
        }
      },
      {
        "ChannelName": "validation",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": data_locations[1]
          }
        }
      }
    ],
    "OutputDataConfig": {
      "S3OutputPath": output_location
    },
    "ResourceConfig": {
      "InstanceCount": 2,
      "InstanceType": "ml.c4.2xlarge",
      "VolumeSizeInGB": 10
    },
    "RoleArn": iam_role,
    "StaticHyperParameters": {        # Set static hyperparameters here, these won't be tuned
      "gamma": "4",
      "objective": "multi:softmax",
      "num_class": "6",
      "num_round": "10"
    },
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 3600
    }
}

In [23]:
# create hyperparam tuning job
smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName = tuning_job_name,
                                           HyperParameterTuningJobConfig = tuning_job_config,
                                           TrainingJobDefinition = training_job_definition)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-west-2:431838000283:hyper-parameter-tuning-job/mytuningjob9',
 'ResponseMetadata': {'RequestId': 'c9c6d22a-3131-407a-a4bd-9e59dee68d40',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c9c6d22a-3131-407a-a4bd-9e59dee68d40',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '113',
   'date': 'Tue, 17 Mar 2020 08:28:49 GMT'},
  'RetryAttempts': 0}}

### 4.2) Rerun validation to examine model performance


In [31]:
from sagemaker.model import Model

model_location_s3 = "https://bdproject2.s3-us-west-2.amazonaws.com/data/output/MyTuningJob9-012-3ce738e4/output/model.tar.gz"  # S3 Path to your model artifact

tuned_model = Model(model_location_s3, 
                    image=training_image,
                    role=iam_role, 
                    sagemaker_session=sm_session
                   )
tuned_model.deploy(initial_instance_count=1,
                   instance_type='ml.m4.xlarge',
                  )

-------------!

In [32]:
from sagemaker.predictor import RealTimePredictor
tuned_predictor = RealTimePredictor(tuned_model.endpoint_name,
                                    sm_session,
                                    content_type='text/csv')

In [None]:
# run inference and get reports
# NOTE: XGBoost's RealTimePredictor accepts only strings in CSV format. 

from sklearn.metrics import confusion_matrix, classification_report

y_pred_tuned = predict_file("test.csv",tuned_predictor,limit = None)


Progress: 1272523/1272524

In [36]:
# Confusion Matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred_tuned).ravel()
(tn, fp, fn, tp)

(1270830, 2, 8, 1684)

In [37]:
# Classification_report
print(classification_report(y_true, y_pred_tuned))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270832
           1       1.00      1.00      1.00      1692

   micro avg       1.00      1.00      1.00   1272524
   macro avg       1.00      1.00      1.00   1272524
weighted avg       1.00      1.00      1.00   1272524



In [38]:
# F1 score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
f1 = metrics.fbeta_score(y_true, y_pred_tuned, beta = 1)
f1

0.9970396684428657

In [39]:
fbeta = metrics.fbeta_score(y_true, y_pred_tuned, beta = 2)
fbeta

0.9959782351549561

In [40]:
# AUC
roc_auc_score(y_true, y_pred_tuned)

0.9976351469200752

In [41]:
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred_tuned)
roc_auc = metrics.auc(fpr, tpr) # compute area under the curve

In [42]:
# delete model and endpoint after inference to save cost
tuned_predictor.delete_endpoint()
# tuned_predictor.delete_model()