In [1]:
import boto3
import botocore
import sagemaker
import sys


# S3 destination used by this notebook and the IAM role handed to SageMaker.
bucket = sagemaker.Session().default_bucket()   # Feel free to change to another bucket you have access to
prefix = 'sagemaker/rcf-benchmarks'
execution_role = sagemaker.get_execution_role()


# Probe the bucket up front. Known failures are reported with a friendly message
# (and the notebook keeps going); any other client error is re-raised untouched.
try:
    boto3.Session().client('s3').head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError:
    print('Hey! You either forgot to specify your S3 bucket'
          ' or you gave your bucket an invalid name!')
except botocore.exceptions.ClientError as err:
    # Map the error codes we know how to explain to their message templates.
    friendly_messages = {
        '403': "Hey! You don't have permission to access the bucket, {}.",
        '404': "Hey! Your bucket, {}, doesn't exist!",
    }
    error_code = err.response['Error']['Code']
    if error_code not in friendly_messages:
        raise
    print(friendly_messages[error_code].format(bucket))
else:
    print('Training input/output will be stored in: s3://{}/{}'.format(bucket, prefix))

Training input/output will be stored in: s3://sagemaker-us-east-2-602665643546/sagemaker/rcf-benchmarks


In [2]:
%%time

import numpy as np
import pandas as pd
import urllib.request

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

CPU times: user 531 ms, sys: 67.1 ms, total: 598 ms
Wall time: 1.3 s


In [3]:
# Load the raw transactions; the path is relative to the notebook's working directory.
txn_data = pd.read_csv('data/project_3_transactions_110k.csv')

In [4]:
# Peek at the first rows to sanity-check the columns that were loaded.
txn_data.head()

Unnamed: 0,registration_days,recency_days,visit_count,spend_amt,spend_adj_amt,ip_address,email_domain,billing_city,billing_postal,billing_state,txn_code,event_code,EVENT_TIMESTAMP,customer_name,billing_address,EVENT_LABEL
0,338.0,192.0,63.0,499.0,40.0,172.10.245.102,gonzalez.info,Hamiltonborough,63709,Kansas,W,Y,2020-03-29 20:57:21,Melissa Alvarez,3253 Kristin Harbor,legit
1,338.0,238.0,54.0,415.0,62.0,17.194.197.82,brooks.biz,Patrickchester,47923,North Dakota,X,T,2019-12-15 00:37:10,Cody Fowler,9068 Brandy Drive,fraud
2,354.0,151.0,47.0,639.0,41.0,196.15.82.180,smith-aguirre.com,West Ethanmouth,2548,Wyoming,L,B,2020-05-20 08:36:14,Stephanie Martin,9245 Thomas Estates,legit
3,351.0,151.0,53.0,698.0,33.0,155.109.172.212,kelly-jones.com,West Sarahfort,85278,West Virginia,U,Y,2020-05-10 11:57:32,Kimberly Anderson,7257 Stephanie Cliffs,legit
4,403.0,156.0,47.0,651.0,44.0,38.144.137.29,hamilton.biz,New Rhondaland,17058,Nebraska,T,R,2020-01-23 21:15:47,Jacqueline Smith,40977 Marissa Gateway Apt. 992,legit


In [5]:
# Column dtypes and non-null counts, to see which columns are numeric and whether any values are missing.
txn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110000 entries, 0 to 109999
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   registration_days  110000 non-null  float64
 1   recency_days       110000 non-null  float64
 2   visit_count        110000 non-null  float64
 3   spend_amt          110000 non-null  float64
 4   spend_adj_amt      110000 non-null  float64
 5   ip_address         110000 non-null  object 
 6   email_domain       110000 non-null  object 
 7   billing_city       110000 non-null  object 
 8   billing_postal     110000 non-null  int64  
 9   billing_state      110000 non-null  object 
 10  txn_code           110000 non-null  object 
 11  event_code         110000 non-null  object 
 12  EVENT_TIMESTAMP    110000 non-null  object 
 13  customer_name      110000 non-null  object 
 14  billing_address    110000 non-null  object 
 15  EVENT_LABEL        110000 non-null  object 
dtypes:

In [6]:
# Relative frequency of each label (normalize=True returns proportions rather than counts).
txn_data['EVENT_LABEL'].value_counts(normalize=True)

legit    0.949509
fraud    0.050491
Name: EVENT_LABEL, dtype: float64

In [7]:
# Columns fed to the model; every other column is dropped when the train/test frames are built.
numeric_features = [
    'registration_days',
    'recency_days',
    'visit_count',
    'spend_amt',
    'spend_adj_amt',
]

In [8]:
from sklearn.model_selection import train_test_split

# Keep only the model features plus the label, then hold out 20% of the rows
# for evaluation. The fixed random_state makes the split repeatable.
model_columns = numeric_features + ['EVENT_LABEL']
txn_data_train, txn_data_test = train_test_split(
    txn_data[model_columns],
    test_size=0.2,
    random_state=42,
)

In [9]:
# Label proportions in the train and test splits, shown side by side as a (train, test) tuple.
txn_data_train['EVENT_LABEL'].value_counts(normalize=True), txn_data_test['EVENT_LABEL'].value_counts(normalize=True)

(legit    0.949807
 fraud    0.050193
 Name: EVENT_LABEL, dtype: float64,
 legit    0.948318
 fraud    0.051682
 Name: EVENT_LABEL, dtype: float64)

In [10]:
# Plain NumPy feature matrices (label column excluded) for training and scoring.
txn_data_train_numpy = txn_data_train[numeric_features].to_numpy()
txn_data_test_numpy = txn_data_test[numeric_features].to_numpy()

In [11]:
# Encode the string labels as integers: 'fraud' is the positive class (1), 'legit' is 0.
# Both splits share one mapping so they cannot drift apart.
label_codes = {'legit': 0, 'fraud': 1}
y_true_train = txn_data_train['EVENT_LABEL'].map(label_codes)
y_true_test = txn_data_test['EVENT_LABEL'].map(label_codes)

In [12]:
# Forest hyperparameters: number of trees, and the value later passed as
# num_samples_per_tree when the estimator is built.
num_trees, tree_size = 200, 1000

In [13]:
from sagemaker import RandomCutForest

# NOTE(review): `session` is not referenced by any cell visible here; kept in case later cells rely on it.
session = sagemaker.Session()

# S3 locations handed to the estimator, both under the notebook's bucket/prefix.
s3_data_location = 's3://{}/{}/'.format(bucket, prefix)
s3_output_path = 's3://{}/{}/output'.format(bucket, prefix)

# General training job configuration: one ml.m4.xlarge instance and the
# forest hyperparameters chosen above.
rcf = RandomCutForest(
    role=execution_role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    data_location=s3_data_location,
    output_path=s3_output_path,
    num_samples_per_tree=tree_size,
    num_trees=num_trees,
)

# Convert the training matrix to a record set (this uploads it to S3), then
# launch the training job on it.
training_records = rcf.record_set(txn_data_train_numpy)
rcf.fit(training_records)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-11-09 20:40:13 Starting - Starting the training job...
2020-11-09 20:40:15 Starting - Launching requested ML instances......
2020-11-09 20:41:39 Starting - Preparing the instances for training......
2020-11-09 20:42:43 Downloading - Downloading input data
2020-11-09 20:42:43 Training - Downloading the training image.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing.nosetester import import_nose[0m
  from numpy.testing.decorators import setastest[0m
[34m[11/09/2020 20:43:21 INFO 140519099184960] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_ftp_port': 8999, u'num_samples_per_tree': 256, u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'_kvstore': u'dist_async', u'force_dense': u'true', u'epochs': 1, u'num_trees': 100, u'eval_metrics': [u'accuracy', u'precision_recall_fscore'], u'_num_kv

In [14]:
# Record the name of the job that just ran so it can be looked up later.
latest_job = rcf.latest_training_job
training_job_name = latest_job.job_name
print('Training job name: {}'.format(training_job_name))

Training job name: randomcutforest-2020-11-09-20-40-13-188


In [15]:
# Deploy the trained model behind an endpoint served by a single ml.m4.xlarge instance.
rcf_inference = rcf.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


---------------!

In [16]:
# Show which endpoint the predictor talks to.
endpoint_name = rcf_inference.endpoint
print('Endpoint name: {}'.format(endpoint_name))

Endpoint name: randomcutforest-2020-11-09-20-40-13-188


In [17]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Configure the predictor's wire formats: requests are serialized as CSV
# ('text/csv') and responses are requested as JSON and parsed with json_deserializer.
rcf_inference.content_type = 'text/csv'
rcf_inference.serializer = csv_serializer
rcf_inference.accept = 'application/json'
rcf_inference.deserializer = json_deserializer

In [18]:
# Score the training rows in fixed-size batches instead of one giant request.
# A real-time endpoint caps the size of a single InvokeEndpoint payload, so
# sending the whole matrix at once stops working as soon as the row or feature
# count grows. Batches are sent in row order, so the concatenated scores line
# up with the rows of txn_data_train_numpy.
request_rows = 10000  # rows per request; lower this if many more feature columns are added

# `results` keeps the same {'scores': [...]} shape a single predict() call returns.
results = {'scores': []}
for start in range(0, len(txn_data_train_numpy), request_rows):
    batch = txn_data_train_numpy[start:start + request_rows]
    results['scores'].extend(rcf_inference.predict(batch)['scores'])

# One anomaly score per input row.
scores = [datum['score'] for datum in results['scores']]

In [19]:
# Re-key the scores by the training frame's index so they line up with y_true_train,
# then summarise their distribution.
scores = pd.Series(data=scores, index=txn_data_train.index)
scores.describe()

count    88000.000000
mean         0.702429
std          0.199211
min          0.490593
25%          0.578808
50%          0.642927
75%          0.754937
max          3.218048
dtype: float64

In [20]:
# Min-max rescale the training scores to [0, 1]. ROC AUC depends only on how the
# scores rank the rows, so the rescaling does not change the reported value.
lowest, highest = scores.min(), scores.max()
scores_normalized = (scores - lowest) / (highest - lowest)

roc_auc_score(y_true_train, scores_normalized)

0.8538794868152556

In [21]:
# Score the held-out rows in fixed-size batches instead of one giant request.
# A real-time endpoint caps the size of a single InvokeEndpoint payload, so
# sending the whole matrix at once stops working as soon as the row or feature
# count grows. Batches are sent in row order, so the concatenated scores line
# up with the rows of txn_data_test_numpy.
request_rows = 10000  # rows per request; lower this if many more feature columns are added

# `results` keeps the same {'scores': [...]} shape a single predict() call returns.
results = {'scores': []}
for start in range(0, len(txn_data_test_numpy), request_rows):
    batch = txn_data_test_numpy[start:start + request_rows]
    results['scores'].extend(rcf_inference.predict(batch)['scores'])

# One anomaly score per input row.
scores = [datum['score'] for datum in results['scores']]

In [22]:
# Re-key the scores by the test labels' index so the two Series align row for row,
# then summarise their distribution.
scores = pd.Series(data=scores, index=y_true_test.index)
scores.describe()

count    22000.000000
mean         0.701673
std          0.198797
min          0.488803
25%          0.579630
50%          0.642752
75%          0.753432
max          3.290250
dtype: float64

In [23]:
# Min-max rescale the test scores to [0, 1] using the test set's own range. ROC AUC
# depends only on how the scores rank the rows, so this does not change the reported value.
lowest, highest = scores.min(), scores.max()
scores_normalized = (scores - lowest) / (highest - lowest)

roc_auc_score(y_true_test, scores_normalized)

0.8594034179760739

Uncomment and run the line below to delete the endpoint once you are finished; a deployed endpoint keeps incurring charges until it is deleted.

In [25]:
# Cleanup: deliberately left commented out so that running the whole notebook does
# not tear down the endpoint. Uncomment to delete the endpoint behind rcf_inference.
#sagemaker.Session().delete_endpoint(rcf_inference.endpoint)