In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3
import re
import sagemaker
import seaborn as sns


# Pandas display settings used for every table rendered in this notebook.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.3f}'.format)


# Execution role handed to the estimator further down.
role = sagemaker.get_execution_role()

# The session manages interactions with the Amazon SageMaker APIs and the other
# AWS services involved (training jobs, endpoints, input datasets in S3).
sess = sagemaker.Session()

# All artifacts of this notebook live in the session's default bucket, under `prefix`.
bucket = sess.default_bucket()
prefix = 'DEMO-random-cut-forest'

## Import the Dataset

In [22]:
%%time

import pandas as pd
import urllib.request

# Location of the CSV, relative to the notebook's working directory.
data_filename = './data/creditcard.csv'

# Load the full table; its 'Class' column drives the split below.
credit_df = pd.read_csv(data_filename, delimiter=',')

# Rows with Class == 0, without the Class column.
inliers = credit_df.loc[credit_df['Class'] == 0].drop(columns=['Class'])

# Rows with Class == 1: first the Class values on their own, then the
# remaining columns of those same rows.
outliers_class = credit_df.loc[credit_df['Class'] == 1, 'Class']
outliers = credit_df.loc[credit_df['Class'] == 1].drop(columns=['Class'])

# From here on credit_df holds every row but no longer has a Class column.
credit_df = credit_df.drop(columns=['Class'])

CPU times: user 2.25 s, sys: 94.6 ms, total: 2.34 s
Wall time: 2.39 s


## Store Data on S3
The Random Cut Forest algorithm accepts data in RecordIO Protobuf format. The SageMaker Python SDK provides helper functions for converting data into this format; in this notebook that conversion is done by `rcf.record_set` when training starts. Below we define a helper that uploads the credit card transaction data as CSV to the bucket + prefix Amazon S3 destination specified in the setup cell at the beginning of this notebook.

In [5]:
def save_s3_file_csv(bucket, file_data, dataset, header=None, index=None):
    """Serialize a DataFrame to CSV in memory and upload it to S3 as one object.

    Parameters
    ----------
    bucket : str
        Name of the destination S3 bucket.
    file_data : str
        Object key the CSV text is written to.
    dataset : pandas.DataFrame
        Data to serialize; it is written with ``dataset.to_csv``.
    header, index :
        Forwarded unchanged to ``dataset.to_csv``. With the ``None`` defaults
        pandas writes neither a header row nor the index column.

    Returns
    -------
    The response returned by the S3 client's ``put_object`` call.
    """
    # Imported locally so the helper does not depend on a later notebook cell
    # having imported `io` before it is called.
    import io

    csv_buffer = io.StringIO()
    dataset.to_csv(csv_buffer, header=header, index=index)

    s3_client = boto3.client('s3')
    return s3_client.put_object(Bucket=bucket, Key=file_data, Body=csv_buffer.getvalue())

## Defining the training job and creating the estimator

In [23]:
from sagemaker import RandomCutForest
import sagemaker
import os, io


# S3 key prefix for the CSV copy of the training rows. Built with an explicit
# '/' because S3 keys use forward slashes regardless of the local path separator.
train_channel = '{}/train'.format(prefix)

# Number of feature columns (credit_df no longer contains 'Class' at this point).
feature_dim = credit_df.shape[1]

# Save the inlier rows as CSV inside the training folder. The object key must
# sit under train_channel; passing the bare prefix would create a single
# object named like the prefix itself instead of a file in the folder.
train_key = '{}/inliers.csv'.format(train_channel)
response = save_s3_file_csv(bucket, train_key, inliers,
                            header=None,
                            index=None)


# Random Cut Forest estimator. The training log reports algorithm defaults of
# num_trees=100 and num_samples_per_tree=256; both are raised here.
rcf = RandomCutForest(role=role,
                      train_instance_count=1,
                      train_instance_type='ml.m4.xlarge',
                      data_location='s3://{}/{}/train_rcf/'.format(bucket, prefix),  # folder, note the trailing '/'
                      output_path='s3://{}/{}/output'.format(bucket, prefix),
                      num_samples_per_tree=300,
                      num_trees=200)

## Start the training process on Amazon SageMaker

In [24]:
# Wrap the inlier feature matrix in the record set the estimator trains on,
# then launch the training job with it.
training_records = rcf.record_set(inliers.values)
rcf.fit(training_records)

2019-10-21 10:19:54 Starting - Starting the training job...
2019-10-21 10:19:59 Starting - Launching requested ML instances.........
2019-10-21 10:21:31 Starting - Preparing the instances for training......
2019-10-21 10:22:29 Downloading - Downloading input data...
2019-10-21 10:23:24 Training - Training image download completed. Training in progress..[31mDocker entrypoint called with argument(s): train[0m
  from numpy.testing.nosetester import import_nose[0m
  from numpy.testing.decorators import setastest[0m
[31m[10/21/2019 10:23:28 INFO 140115100460864] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_ftp_port': 8999, u'num_samples_per_tree': 256, u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'_kvstore': u'dist_async', u'force_dense': u'true', u'epochs': 1, u'num_trees': 100, u'eval_metrics': [u'accuracy', u'precision_recall_fscore'], u'_num_kv_servers': u'auto', u'mini_bat

[31m[10/21/2019 10:23:31 INFO 140115100460864] Master node: Serializing the RandomCutForest model[0m
[31m#metrics {"Metrics": {"serialize_model.time": {"count": 1, "max": 1633.1660747528076, "sum": 1633.1660747528076, "min": 1633.1660747528076}}, "EndTime": 1571653411.601644, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "RandomCutForest"}, "StartTime": 1571653409.968425}
[0m
[31m[10/21/2019 10:23:31 INFO 140115100460864] Test data is not provided.[0m
[31m[I 19-10-21 10:23:31] >>> shutting down FTP server (0 active fds) <<<[0m
[31m[10/21/2019 10:23:31 INFO 140113487034112] >>> shutting down FTP server (0 active fds) <<<[0m
[31m#metrics {"Metrics": {"totaltime": {"count": 1, "max": 3655.3308963775635, "sum": 3655.3308963775635, "min": 3655.3308963775635}, "setuptime": {"count": 1, "max": 212.69989013671875, "sum": 212.69989013671875, "min": 212.69989013671875}}, "EndTime": 1571653411.730252, "Dimensions": {"Host": "algo-1", "Operation": "training", "A

## Deploying the model

In [25]:
# Host the trained model on a single instance; the confirmation below is
# printed once the deploy call has returned.
predictor = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    wait=True,
)
print('Deployment completed')

--------------------------------------------------------------------------------------------------------------------------!Deployment completed


## Evaluating the Model

In [92]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV and parse the JSON that comes back.
predictor.serializer = csv_serializer
predictor.content_type = 'text/csv'
predictor.deserializer = json_deserializer
predictor.accept = 'application/json'

# Score the outlier rows; each entry under 'scores' carries a 'score' value.
results = predictor.predict(outliers.values)
scores = np.array([entry['score'] for entry in results['scores']])

# Summary statistics of the returned scores. They are not used for the cutoff
# in this cell, which is a fixed constant.
score_mean = scores.mean()
score_std = scores.std()
score_cutoff = 0.76
print(score_cutoff)

# Scores above the cutoff, and their positions within `scores`.
above_cutoff = scores > score_cutoff
anomalies = scores[above_cutoff]
anomaly_indices = np.arange(len(scores))[above_cutoff]

0.76


## Detection rate on the labelled outliers

In [93]:
# Percentage of the rows in `outliers` whose score exceeded score_cutoff.
# NOTE(review): the label says "Legit Cases", but the ratio is taken over
# `outliers` (the Class == 1 rows) - confirm the intended wording.
detected_pct = len(anomalies) / len(outliers) * 100
print("Accuracy in Detecting Legit Cases:", detected_pct, '%')

Accuracy in Detecting Legit Cases: 91.46341463414635 %
