# Fraud Detection based on Synthetic Financial Datasets - SageMaker Logistic Regression

##### By: Ling Jiang, Xinyue Jin, Ming Ki Toby Cheng

## 0) Startup

### 0.a) Import packages

In [1]:
import boto3, os, sagemaker
from sagemaker import get_execution_role
import warnings
warnings.filterwarnings("ignore")

### 0.b) Get Amazon IAM execution role and instance region

In [2]:
# IAM execution role of this notebook instance; needed later when the
# estimator is defined.
iam_role = get_execution_role()

# Region configured for the default boto3 session
my_region = boto3.Session().region_name

# SageMaker session handle built on top of a boto3 session
sm_session = sagemaker.session.Session(boto_session=boto3.Session())

region_message = "Success - the SageMaker instance is in the " + my_region + " region"
print(region_message)

Success - the SageMaker instance is in the us-west-2 region


## 1) Load and examine data


### 1.a) Load CSV data from S3


In [3]:
import pandas as pd
import pickle

# S3 location of the raw transaction log
bucket = "bigdat2proj"
prefix = "data"
raw_csv_uri = "s3://{}/{}/PS_20174392719_1491204439457_log.csv".format(bucket, prefix)

# Read the whole CSV straight from S3 into a DataFrame
df = pd.read_csv(raw_csv_uri)

### 1.b) Examine data schema and counts

In [4]:
# Preview the first rows of the loaded transaction log
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Summarize row count, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


### 1.c) Create new features & change data type


In [11]:
# Show the first few rows labelled as fraud (isFraud == 1)
df[df['isFraud']==1].head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
251,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0
252,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1,0
680,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1,0


In [16]:
# Engineered features, added as new columns on df:
#   OrigDiff     - change in the originating account's balance (new - old)
#   DestDiff     - change in the destination account's balance (new - old)
#   AmountEqOrig - 1 when the amount equals the originator's old balance
#   CusToCus     - 1 when both account IDs begin with "C"
#                  (IDs in the preview begin with "C" or "M"; presumably
#                  customer vs. merchant - confirm against the dataset docs)
df["OrigDiff"] = df['newbalanceOrig'] - df['oldbalanceOrg']
df["DestDiff"] = df['newbalanceDest'] - df['oldbalanceDest']
df["AmountEqOrig"] = (df['amount'] == df['oldbalanceOrg']).astype(int)

# Test the ID prefix explicitly: str.contains("C") is an unanchored regex
# search and would also match a "C" anywhere else in the ID.
orig_is_customer = df['nameOrig'].str.startswith("C")
dest_is_customer = df['nameDest'].str.startswith("C")
df["CusToCus"] = (orig_is_customer & dest_is_customer).astype(int)

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,OrigDiff,DestDiff,AmountEqOrig,CusToCus
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,-9839.64,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1864.28,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,-181.0,0.0,1,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,-181.0,-21182.0,1,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,-11668.14,0.0,0,0


In [17]:
# One-hot encode the transaction type: one 0/1 indicator column per value.
# The pairs are (value in df["type"], name of the indicator column); order
# determines the order in which the new columns are appended.
type_to_column = [
    ("PAYMENT", 'payment'),
    ("TRANSFER", 'transfer'),
    ("CASH_OUT", 'cash-out'),
    ("DEBIT", 'debit'),
    ("CASH_IN", 'cash-in'),
]
for type_value, indicator_column in type_to_column:
    df[indicator_column] = (df["type"] == type_value).astype(int)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,OrigDiff,DestDiff,AmountEqOrig,CusToCus,payment,transfer,cash-out,debit,cash-in
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,-9839.64,0.0,0,0,1,0,0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1864.28,0.0,0,0,1,0,0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,-181.0,0.0,1,1,0,1,0,0,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,-181.0,-21182.0,1,1,0,0,1,0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,-11668.14,0.0,0,0,1,0,0,0,0


### 1.d) Select columns


In [18]:
# Columns kept for modelling, in order: type indicators, numeric/engineered
# features, then the label.
colNames = (
    ['payment', 'transfer', 'cash-out', 'debit', 'cash-in']
    + ['amount', 'OrigDiff', 'DestDiff', 'AmountEqOrig', 'CusToCus']
    + ['isFraud']
)


In [19]:
# Keep only the modelling columns listed in colNames
df_selected = df.loc[:, colNames]
df_selected.head()

Unnamed: 0,payment,transfer,cash-out,debit,cash-in,amount,OrigDiff,DestDiff,AmountEqOrig,CusToCus,isFraud
0,1,0,0,0,0,9839.64,-9839.64,0.0,0,0,0
1,1,0,0,0,0,1864.28,-1864.28,0.0,0,0,0
2,0,1,0,0,0,181.0,-181.0,0.0,1,1,1
3,0,0,1,0,0,181.0,-181.0,-21182.0,1,1,1
4,1,0,0,0,0,11668.14,-11668.14,0.0,0,0,0


## 2) Prepare data

In [20]:
# Feature columns fed to the model and the name of the label column
featureCols = ['payment', 'transfer', 'cash-out','debit','cash-in',
                'amount', 'OrigDiff', 'DestDiff', 'AmountEqOrig', 'CusToCus']
labelCol = 'isFraud'

# Select features and label in a single indexing step, then drop rows with
# missing values. Assigning the label onto a slice of df_selected afterwards
# is chained assignment and triggers pandas' SettingWithCopyWarning.
df_cleaned = df_selected[featureCols + [labelCol]].dropna()

print("Number of features is ",len(featureCols))

print("Number of rows in cleaned DataFrame",df_cleaned.shape[0])

Number of features is  10
Number of rows in cleaned DataFrame 6362620


In [21]:
# Linear Learner expects the label in the first column, so move 'isFraud'
# to the front and keep the remaining columns in their existing order.
cols = list(df_cleaned.columns)
label_position = cols.index('isFraud')
cols = [cols[label_position]] + cols[:label_position] + cols[label_position + 1:]
df_cleaned2 = df_cleaned.reindex(columns=cols)

In [22]:
# Confirm that isFraud is now the leading column
df_cleaned2.head()

Unnamed: 0,isFraud,payment,transfer,cash-out,debit,cash-in,amount,OrigDiff,DestDiff,AmountEqOrig,CusToCus
0,0,1,0,0,0,0,9839.64,-9839.64,0.0,0,0
1,0,1,0,0,0,0,1864.28,-1864.28,0.0,0,0
2,1,0,1,0,0,0,181.0,-181.0,0.0,1,1
3,1,0,0,1,0,0,181.0,-181.0,-21182.0,1,1
4,0,1,0,0,0,0,11668.14,-11668.14,0.0,0,0


### 2.a) Split the data and save it to S3

In [23]:
# Shuffle the data, then split it train/val/test at 6:2:2

import numpy as np

# Fixed random_state keeps the shuffle (and therefore the split) reproducible
shuffled_df = df_cleaned2.sample(frac=1, random_state=1729)

# Split boundaries at 60% and 80% of the rows
n_rows = len(shuffled_df)
train_end = int(0.6 * n_rows)
val_end = int(0.8 * n_rows)

# Slice positionally instead of calling np.split on a DataFrame, which goes
# through a pandas code path that recent releases deprecate.
x_train_df = shuffled_df.iloc[:train_end]
x_val_df = shuffled_df.iloc[train_end:val_end]
x_test_df = shuffled_df.iloc[val_end:]

x_train = x_train_df.values
x_val = x_val_df.values
x_test = x_test_df.values[:,1:]  # remove the label column (column 0)

In [24]:
# Save each split as a local CSV ("train.csv", "val.csv", "test.csv") and
# upload it to S3 under <prefix>/<split>_data/data.csv.

bucket = "bigdat2proj"       # CHANGE ME: your own bucket
prefix = "data"   # CHANGE ME: your own folder

# The test array holds features only; its label column was removed above.
datasets = [('train', x_train), ('val', x_val), ('test', x_test)]

# Build the bucket handle once instead of a new session/resource per upload
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

# S3 URIs in the same order as `datasets`: train, val, test
data_locations = []
for key, x in datasets:
    local_path = "{}.csv".format(key)
    s3_key = "{}/{}_data/data.csv".format(prefix, key)
    location = "s3://{}/{}".format(bucket, s3_key)
    data_locations.append(location)
    print('{} data will be uploaded to: {}'.format(key, location))
    # Header-less, comma-separated rows
    np.savetxt(local_path, x, fmt='%s', delimiter=',')
    s3_bucket.Object(s3_key).upload_file(local_path)


train data will be uploaded to: s3://bigdat2proj/data/train_data/data.csv
val data will be uploaded to: s3://bigdat2proj/data/val_data/data.csv
test data will be uploaded to: s3://bigdat2proj/data/test_data/data.csv


In [25]:
# S3 prefix under which training artifacts are written
output_location = "s3://{}/{}/output".format(bucket, prefix)

artifact_message = 'training artifacts will be uploaded to: {}'.format(output_location)
print(artifact_message)

training artifacts will be uploaded to: s3://bigdat2proj/data/output


## 3) Train a logistic regression model with static hyperparameters

### 3.1) Train the model

In [26]:
# Input channels for the training job.
# NOTE (from the original author): with CSV files, all data is saved to the
#       worker containers before training starts; only libsvm supports real
#       data streaming.

train_uri = data_locations[0]   # first uploaded location (train split)
val_uri = data_locations[1]     # second uploaded location (val split)

train_channel = sagemaker.inputs.s3_input(train_uri, content_type='text/csv')
val_channel = sagemaker.inputs.s3_input(val_uri, content_type='text/csv')

# Channel names map to the S3 inputs handed to fit()
data_channels = dict(train=train_channel, validation=val_channel)

In [27]:
# Build the estimator for the built-in 'linear-learner' algorithm image

from sagemaker.amazon.amazon_estimator import get_image_uri

smclient = boto3.Session().client('sagemaker')

# Algorithm container image for the current region
training_image = get_image_uri(boto3.Session().region_name, 'linear-learner')

# Training infrastructure and output settings
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.m5.xlarge',
    train_volume_size=5,
    output_path=output_location,
    sagemaker_session=sagemaker.Session(),
)

ll_model = sagemaker.estimator.Estimator(training_image, iam_role, **estimator_settings)

In [28]:
# Set estimator hyperparameters.
# feature_dim is derived from featureCols so it cannot drift from the number
# of feature columns actually written to the training CSV.

ll_model.set_hyperparameters(feature_dim=len(featureCols), predictor_type='binary_classifier')

In [29]:
# Launch the training job on the train/validation channels

ll_model.fit(inputs=data_channels)

2020-03-16 02:37:32 Starting - Starting the training job...
2020-03-16 02:37:34 Starting - Launching requested ML instances......
2020-03-16 02:38:33 Starting - Preparing the instances for training...
2020-03-16 02:39:20 Downloading - Downloading input data...
2020-03-16 02:39:55 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/16/2020 02:39:57 INFO 140319822792512] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step

In [32]:
# Display the S3 URI of the trained model artifact
ll_model.model_data

's3://bigdat2proj/data/output/linear-learner-2020-03-16-02-37-32-561/output/model.tar.gz'

### 3.2) Deploy model for interactive inference

1. Spawn a Real Time Predictor using Estimator.deploy()
2. Run real time inference for `test.csv`


In [223]:
# Stand up a real-time inference endpoint for the trained model.
# This will take several minutes.
ll_predictor = ll_model.deploy(
    instance_type='ml.t2.xlarge',
    initial_instance_count=1,
    content_type='text/csv',
)

Using already existing model: linear-learner-2020-03-16-02-37-32-561


---------------!

In [225]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are serialized as CSV; responses are deserialized from JSON
ll_predictor.serializer = csv_serializer
ll_predictor.deserializer = json_deserializer
ll_predictor.content_type = 'text/csv'

In [236]:
def predict_file(input_path, predictor, limit=1000):
    """
    Send the rows of a local CSV file, one at a time, to a real-time predictor
    and collect the predicted labels.

    input_path: path to a local CSV file (e.g. "test.csv"), one record per line.
    predictor: object whose predict(row) returns a mapping shaped like
               {'predictions': [{'predicted_label': ...}, ...]}.
    limit: maximum number of rows to send; None sends every row and a value
           <= 0 sends none.

    Returns a numpy integer array with one predicted label per row sent.
    Blank lines are skipped and line terminators are stripped before sending.
    """
    with open(input_path, 'r') as f:
        # Ignore empty lines (e.g. a trailing newline at end of file) so they
        # are never sent to the endpoint as a record.
        rows = [line.strip() for line in f if line.strip()]

    if limit is not None:
        rows = rows[:max(limit, 0)]

    total = len(rows)
    res = []
    # Count from 1 so the progress display ends at total/total
    for done, row in enumerate(rows, start=1):
        res.append(predictor.predict(row)['predictions'])
        print("Progress: {}/{}".format(done, total), end='\r')

    # Each response carries a single prediction; keep its label as an int
    y_pred = np.array([e[0]['predicted_label'] for e in res]).astype(int)

    return y_pred

In [None]:
# Run inference over the locally saved test file
from sklearn.metrics import confusion_matrix, classification_report

# limit=None sends every row of test.csv to the endpoint
y_pred = predict_file(input_path="test.csv", predictor=ll_predictor, limit=None)

Progress: 1272523/1272524

In [None]:
# Ground-truth labels of the test split: the first column of x_test_df
y_true = x_test_df.iloc[:, 0]

In [None]:
# Number of ground-truth labels in the test split
len(y_true)

1272524

In [None]:
# Confusion matrix of true vs. predicted labels on the test split
confusion_matrix(y_true,y_pred)

array([[1270820,      12],
       [      8,    1684]])

In [None]:
# Flatten the 2x2 confusion matrix and name its four cells
flat_counts = confusion_matrix(y_true, y_pred).ravel()
tn, fp, fn, tp = flat_counts
(tn, fp, fn, tp)

(1270820, 12, 8, 1684)

In [None]:
# Per-class precision, recall, F1 and support
report_text = classification_report(y_true, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270832
           1       0.99      1.00      0.99      1692

   micro avg       1.00      1.00      1.00   1272524
   macro avg       1.00      1.00      1.00   1272524
weighted avg       1.00      1.00      1.00   1272524



In [None]:
# F1 score: F-beta with beta = 1
from sklearn import metrics
from sklearn.metrics import roc_auc_score

f1 = metrics.fbeta_score(y_true=y_true, y_pred=y_pred, beta=1)
f1

0.9940968122786304

In [None]:
# F2 score: F-beta with beta = 2
fbeta = metrics.fbeta_score(y_true=y_true, y_pred=y_pred, beta=2)
fbeta

0.9948015122873347

In [None]:
# AUC
# NOTE(review): y_pred holds the hard 0/1 labels returned by predict_file
# (it casts 'predicted_label' to int), so this value reflects a single
# operating point rather than a ranking over continuous scores - confirm
# whether the model's score output should be used here instead.
roc_auc_score(y_true, y_pred)

0.9976312124897176

In [None]:
# ROC curve points and the area under them.
# NOTE(review): y_pred contains hard 0/1 labels, so the curve is built from
# label values rather than continuous scores - confirm this is intended.
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred)
roc_auc = metrics.auc(fpr, tpr) # compute area under the curve

In [None]:
# Delete the endpoint once inference is finished so it stops incurring cost;
# the model itself can optionally be deleted as well.
ll_predictor.delete_endpoint()
# ll_predictor.delete_model()