In [1]:
import cudf
import cuml

## Load the data and split into training and test sets

In [2]:
# Load the cleaned transaction sample from Parquet into a cuDF DataFrame.
df = cudf.read_parquet("fraud-cleaned-sample.parquet")  

In [3]:
# Chronological split point: the cutoff sits 70% of the way through the
# observed timestamp range.
timestamps = df['timestamp']
first, last = timestamps.min(), timestamps.max()
cutoff = first + ((last - first) * 0.7)

In [4]:
# Rows at or before the cutoff form the training set.
in_train = df['timestamp'] <= cutoff
train = df[in_train]
len(train)

1748016

In [5]:
# Rows strictly after the cutoff form the held-out test set.
in_test = df['timestamp'] > cutoff
test = df[in_test]
len(test)

751984

In [6]:
# Show which columns are available before choosing features to encode.
df.columns

Index(['timestamp', 'label', 'user_id', 'amount', 'merchant_id', 'trans_type',
       'foreign', 'interarrival'],
      dtype='object')

# Encoding categorical features
We'll start by encoding the categorical features. Within our data set, the categorical features are: 

- merchant id
- transaction type

We will use one-hot encoding for transaction types and target encoding for merchant IDs.

In [7]:
from cuml.preprocessing import OneHotEncoder

# Pass the transaction-type vocabulary explicitly instead of letting the
# encoder infer it from whatever values appear in the training rows.
trans_type_categories = cudf.DataFrame(
    ['chip_and_pin', 'contactless', 'manual', 'online', 'swipe'],
    columns=['trans_type'],
)

ohe = OneHotEncoder(categories=trans_type_categories, sparse=False)
ttype = ohe.fit_transform(X=train[['trans_type']])
ttype

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.]])

Next, we train a target encoder that replaces each merchant ID with the proportion of that merchant's transactions that were legitimate.

In [8]:
from cuml.preprocessing.LabelEncoder import LabelEncoder
from cuml.preprocessing.TargetEncoder import TargetEncoder

# Fit the label encoder on the two known classes, then convert the training
# labels to their numeric codes.
labelizer = LabelEncoder()
labelizer.fit(y=cudf.Series(['legitimate', 'fraud']))
encoded_labels = labelizer.transform(train["label"])

# Target-encode each merchant id against the encoded labels.
encoder = TargetEncoder()
merchant_ids = train["merchant_id"].values
fr_rate = encoder.fit_transform(x=merchant_ids, y=encoded_labels)
fr_rate

array([0.99678198, 0.99784637, 0.99852071, ..., 1.        , 0.99924981,
       0.99848714])

## Encoding Numerical Features


We impute the median interarrival time wherever the interarrival time is missing. These gaps correspond to the first transaction made by each user: with no previous transaction, there is no interarrival time. We then use a robust scaler to scale the imputed values.

We also use a robust scaler to transform the transaction amounts. 

In [9]:
from cuml.experimental.preprocessing import RobustScaler, SimpleImputer

med_imputer = SimpleImputer(strategy="median")
robust_scale = RobustScaler()

# First fill the missing interarrival times with the median, then scale the
# completed column.
interarrival = train[['interarrival']]
imputed = med_imputer.fit_transform(interarrival)
arr_t = robust_scale.fit_transform(imputed)

In [10]:
# The transaction amount gets its own scaler instance, fitted on the training rows.
amount_scaler = RobustScaler()
amounts = train[['amount']]
val_t = amount_scaler.fit_transform(amounts)

In [11]:
# Assemble the model's feature matrix column by column from the encoded
# pieces computed in the cells above.
features = cudf.DataFrame()

# Robust-scaled transaction amount.
features["amount"]=val_t
# Median-imputed, robust-scaled interarrival time.
features["arrival"]= arr_t
# Target-encoded merchant id.
features["prop"]=fr_rate
# One column per transaction type; the names follow the category order that
# was passed to the OneHotEncoder.
features[["chip_and_pin", "contactless", "manual", "online", "swipe"]]= cudf.DataFrame(ttype)
features

Unnamed: 0,amount,arrival,prop,chip_and_pin,contactless,manual,online,swipe
0,0.309371,-0.212208,0.996782,1.0,0.0,0.0,0.0,0.0
1,0.512837,-0.227280,0.997846,1.0,0.0,0.0,0.0,0.0
2,-0.720154,0.056820,0.998521,0.0,1.0,0.0,0.0,0.0
3,-0.887677,-0.190957,0.999237,0.0,0.0,0.0,1.0,0.0
4,0.378049,0.003768,0.999224,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1748011,-0.962131,6.338809,1.000000,1.0,0.0,0.0,0.0,0.0
1748012,1.417202,-0.210550,1.000000,0.0,0.0,0.0,0.0,1.0
1748013,-0.830552,5.537754,1.000000,1.0,0.0,0.0,0.0,0.0
1748014,-0.752888,-0.020950,0.999250,0.0,0.0,0.0,1.0,0.0


# Visualising the data

In [12]:
from cuml import UMAP

In [13]:
%%time
# Fit UMAP on every training row; the timing is the baseline for the
# subsampled fits in the following cells.
umap_all = UMAP().fit(features)

CPU times: user 34.1 s, sys: 31.5 s, total: 1min 5s
Wall time: 1min 5s


In [14]:
# Use one tenth of the training rows (rounded down) for the subsampled UMAP fit.
n_rows = features.shape[0]
sample_size = n_rows // 10
sample_size

174801

In [15]:
%%time
umap_xform_small = UMAP().fit(features.sample(50000))

CPU times: user 165 ms, sys: 96.3 ms, total: 262 ms
Wall time: 261 ms


In [16]:
%%time
umap_xform = UMAP().fit(features.sample(sample_size))

CPU times: user 927 ms, sys: 116 ms, total: 1.04 s
Wall time: 1.04 s


In [17]:
# Project every training row with the subsample-fitted model and name the two
# embedding columns.
trained_UMAP = umap_xform.transform(features).rename(columns={0: "x", 1: "y"})

# Attach the labels positionally: `train` keeps the index of the original
# frame, so it is reset before the column is assigned.
trained_UMAP["label"] = train["label"].reset_index(drop=True)

# Copy to pandas for the plotting code that follows.
pd_umap = trained_UMAP.to_pandas()

In [18]:
import pandas as pd

# Draw the same number of points (2500) from each class so that the fraud
# class is visible in the plot. The draw is seeded so the figure is the same
# on every run.
vis_sample = pd.concat([
    pd_umap[pd_umap["label"] == label].sample(2500, random_state=404)
    for label in ["legitimate", "fraud"]
])
vis_sample

Unnamed: 0,x,y,label
389060,-3.139834,-7.573643,legitimate
1473876,-5.477855,-7.679142,legitimate
973767,-7.917494,-7.304850,legitimate
325694,4.012187,-5.437031,legitimate
233195,-0.989968,-11.639940,legitimate
...,...,...,...
755807,0.124366,0.404422,fraud
820212,12.405904,3.724119,fraud
198488,2.991401,-11.276748,fraud
1342227,4.472794,9.977864,fraud


In [19]:
import altair as alt

# Scatter plot of the 2-D embedding, coloured by label; translucent points
# keep dense regions readable.
points = alt.Chart(vis_sample).mark_point(opacity=0.3)
encoded = points.encode(x="x:Q", y="y:Q", color="label")
encoded.properties(width=500, height=500).interactive()


# Training a model



In [20]:
from cuml.ensemble import RandomForestClassifier as RFC

In [21]:
# Random forest with 16 trees of depth at most 8; the seed is fixed (404).
rfc = RFC(n_estimators=16, max_depth=8, random_state=404)


  """Entry point for launching an IPython kernel.


In [22]:
%%time
# Train the forest on the assembled feature matrix, with features and labels
# both cast to float32.
# NOTE(review): the class labels are cast to float32 here; check whether the
# classifier expects integer class labels instead.
rfc.fit(X=features.astype("float32"), y=encoded_labels.astype("float32"))

CPU times: user 321 ms, sys: 50.2 ms, total: 371 ms
Wall time: 84.3 ms


RandomForestClassifier(split_criterion=0, handle=<cuml.raft.common.handle.Handle object at 0x7fc93790ee90>, verbose=4, output_type='input')

In [23]:
%%time
# Predict on the same features the model was trained on. This shows that
# inference runs and how long it takes; it is not a held-out evaluation.
rfc.predict(X=features.astype("float32"), predict_model='GPU')

CPU times: user 892 ms, sys: 0 ns, total: 892 ms
Wall time: 187 ms


0          1.0
1          1.0
2          1.0
3          1.0
4          1.0
          ... 
1748011    1.0
1748012    1.0
1748013    1.0
1748014    1.0
1748015    1.0
Length: 1748016, dtype: float32

# Putting it all together with Pipelines

In [24]:
import cupy

from pipelines import SerialPipelineNode, PrefittedPipelineNode, PassThroughPipelineNode, CombiningPipelineNode, none


In [25]:
def get_thelabel(d):
  """Return the ``'label'`` entry of ``d`` (single-key lookup)."""
  key = 'label'
  return d[key]

def get_merchant_id(d):
  """Return the ``'merchant_id'`` entry of ``d`` (single-key lookup)."""
  key = 'merchant_id'
  return d[key]

def get_ttype(d):
  """Return ``d`` indexed by the one-element column list ``['trans_type']``."""
  columns = ['trans_type']
  return d[columns]

def get_amount(d):
  """Return ``d`` indexed by the one-element column list ``['amount']``."""
  columns = ['amount']
  return d[columns]


In [26]:

# Rebuild the label encoder (same two classes as in the earlier cell) and the
# transaction-type vocabulary used by the one-hot encoder.
labelizer = LabelEncoder()
labelizer.fit(y=cudf.Series(['legitimate', 'fraud']))
t_types = cudf.DataFrame(['chip_and_pin', 'contactless', 'manual', 'online', 'swipe'], columns=['trans_type'])

# Target-encoding sub-pipeline: encoded labels as `ykey`, raw merchant ids as
# the single `xkeys` entry, combined by a TargetEncoder.
merchant_te = CombiningPipelineNode({
    'labels' : SerialPipelineNode(get_thelabel, [PrefittedPipelineNode(labelizer)]),
    'merchants' : PassThroughPipelineNode(get_merchant_id),
}, xkeys=['merchants'], ykey='labels', combiner=TargetEncoder())

# Top-level pipeline whose combiner is a random forest with default settings
# (the standalone `rfc` above used n_estimators=16, max_depth=8, random_state=404).
# NOTE(review): 'categoricals' and 'amount' are declared as inputs below, but
# xkeys lists only 'merchants'. If xkeys selects the inputs handed to the
# combiner, the forest is trained on the target-encoded merchant alone and the
# other two nodes are unused. Confirm against pipelines.CombiningPipelineNode
# whether xkeys should be ['merchants', 'categoricals', 'amount'].
# NOTE(review): the interarrival feature used in the manual feature matrix has
# no counterpart node here.
cpn_mega = CombiningPipelineNode({
    'labels' : SerialPipelineNode(get_thelabel, [PrefittedPipelineNode(labelizer)]),
    'merchants' : merchant_te,
    'categoricals': SerialPipelineNode(get_ttype, [OneHotEncoder(categories=t_types, sparse=False)]),
    'amount': SerialPipelineNode(get_amount, [RobustScaler()]),
}, xkeys=['merchants'], ykey='labels', combiner=RFC(), cast="float32", dfclass=cudf.DataFrame)


In [27]:
%%time
# Fit the whole pipeline directly on the raw training frame.
cpn_mega.fit(train)

CPU times: user 1.35 s, sys: 147 ms, total: 1.49 s
Wall time: 601 ms


RandomForestClassifier(split_criterion=0, handle=<cuml.raft.common.handle.Handle object at 0x7fc9366e1230>, verbose=4, output_type='input')

In [28]:
%%time
# Run the fitted pipeline on the held-out (post-cutoff) rows. The predictions
# are displayed but not scored against the true labels in this cell.
cpn_mega.predict(test)

CPU times: user 1.27 s, sys: 150 ms, total: 1.42 s
Wall time: 462 ms


0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
         ... 
751979    1.0
751980    1.0
751981    1.0
751982    1.0
751983    1.0
Length: 751984, dtype: float32

In [29]:
import pickle  # files written by cloudpickle are read back with the standard pickle.load
import cloudpickle

# Serialise with cloudpickle itself. The previous `from cloudpickle import pickle`
# only bound the standard-library pickle module, so the dump did not go through
# cloudpickle at all. The pipeline references helper functions defined in this
# notebook (get_thelabel, get_merchant_id, ...), which the standard pickler
# records by name only; cloudpickle serialises them by value so the file can be
# loaded from another process.
# The file is only written, so plain "wb" is sufficient.
with open("cpn.sav", "wb") as sf:
  cloudpickle.dump(cpn_mega, sf)