In [1]:
import numpy as np
import pandas as pd
# Load the pre-cleaned transaction sample (path is relative to the notebook's working directory).
df = pd.read_parquet(path="fraud-cleaned-sample.parquet")

In [2]:
# Time-based split point: the first 70% of the observed time span is used for
# training, the remainder for testing.
first, last = df['timestamp'].min(), df['timestamp'].max()
cutoff = first + 0.7 * (last - first)

In [3]:
# Training set: every transaction at or before the cutoff.
train = df.loc[df['timestamp'] <= cutoff]
train.shape[0]

1748016

In [4]:
# Test set: every transaction strictly after the cutoff (no overlap with train).
test = df.loc[df['timestamp'] > cutoff]
test.shape[0]

751984

In [5]:
# Inspect the available columns before choosing which ones to encode.
df.columns

Index(['timestamp', 'label', 'user_id', 'amount', 'merchant_id', 'trans_type',
       'foreign', 'interarrival'],
      dtype='object')

# Encoding categorical features

We'll start by encoding the categorical features. Within our data set, the categorical features are: 

- merchant id
- transaction type

We will use one-hot encoding for transaction types and target encoding for merchant IDs.

In [6]:
import sklearn
from sklearn.preprocessing import OneHotEncoder


# One-hot encode the transaction type. The category list is given explicitly so
# the output column order is fixed regardless of which values appear in `train`.
ohe = OneHotEncoder(
    sparse=False,
    categories=[['chip_and_pin', 'contactless', 'manual', 'online', 'swipe']],
)
ttype = ohe.fit_transform(train[['trans_type']])
ttype



array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.]])

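Now we can train a target encoder to replace each merchant ID with the proportion of that merchant's transactions that were legitimate. (The label encoder below maps `fraud` to 0 and `legitimate` to 1, so the per-merchant mean of the encoded label is the legitimate share.)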

In [7]:
from sklearn.preprocessing import LabelEncoder
from category_encoders.target_encoder import TargetEncoder

# Turn the string labels into integers to use as the target for target encoding.
labelizer = LabelEncoder().fit(['legitimate', 'fraud'])
encoded_labels = labelizer.transform(train['label'])

# Fit the target encoder on the training merchants only, then encode those
# same rows; `fr_rate` has a single 'merchant_id' column of encoded values.
encoder = TargetEncoder(cols='merchant_id')
fr_rate = encoder.fit(train['merchant_id'], encoded_labels).transform(train['merchant_id'])
fr_rate

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,merchant_id
44905449,0.997599
17805936,0.998369
8819718,0.998899
5773423,0.999423
19308700,0.999427
...,...
41838350,1.000000
280831,1.000000
23291970,0.999434
18344979,0.998876


## Encoding Numerical Features
Interarrival times are missing for the first transaction made by each user, since there is no previous transaction to measure from. We impute these missing values with the median interarrival time and then scale the result with a robust scaler.

We also use a robust scaler to transform the transaction amounts.

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

# Two-step treatment of interarrival time: fill gaps with the training median,
# then scale the filled column.
med_imputer = SimpleImputer(strategy="median")
robust_scale = RobustScaler()

imputed = med_imputer.fit_transform(train[['interarrival']])
arr_t = robust_scale.fit_transform(imputed)
arr_t

array([[-0.21220799],
       [-0.22727958],
       [ 0.05681989],
       ...,
       [ 5.53775433],
       [-0.02094951],
       [-0.0369254 ]])

In [9]:
# Scale transaction amounts with a separate robust scaler fitted on the
# training amounts.
amount_scaler = RobustScaler()
val_t = amount_scaler.fit(train[['amount']]).transform(train[['amount']])
val_t

array([[ 0.309371  ],
       [ 0.512837  ],
       [-0.7201541 ],
       ...,
       [-0.8305521 ],
       [-0.7528884 ],
       [ 0.03594349]], dtype=float32)

In [10]:
# Assemble the model-ready training frame from the encoded / scaled pieces.
# Every piece was computed from `train` in row order, so they are combined
# positionally on a fresh 0..n-1 index rather than on train's original index.
features = pd.DataFrame(index=pd.RangeIndex(len(train)))

# The scalers return (n, 1) arrays; flatten them so each is a plain 1-D column.
features["amount"] = val_t.ravel()
features["arrival"] = arr_t.ravel()

# Drop the original (train) index so the values line up by position.
features["prop"] = fr_rate["merchant_id"].reset_index(drop=True)

# One column per transaction type, in the same order as the encoder's categories.
features[["chip_and_pin", "contactless", "manual", "online", "swipe"]] = pd.DataFrame(ttype)

# Use a scalar column key here: assigning a 1-D Series through a one-element
# list key (`features[["label"]] = ...`) is rejected by recent pandas with
# "Columns must be same length as key".
features["label"] = train["label"].reset_index(drop=True)
features

Unnamed: 0,amount,arrival,prop,chip_and_pin,contactless,manual,online,swipe,label
0,0.309371,-0.212208,0.997599,1.0,0.0,0.0,0.0,0.0,legitimate
1,0.512837,-0.227280,0.998369,1.0,0.0,0.0,0.0,0.0,legitimate
2,-0.720154,0.056820,0.998899,0.0,1.0,0.0,0.0,0.0,legitimate
3,-0.887677,-0.190957,0.999423,0.0,0.0,0.0,1.0,0.0,legitimate
4,0.378049,0.003768,0.999427,1.0,0.0,0.0,0.0,0.0,legitimate
...,...,...,...,...,...,...,...,...,...
1748011,-0.962131,6.338809,1.000000,1.0,0.0,0.0,0.0,0.0,legitimate
1748012,1.417202,-0.210550,1.000000,0.0,0.0,0.0,0.0,1.0,legitimate
1748013,-0.830552,5.537754,0.999434,1.0,0.0,0.0,0.0,0.0,legitimate
1748014,-0.752888,-0.020950,0.998876,0.0,0.0,0.0,1.0,0.0,legitimate


# Visualising the data

### Principal Component Analysis (PCA)

In [11]:
from sklearn.decomposition import PCA
# Project onto the first two principal components so the data can be plotted in 2-D.
pca_proj = PCA(n_components=2)

In [12]:
%%time
pca_a=pca_proj.fit_transform(features.drop(columns = 'label'))

CPU times: user 4.45 s, sys: 6.53 s, total: 11 s
Wall time: 3.15 s


In [13]:
# Show the projected coordinates: one row per training transaction, two columns.
pca_a

array([[-0.55071369, -1.20855529],
       [-0.34726091, -1.22380468],
       [-1.57998956, -0.93828191],
       ...,
       [-1.68554555,  4.54236256],
       [-1.61285411, -1.01871682],
       [-0.82397531, -1.03269681]])

In [14]:
# Pair each projected point with its label for plotting; rows with any missing
# value are dropped.
pca_vis = pd.DataFrame(
    {
        "x": pca_a[:, 0],
        "y": pca_a[:, 1],
        "label": features['label'],
    }
).dropna()
pca_vis

Unnamed: 0,x,y,label
0,-0.550714,-1.208555,legitimate
1,-0.347261,-1.223805,legitimate
2,-1.579990,-0.938282,legitimate
3,-1.747793,-1.188601,legitimate
4,-0.481844,-0.992637,legitimate
...,...,...,...
1748011,-1.816415,5.343528,legitimate
1748012,0.557089,-1.208331,legitimate
1748013,-1.685546,4.542363,legitimate
1748014,-1.612854,-1.018717,legitimate


In [15]:
# Draw the same number of points from each class so both are visible in the
# scatter plot. The sample is seeded so the chart is identical on every
# Restart & Run All.
pca_vis_sample = pd.concat(
    [
        pca_vis[pca_vis["label"] == label].sample(2500, random_state=404)
        for label in ["legitimate", "fraud"]
    ]
)

In [16]:
import altair as alt
# Interactive scatter plot of the class-balanced PCA sample, coloured by label.
(
    alt.Chart(pca_vis_sample)
    .mark_point(opacity=0.3)
    .encode(x="x:Q", y="y:Q", color="label")
    .properties(width=500, height=500)
    .interactive()
)

### t-distributed stochastic neighbor embedding (t-SNE)

In [17]:
from sklearn.manifold import TSNE

# t-SNE is stochastic; fix the seed so the embedding (and the plots built from
# it) can be reproduced on a fresh kernel.
tsne = TSNE(random_state=404)

In [18]:
%%time

feature_sample = pd.concat([features[features["label"] == label].sample(50000, replace=True) for label in ["legitimate", "fraud"]]).sample(50000)
tsne_a = tsne.fit_transform(feature_sample.drop(columns=['label']))

CPU times: user 6min 26s, sys: 5.67 s, total: 6min 31s
Wall time: 2min 12s


In [19]:
# Wrap the embedding in a frame and attach labels by position; the sample's
# original index is discarded so it lines up with the 0..n-1 embedding rows.
tsne_df = pd.DataFrame(data=tsne_a, columns=["x", "y"])
tsne_df["label"] = feature_sample["label"].reset_index(drop=True)

In [20]:
# Split the embedded sample by class and count the fraud rows.
fraud_cases = tsne_df.loc[tsne_df["label"] == "fraud"]
legit_cases = tsne_df.loc[tsne_df["label"] == "legitimate"]

fraud_case_count = fraud_cases.shape[0]

In [21]:
# Show how many fraud rows ended up in the embedded sample.
fraud_case_count

24926

In [22]:
alt.data_transformers.enable('default')
# Plot a 5,000-point subsample of the t-SNE embedding; the subsample is seeded
# so the same points are drawn each time the notebook is re-run.
alt.Chart(tsne_df.sample(5000, random_state=404)).mark_point(opacity=0.3).encode(
    x="x:Q",
    y="y:Q",
    color="label"
).properties(width=500, height=500).interactive()

### Uniform Manifold Approximation and Projection (UMAP)

In [23]:
import umap
# UMAP reducer with default settings. Despite the name, it is not fitted yet:
# fitting happens in the next cell via fit_transform.
# NOTE(review): no random_state is set, so the embedding will presumably differ
# between runs — confirm whether reproducibility matters here.
trained_UMAP = umap.UMAP()

In [24]:
%%time
umap_a = trained_UMAP.fit_transform(feature_sample.drop(columns=['label']))

CPU times: user 14min 21s, sys: 7min 14s, total: 21min 35s
Wall time: 5min 31s


In [25]:
# Wrap the embedding in a frame and attach labels by position; the sample's
# original index is discarded so it lines up with the 0..n-1 embedding rows.
umap_df = pd.DataFrame(data=umap_a, columns=["x", "y"])
umap_df["label"] = feature_sample["label"].reset_index(drop=True)

In [26]:
alt.data_transformers.enable('default')
# Plot a 5,000-point subsample of the UMAP embedding; the subsample is seeded
# so the same points are drawn each time the notebook is re-run.
alt.Chart(umap_df.sample(5000, random_state=404)).mark_point(opacity=0.3).encode(
    x="x:Q",
    y="y:Q",
    color="label"
).properties(width=500, height=500).interactive()

# Training a model

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow forest. class_weight="balanced_subsample" is set because the
# labels are overwhelmingly 'legitimate'.
rfc = RandomForestClassifier(
    n_estimators=16,
    max_depth=8,
    class_weight="balanced_subsample",
    random_state=404,
)



In [28]:
%%time
rfc.fit(features.drop(columns='label'), train["label"])

CPU times: user 27.4 s, sys: 0 ns, total: 27.4 s
Wall time: 27.4 s


RandomForestClassifier(class_weight='balanced_subsample', max_depth=8,
                       n_estimators=16, random_state=404)

In [29]:
%%time
rfc.predict(features.drop(columns='label'))

CPU times: user 1.46 s, sys: 0 ns, total: 1.46 s
Wall time: 1.46 s


array(['legitimate', 'legitimate', 'legitimate', ..., 'legitimate',
       'legitimate', 'legitimate'], dtype=object)

## Putting it together with pipelines

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Building blocks expressed as (name, transformer, columns) triples for a
# ColumnTransformer.
# NOTE(review): apart from `tg_encoder`, which the next cell rebuilds, none of
# the names defined here are referenced by the later cells visible in this
# notebook — confirm whether they are still needed.
label_encoder = (
    'label_encoder',
    OneHotEncoder(categories=[[ 'fraud', 'legitimate']], drop='first', sparse=False),
    ['label'],
)
merchant_passthrough = ('merchant_passthrough', 'passthrough', ['merchant_id'])
tg_encoder = ('merchant_encoder', TargetEncoder(cols='merchant_id'), ['merchant_id'])

ct = ColumnTransformer(transformers=[label_encoder, merchant_passthrough])

smol_ohe = OneHotEncoder(categories=[[ 'fraud', 'legitimate']], drop='first', sparse=False)


In [31]:
# --- Target: binary-encode the label column (a single 0/1 output column) ---
lbl_scaler = (
    'lable_encoder',
    OneHotEncoder(categories=[[ 'fraud', 'legitimate']], drop='first', sparse=False),
    ['label'],
)
lbl_xform = ColumnTransformer([lbl_scaler]).fit(train)

# --- Categorical features ---
tt_xform = (
    'onehot',
    OneHotEncoder(
        categories=[['chip_and_pin', 'contactless', 'manual', 'online', 'swipe']],
        handle_unknown='ignore',
        sparse=False,
    ),
    ['trans_type'],
)
tg_encoder = ('merchant_encoder', TargetEncoder(cols='merchant_id'), ['merchant_id'])
xform_steps = [tt_xform, tg_encoder]

# --- Numerical features ---
impute_and_scale = Pipeline(
    [
        ('median_imputer', SimpleImputer(strategy="median")),
        ('interarrival_scaler', RobustScaler()),
    ]
)
ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])
# Note: this rebinds `amount_scaler`, which an earlier cell used for a fitted
# RobustScaler; from here on it is a (name, transformer, columns) triple.
amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])
scale_steps = [ia_scaler, amount_scaler]

# Output column order: interarrival, amount, transaction-type one-hots, merchant encoding.
all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))

In [32]:
# End-to-end pipeline: raw transaction columns in, class prediction out. The
# classifier uses the same hyperparameters as the standalone forest above.
feat_pipeline = Pipeline(
    steps=[
        ('feature_extraction', all_xforms),
        (
            'model training',
            RandomForestClassifier(
                n_estimators=16,
                max_depth=8,
                class_weight="balanced_subsample",
                random_state=404,
            ),
        ),
    ]
)

In [33]:
%%time
feat_pipeline.fit(train, y =lbl_xform.transform(train) )

  elif pd.api.types.is_categorical(cols):
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


CPU times: user 27.5 s, sys: 0 ns, total: 27.5 s
Wall time: 27.5 s


Pipeline(steps=[('feature_extraction',
                 ColumnTransformer(transformers=[('interarrival_scaler',
                                                  Pipeline(steps=[('median_imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('interarrival_scaler',
                                                                   RobustScaler())]),
                                                  ['interarrival']),
                                                 ('amount_scaler',
                                                  RobustScaler(), ['amount']),
                                                 ('onehot',
                                                  OneHotEncoder(categories=[['chip_and_pin',
                                                                             'contactless',
                                                                 

In [34]:
%%time
feat_pipeline.predict(train)

CPU times: user 1.87 s, sys: 0 ns, total: 1.87 s
Wall time: 1.87 s


array([1., 1., 1., ..., 1., 1., 1.])

In [35]:
%%time
feat_pipeline.predict(test)

CPU times: user 882 ms, sys: 0 ns, total: 882 ms
Wall time: 882 ms


array([1., 1., 1., ..., 1., 1., 1.])

In [36]:
# Import the pickle module directly; pulling it out of cloudpickle's namespace
# relied on an undocumented re-export.
import pickle

# Persist the fitted pipeline so it can be reloaded for scoring later. The file
# is only written here, so plain "wb" is sufficient.
# Only unpickle this file from a trusted location: loading a pickle can run
# arbitrary code.
with open("cpu_pipeline.sav", "wb") as sf:
    pickle.dump(feat_pipeline, sf)