In [1]:
import altair as alt
import numpy as np
import pandas as pd

We'll start by reading in our fraud dataset and looking at the column names:

In [2]:
# Read the fraud sample from its parquet file, then list the column names so
# the fields used by the cells below are visible up front.
data_path = "fraud-cleaned-sample.parquet"
df = pd.read_parquet(data_path)
df.columns

Index(['timestamp', 'label', 'user_id', 'amount', 'merchant_id', 'trans_type',
       'foreign', 'interarrival'],
      dtype='object')

# Transaction type distribution

In [3]:
# Count transactions for every (label, trans_type) pair. `timestamp` is only
# carried along so that `len` has a value column to count rows on; the
# resulting single column is then renamed to 'count'.
pt = pd.pivot_table(df[["label", "trans_type", "timestamp"]],
                    index=["label", "trans_type"], aggfunc=len)
pt.columns = ['count']

# Flatten the (label, trans_type) index back into ordinary columns.
gdf = pt.reset_index()

# Share of each transaction type *within* its label: divide each count by the
# total number of transactions carrying the same label.
# The string alias "sum" is used instead of np.sum because recent pandas
# releases emit a FutureWarning when a NumPy callable is passed to
# groupby(...).transform and recommend the alias.
gdf['total'] = gdf.groupby('label')['count'].transform('sum')
gdf['percentage'] = gdf['count'] / gdf['total']

gdf

Unnamed: 0,label,trans_type,count,total,percentage
0,fraud,chip_and_pin,2293,46396,0.049422
1,fraud,contactless,2346,46396,0.050565
2,fraud,manual,16124,46396,0.34753
3,fraud,online,23335,46396,0.502953
4,fraud,swipe,2298,46396,0.04953
5,legitimate,chip_and_pin,491303,2453604,0.200237
6,legitimate,contactless,613165,2453604,0.249904
7,legitimate,manual,122175,2453604,0.049794
8,legitimate,online,858969,2453604,0.350085
9,legitimate,swipe,367992,2453604,0.14998


In [4]:
# Grouped bar chart of the per-label transaction-type shares computed above:
# one facet (column) per transaction type, one bar per label, with the y axis
# formatted as a whole-number percentage.
alt.Chart(gdf).mark_bar().encode(
    x='label',
    y=alt.Y('percentage:Q', axis=alt.Axis(format='.0%')),
    color='label',
    column='trans_type',
)

# Foreign transaction distribution

In [5]:
# Count transactions for every (label, foreign) pair. `timestamp` is only
# carried along so that `len` has a value column to count rows on; the
# resulting single column is then renamed to 'count'.
pt = pd.pivot_table(df[["label", "foreign", "timestamp"]],
                    index=["label", "foreign"], aggfunc=len)
pt.columns = ['count']

# Flatten the (label, foreign) index back into ordinary columns.
gdf = pt.reset_index()

# Share of foreign / non-foreign transactions *within* each label.
# The string alias "sum" is used instead of np.sum because recent pandas
# releases emit a FutureWarning when a NumPy callable is passed to
# groupby(...).transform and recommend the alias.
# The column is called 'percentage' to match the transaction-type table.
gdf['total'] = gdf.groupby('label')['count'].transform('sum')
gdf['percentage'] = gdf['count'] / gdf['total']

# Only the last expression of a cell is rendered automatically, so the table
# has to be displayed explicitly when a chart follows it in the same cell.
display(gdf)

# One facet (column) per value of `foreign`, one bar per label, with the y
# axis formatted as a whole-number percentage.
alt.Chart(gdf).mark_bar().encode(
    x='label',
    y=alt.Y('percentage:Q', axis=alt.Axis(format='.0%')),
    color='label',
    column='foreign',
)

# Transaction amount distribution

In [6]:
%%time
qs = df[['label','amount']].groupby('label').quantile(q=[0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99])
qs

CPU times: user 3.27 s, sys: 91.9 ms, total: 3.36 s
Wall time: 3.38 s


Unnamed: 0_level_0,Unnamed: 1_level_0,amount
label,Unnamed: 1_level_1,Unnamed: 2_level_1
fraud,0.01,5.33
fraud,0.05,6.72
fraud,0.1,8.89
fraud,0.25,14.66
fraud,0.5,22.530001
fraud,0.75,28.0
fraud,0.9,35.209999
fraud,0.95,40.652501
fraud,0.99,52.810001
legitimate,0.01,3.31


In [7]:
# Flatten the quantile table for plotting. The quantile level of the index has
# no name, and the placeholder name it receives when flattened is not stable
# across pandas versions (older to_records() produced "level_0", newer
# releases number placeholders by position and produce "level_1"). Naming the
# level explicitly keeps the chart's field reference valid either way.
qdf = qs.rename_axis(index=['label', 'quantile']).reset_index()

# Empirical distribution of amounts per label: quantile level on x,
# amount on a log-scaled y axis, one line per label.
alt.Chart(qdf).mark_line(interpolate="monotone").encode(
    x=alt.X("quantile:Q", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    y=alt.Y("amount", axis=alt.Axis(title='transaction amounts (log scale)'), scale=alt.Scale(type='log')),
    color="label"
)

# Interarrival times

In [13]:
# Balance the two classes: keep every fraud row and draw an equally sized
# random sample of legitimate rows. A fixed random_state makes the sample, and
# everything derived from it, identical on every Restart & Run All.
fraudsamp = df[df["label"] == "fraud"].copy()
legitsamp = df[df["label"] == "legitimate"].sample(len(fraudsamp), random_state=42).copy()

# Dense percentile rank of each interarrival time, computed separately within
# each class so the two curves share the same 0-1 x range.
fraudsamp['irank'] = fraudsamp['interarrival'].rank(pct=True, method="dense")
legitsamp['irank'] = legitsamp['interarrival'].rank(pct=True, method="dense")

# One row per distinct (label, interarrival, irank) combination, with the
# number of transactions sharing it in 'count'.
qdf = pd.concat([
    samp.groupby(['label', 'interarrival', 'irank']).size()
    for samp in (fraudsamp, legitsamp)
]).reset_index(name='count')

# Keep strictly positive interarrival times only (presumably because the
# chart that follows uses a log-scaled axis, which cannot show values <= 0).
qdf = qdf[qdf['interarrival'] > 0]

In [14]:
# Plot at most 5000 points (presumably chosen to stay within Altair's default
# row limit for embedded data). The sample size is clamped to the table length
# because DataFrame.sample raises when asked for more rows than exist, and
# random_state is fixed so the chart is identical on every run.
max_chart_rows = 5000
chart_rows = qdf.sample(min(max_chart_rows, len(qdf)), random_state=42)

# Empirical distribution of interarrival times per label: percentile rank on
# x, interarrival time on a log-scaled y axis, with pan/zoom enabled.
alt.Chart(chart_rows).mark_line().encode(
    x=alt.X("irank", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    y=alt.Y("interarrival", axis=alt.Axis(title='interarrival time'), scale=alt.Scale(type='log')),
    color="label"
).interactive()