In [1]:
import pandas as pd
import numpy as np

We'll start by reading in our fraud dataset and looking at the column names:

In [2]:
df = pd.read_csv("fraud.csv")
df.columns

Index(['timestamp', 'label', 'user_id', 'amount', 'merchant_id', 'trans_type',
       'foreign'],
      dtype='object')

# Transaction type distribution

In [3]:
pt = pd.pivot_table(df[["label", "trans_type", "timestamp"]], 
                    index=["label", "trans_type"], aggfunc=len)

pt.columns = ['count']
gdf = pd.DataFrame(pt.to_records())

gdf['total'] = gdf.groupby('label')['count'].transform(np.sum)
gdf['pctage'] = gdf['count'] / gdf['total']

gdf

Unnamed: 0,label,trans_type,count,total,pctage
0,fraud,chip_and_pin,21007,412839,0.050884
1,fraud,contactless,20678,412839,0.050087
2,fraud,manual,144390,412839,0.349749
3,fraud,online,206163,412839,0.499379
4,fraud,swipe,20601,412839,0.049901
5,legitimate,chip_and_pin,4507746,22533292,0.200048
6,legitimate,contactless,5631831,22533292,0.249934
7,legitimate,manual,1128292,22533292,0.050072
8,legitimate,online,7886722,22533292,0.350003
9,legitimate,swipe,3378701,22533292,0.149943


In [4]:
import altair as alt

alt.Chart(gdf).mark_bar().encode(
    alt.Y('pctage:Q', axis=alt.Axis(format='.0%')), column='trans_type', x="label", color='label'
)

# Foreign transaction distribution

In [5]:
pt = pd.pivot_table(df[["label", "foreign", "timestamp"]], 
                    index=["label", "foreign"], aggfunc=len)

pt.columns = ['count']
gdf = pd.DataFrame(pt.to_records())

gdf['total'] = gdf.groupby('label')['count'].transform(np.sum)
gdf['pctage'] = gdf['count'] / gdf['total']

gdf

alt.Chart(gdf).mark_bar().encode(
    alt.Y('pctage:Q', axis=alt.Axis(format='.0%')), column='foreign', x="label", color='label'
)

# Transaction amount distribution

In [26]:
%%time
qs = df[['label','amount']].groupby('label').quantile(q=[0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99])
qs

CPU times: user 1min 21s, sys: 2.47 s, total: 1min 24s
Wall time: 1min 59s


Unnamed: 0_level_0,Unnamed: 1_level_0,amount
label,Unnamed: 1_level_1,Unnamed: 2_level_1
fraud,0.01,5.35
fraud,0.05,6.74
fraud,0.1,8.98
fraud,0.25,14.71
fraud,0.5,22.61
fraud,0.75,28.17
fraud,0.9,35.33
fraud,0.95,40.69
fraud,0.99,53.04
legitimate,0.01,3.31


In [99]:
qdf = pd.DataFrame(qs.to_records())
alt.Chart(qdf).mark_line(interpolate="monotone").encode(
    alt.Y("amount", axis=alt.Axis(title='transaction amounts (log scale)'), scale=alt.Scale(type='log')), 
    alt.X("level_0", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')), 
    color="label"
)

# Interarrival times

In [59]:
trans_times = df[['timestamp', 'label', 'user_id']].sort_values(['user_id', 'timestamp'])
trans_times['date'] = pd.to_datetime(trans_times['timestamp'], unit='s')

In [62]:
trans_times.groupby(['user_id', 'label']).diff()

Unnamed: 0,timestamp,date
29775,,NaT
36379,9426.0,02:37:06
42580,8769.0,02:26:09
48212,8490.0,02:21:30
54290,8942.0,02:29:02
...,...,...
22902733,44815.0,12:26:55
22910628,11438.0,03:10:38
22917465,9991.0,02:46:31
22924314,9837.0,02:43:57


In [88]:
interarrivals = trans_times.set_index(['user_id', 'label', 'date']).diff().reset_index()
interarrivals.columns = ['user_id', 'label', 'date', 'interarrival']

In [208]:
fraudsamp = interarrivals[(interarrivals['label'] == 'fraud') & interarrivals['interarrival'] > 0].sample(100000)
legitsamp = interarrivals[(interarrivals['label'] == 'legitimate') & interarrivals['interarrival'] > 0].sample(100000)




In [209]:
fraudsamp['irank'] = fraudsamp['interarrival'].rank(pct=True, method="dense")
legitsamp['irank'] = legitsamp['interarrival'].rank(pct=True, method="dense")
qdf = pd.concat([fraudsamp.groupby(['label', 'interarrival', 'irank']).size(), legitsamp.groupby(['label', 'interarrival', 'irank']).size()])

In [223]:
qdf = pd.DataFrame(pd.DataFrame(qdf[qdf["interarrival"] > 0]).to_records())

In [224]:
qdf

Unnamed: 0,index,label,interarrival,irank,0
0,0,fraud,1.0,0.000113,5
1,1,fraud,2.0,0.000227,11
2,2,fraud,3.0,0.000340,12
3,3,fraud,4.0,0.000453,9
4,4,fraud,5.0,0.000567,7
...,...,...,...,...,...
32606,32657,legitimate,72171.0,0.999832,1
32607,32658,legitimate,73021.0,0.999874,1
32608,32659,legitimate,73036.0,0.999916,1
32609,32660,legitimate,73449.0,0.999958,1


In [212]:
[df[(df['irank'] < (0.1 * (i + 1))) & (df['irank'] >= (0.1 * i))].sample(25) for i in range(10) for df in [fraudsamp]]


[          user_id  label                date  interarrival     irank
 20574017     8960  fraud 2020-04-05 01:10:10         722.0  0.081822
 2937657      1279  fraud 2021-03-06 11:46:04         722.0  0.081822
 11319073     4926  fraud 2020-11-10 02:14:14         722.0  0.081822
 10241306     4456  fraud 2020-07-18 01:45:44         725.0  0.082162
 13845576     6033  fraud 2020-08-04 19:22:19         724.0  0.082049
 5112658      2225  fraud 2020-05-26 23:12:27         738.0  0.083636
 20033326     8720  fraud 2020-06-01 03:32:31         721.0  0.081709
 12545004     5458  fraud 2020-06-24 16:16:44         721.0  0.081709
 8025515      3504  fraud 2020-10-14 20:16:10         722.0  0.081822
 14891877     6486  fraud 2020-05-23 06:56:14         372.0  0.042158
 19595683     8530  fraud 2021-02-04 11:06:24         722.0  0.081822
 21322526     9280  fraud 2021-01-22 11:41:46         724.0  0.082049
 3861724      1680  fraud 2021-01-16 15:27:34         729.0  0.082616
 3425407      1496  

In [228]:
alt.Chart(qdf.sample(5000)).mark_line().encode(
    alt.Y("interarrival", axis=alt.Axis(title='interarrival time'), scale=alt.Scale(type='log')), 
    alt.X("irank", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')), 
    color="label"
).interactive()

In [222]:
qdf.sample(5000).sort_values(['interarrival'])

Unnamed: 0,label,interarrival,irank,0
8824,legitimate,-33406263.0,0.000042,1
8827,legitimate,-33402617.0,0.000168,1
8831,legitimate,-33399786.0,0.000336,1
8839,legitimate,-33388210.0,0.000671,1
8846,legitimate,-33381269.0,0.000965,1
...,...,...,...,...
32619,legitimate,66511.0,0.998238,1
32638,legitimate,67209.0,0.999035,1
32644,legitimate,67498.0,0.999287,1
32649,legitimate,69068.0,0.999497,1
