In [1]:
import pandas as pd
import numpy as np

We'll start by reading in our fraud dataset and looking at the column names:

In [2]:
df = pd.read_csv("fraud.csv")
df.columns

Index(['timestamp', 'label', 'user_id', 'amount', 'merchant_id', 'trans_type',
       'foreign'],
      dtype='object')

# Transaction type distribution

In [3]:
pt = pd.pivot_table(df[["label", "trans_type", "timestamp"]], 
                    index=["label", "trans_type"], aggfunc=len)

pt.columns = ['count']
gdf = pd.DataFrame(pt.to_records())

gdf['total'] = gdf.groupby('label')['count'].transform(np.sum)
gdf['pctage'] = gdf['count'] / gdf['total']

gdf

Unnamed: 0,label,trans_type,count,total,pctage
0,fraud,chip_and_pin,21007,412839,0.050884
1,fraud,contactless,20678,412839,0.050087
2,fraud,manual,144390,412839,0.349749
3,fraud,online,206163,412839,0.499379
4,fraud,swipe,20601,412839,0.049901
5,legitimate,chip_and_pin,4507746,22533292,0.200048
6,legitimate,contactless,5631831,22533292,0.249934
7,legitimate,manual,1128292,22533292,0.050072
8,legitimate,online,7886722,22533292,0.350003
9,legitimate,swipe,3378701,22533292,0.149943


In [4]:
import altair as alt

alt.Chart(gdf).mark_bar().encode(
    alt.Y('pctage:Q', axis=alt.Axis(format='.0%')), column='trans_type', x="label", color='label'
)

# Foreign transaction distribution

In [5]:
pt = pd.pivot_table(df[["label", "foreign", "timestamp"]], 
                    index=["label", "foreign"], aggfunc=len)

pt.columns = ['count']
gdf = pd.DataFrame(pt.to_records())

gdf['total'] = gdf.groupby('label')['count'].transform(np.sum)
gdf['pctage'] = gdf['count'] / gdf['total']

gdf

alt.Chart(gdf).mark_bar().encode(
    alt.Y('pctage:Q', axis=alt.Axis(format='.0%')), column='foreign', x="label", color='label'
)

# Transaction amount distribution

In [26]:
%%time
qs = df[['label','amount']].groupby('label').quantile(q=[0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99])
qs

CPU times: user 1min 21s, sys: 2.47 s, total: 1min 24s
Wall time: 1min 59s


Unnamed: 0_level_0,Unnamed: 1_level_0,amount
label,Unnamed: 1_level_1,Unnamed: 2_level_1
fraud,0.01,5.35
fraud,0.05,6.74
fraud,0.1,8.98
fraud,0.25,14.71
fraud,0.5,22.61
fraud,0.75,28.17
fraud,0.9,35.33
fraud,0.95,40.69
fraud,0.99,53.04
legitimate,0.01,3.31


In [32]:
qdf = pd.DataFrame(qs.to_records())
alt.Chart(qdf).mark_line(interpolate="monotone").encode(
    alt.Y("amount", axis=alt.Axis(title='transaction amounts (log scale)'), scale=alt.Scale(type='log')), 
    alt.X("level_0", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')), 
    color="label"
)

# Interarrival times

In [59]:
trans_times = df[['timestamp', 'label', 'user_id']].sort_values(['user_id', 'timestamp'])
trans_times['date'] = pd.to_datetime(trans_times['timestamp'], unit='s')

In [60]:
trans_times.groupby(['user_id', 'label']).diff()

Unnamed: 0,timestamp,date
29775,,NaT
36379,9426.0,02:37:06
42580,8769.0,02:26:09
48212,8490.0,02:21:30
54290,8942.0,02:29:02
...,...,...
22902733,44815.0,12:26:55
22910628,11438.0,03:10:38
22917465,9991.0,02:46:31
22924314,9837.0,02:43:57


In [48]:
trans_times

Unnamed: 0,timestamp,label,user_id,date
0,1581630425,legitimate,956,2020-02-13 21:47:05
1,1581630425,legitimate,2776,2020-02-13 21:47:05
2,1581630426,legitimate,8467,2020-02-13 21:47:06
3,1581630427,legitimate,6188,2020-02-13 21:47:07
4,1581630431,legitimate,6275,2020-02-13 21:47:11
...,...,...,...,...
22946126,1615038912,legitimate,4652,2021-03-06 13:55:12
22946127,1615038912,legitimate,9846,2021-03-06 13:55:12
22946128,1615038917,legitimate,7089,2021-03-06 13:55:17
22946129,1615038918,legitimate,4275,2021-03-06 13:55:18


In [41]:
trans_times['timestamp'] = pd.to_datetime(trans_times['timestamp'], unit='s')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
trans_times

Unnamed: 0,timestamp,label,user_id
0,1970-01-01 00:00:01.581630425,legitimate,956
1,1970-01-01 00:00:01.581630425,legitimate,2776
2,1970-01-01 00:00:01.581630426,legitimate,8467
3,1970-01-01 00:00:01.581630427,legitimate,6188
4,1970-01-01 00:00:01.581630431,legitimate,6275
...,...,...,...
22946126,1970-01-01 00:00:01.615038912,legitimate,4652
22946127,1970-01-01 00:00:01.615038912,legitimate,9846
22946128,1970-01-01 00:00:01.615038917,legitimate,7089
22946129,1970-01-01 00:00:01.615038918,legitimate,4275


In [38]:
def diff(x):
    return x[1] - x[0]

interarrivals.apply(diff)

KeyError: 0

In [None]:
de