In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import sklearn as sci
%matplotlib inline
%load_ext autoreload
%autoreload

df_ori = pd.read_csv('adyen-dataset.csv')
from lib.sampling import subsampling

# Subsample non_fraudulent transactions records so we have balanced dataset
df_fraudulent = df_ori[df_ori['has_fraudulent_dispute'] == True]
df_non_fraudulent = df_ori[df_ori['has_fraudulent_dispute'] == False]
subsample_index= subsampling(df_non_fraudulent.index, len(df_fraudulent))
df_non_fraudulent_subsample  = df_non_fraudulent.loc[subsample_index,:]
df_sample = pd.concat([df_non_fraudulent_subsample,df_fraudulent], axis=0)
df_sample.shape

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


(21432, 20)

In [36]:
df_sample.head()

Unnamed: 0,psp_reference,merchant,card_scheme,year,hour_of_day,minute_of_hour,day_of_year,is_credit,eur_amount,ip_country,issuing_country,device_type,ip_address,email_address,card_number,shopper_interaction,zip_code,card_bin,has_fraudulent_dispute,is_refused_by_adyen
100589,66904306057,Merchant A,Amex,2021,14,33,221,True,119.63,BR,BR,iOS,YWjJwmlGJPixXGE7ZiOc4Q,vsN6dDbqNIUqyUeIbhBITg,RjBk2c9SfCBlEj6aLUmRhg,Ecommerce,BZD,4571,False,True
132541,62106400109,Merchant B,Visa,2021,17,38,246,True,106.18,ZW,ZW,Linux,Bhh4hDFEn9SUmPl7DewmQA,WUC-fSELqJQytILNQ_0t4w,pba__PLPD4rLYK47CM5FzQ,Ecommerce,DB,4236,False,False
128249,42993811077,Merchant D,MasterCard,2021,9,57,57,True,87.62,BR,GR,Linux,226YyIHbGmxQYR0GmCHGFw,jXsu5-CEfb-MIUbeNmwSjg,cwzUvZ9jRQVfj6qqmvOsZw,POS,BZD,4916,False,False
92433,74853496154,Merchant A,Other,2021,17,21,175,True,18.64,BR,BR,MacOS,,fDVfUMF3SCYrOilj9x_ZIw,FvjlLMaOnhk88dByaG3Geg,Ecommerce,BZD,4622,False,False
132819,68447486793,Merchant D,Other,2021,7,28,10,True,120.65,BR,BR,Other,,oPosT9sFfAE0yPXqJuKnng,O8Nz0WMnwQsdxSX68o7kKg,Ecommerce,BZD,4891,False,False


In [37]:
total_fraud = df_sample[df_sample['has_fraudulent_dispute']==1].shape[0]
total_not_fraud = df_sample[df_sample['has_fraudulent_dispute']==0].shape[0]
total_fraud_na_ip = df_sample[df_sample['ip_address'].isna() & df_sample['has_fraudulent_dispute']==1].shape[0]
total_not_fraud_na_ip = df_sample[df_sample['ip_address'].isna() & df_sample['has_fraudulent_dispute']==0].shape[0]
p_likelihood_fraud = total_fraud_na_ip/total_fraud
p_likelihood_not_fraud = total_not_fraud_na_ip/total_not_fraud
p_prior_no_fraud = 0.5
p_prior_fraud = 0.5
p_posterio = p_likelihood_fraud * p_prior_fraud/(p_likelihood_fraud * p_prior_fraud + p_likelihood_not_fraud * p_prior_no_fraud)
p_posterio

0.0998040313549832

In [38]:
df_sample = df_sample.fillna("anonymous")

In [59]:
edges1 = df_sample[df_sample["has_fraudulent_dispute"]==1].groupby(["ip_address","card_number"], as_index=False).size()
edges1['type_in'] = 'ip'
edges1['type_out'] = 'card'
edges1.columns=["point_in","point_out","fraud","type_in","type_out"]
edges1

Unnamed: 0,point_in,point_out,fraud,type_in,type_out
0,--2E0toluie1TqMY-oXrQg,GOgdL2ymYoPD99nIT1zbkA,1,ip,card
1,--2E0toluie1TqMY-oXrQg,vtGsRvD7aYpa-LJdW8sbyg,5,ip,card
2,-0jK02r79jRQ2Nzmcbd51Q,GaTo2bLUxy-n939nv2g-ag,1,ip,card
3,-0mAboSyixPty7xuAAnWTA,E3wWIIUb7J4HBwfTbCTB2w,1,ip,card
4,-1I5QWw03mL1yTPQQfQd1w,gZXwSBiQkKyDo3fTH-tk0w,1,ip,card
...,...,...,...,...,...
4305,zts5U45eQW0WfrybeY9Ugg,SswcaXt2_RSpky83El-p2Q,1,ip,card
4306,zts5U45eQW0WfrybeY9Ugg,noc__i9FYFTvMeASho4qhw,1,ip,card
4307,zts5U45eQW0WfrybeY9Ugg,vJRsX04omE6ga--oPTv6Ww,1,ip,card
4308,zts5U45eQW0WfrybeY9Ugg,wlLqoBvhXhx-rIG_vmyZiQ,1,ip,card


In [58]:
edges2 = df_sample[df_sample["has_fraudulent_dispute"]==1].groupby(["ip_address","email_address"],as_index=False).size()
edges2['type_in'] = 'ip'
edges2['type_out'] = 'email'
edges2.columns=["point_in","point_out","fraud","type_in","type_out"]
edges2

Unnamed: 0,point_in,point_out,fraud,type_in,type_out
0,--2E0toluie1TqMY-oXrQg,eErppXRkq-p_C8ginsSiOQ,6,ip,email
1,-0jK02r79jRQ2Nzmcbd51Q,anonymous,1,ip,email
2,-0mAboSyixPty7xuAAnWTA,wlpo3xiAoV0KC0ndCnyiQg,1,ip,email
3,-1I5QWw03mL1yTPQQfQd1w,lBEp5B7QqM5AqZcdXdBt-g,1,ip,email
4,-1I5QWw03mL1yTPQQfQd1w,qdCicvQuh_YNyVXyzeiXCA,1,ip,email
...,...,...,...,...,...
4310,zts5U45eQW0WfrybeY9Ugg,LVh-jiD2Dogmy4YeqBsiNw,1,ip,email
4311,zts5U45eQW0WfrybeY9Ugg,SLJa4df8bZQtuJMF2Y4NgA,1,ip,email
4312,zts5U45eQW0WfrybeY9Ugg,anonymous,4,ip,email
4313,zts5U45eQW0WfrybeY9Ugg,oLqbhlpxM8EdllHizNnIlQ,17,ip,email


In [48]:
edges3 =df_sample[df_sample["has_fraudulent_dispute"]==1].groupby(["email_address","card_number"],as_index=False).size()
edges3['type_in'] = 'email'
edges3['type_out'] = 'card'
edges3.columns=["point_in","point_out","fraud","type_in","type_out"]
edges3

Unnamed: 0,point_in,point_out,fraud,type_in,type_out
0,-AJhnA6_gxPrBE3SY4Ko2w,HY-lZNO3-1jR0FjsCiTuag,1,email,card
1,-AcOm9tITPMYbamNfLrFgA,mZgywJHD5WlimhuYMNMbIw,13,email,card
2,-ApLJ_HpGntlSLAJsmr7xw,suWNfD7D8xzJQLfkvhC4HA,1,email,card
3,-IZIkqr9zhxawUjAFqJPWg,Tnwcm9UwXtToEb1icFwU9g,1,email,card
4,-Iu93ZNuaMTZixT0Hq7ZRQ,kJn9S5qrmBtAEahJLfiSIQ,1,email,card
...,...,...,...,...,...
2999,zw75Ce9YMASAARak28wtSA,Ymg5-cJa5bs2cbncNNOYJg,1,email,card
3000,zySCXubGnjkcnAoIgX162g,Ueta-Qw220ds-xXWk2Uddg,2,email,card
3001,zyScyyZ73LcFzOz7FcJWow,HDS7DP0tNe5fUu1twTy17Q,1,email,card
3002,zyyDrH9wronGeR0FKCU0kQ,a7J4OkbL30z_vZZG1ngTkQ,12,email,card


In [49]:
df_edges = pd.concat([edges1,edges2,edges3], axis=0)
df_edges

Unnamed: 0,point_in,point_out,fraud,type_in,type_out
0,--2E0toluie1TqMY-oXrQg,GOgdL2ymYoPD99nIT1zbkA,1,ip,card
1,--2E0toluie1TqMY-oXrQg,vtGsRvD7aYpa-LJdW8sbyg,5,ip,card
2,-0jK02r79jRQ2Nzmcbd51Q,GaTo2bLUxy-n939nv2g-ag,1,ip,card
3,-0mAboSyixPty7xuAAnWTA,E3wWIIUb7J4HBwfTbCTB2w,1,ip,card
4,-1I5QWw03mL1yTPQQfQd1w,gZXwSBiQkKyDo3fTH-tk0w,1,ip,card
...,...,...,...,...,...
2999,zw75Ce9YMASAARak28wtSA,Ymg5-cJa5bs2cbncNNOYJg,1,email,card
3000,zySCXubGnjkcnAoIgX162g,Ueta-Qw220ds-xXWk2Uddg,2,email,card
3001,zyScyyZ73LcFzOz7FcJWow,HDS7DP0tNe5fUu1twTy17Q,1,email,card
3002,zyyDrH9wronGeR0FKCU0kQ,a7J4OkbL30z_vZZG1ngTkQ,12,email,card


In [None]:
import networkx as nx

source = 'point_in'
target = 'point_out'

G = nx.from_pandas_edgelist(df_edges, source=source, target = target, edge_attr='fraud')

In [61]:
# generate node color map
nodes_color_map_list = []
type_color_map = {
    "ip": "yellow",
    "card": "blue",
    "email": "red",
}
for v in G.nodes():
    if len(df_sample[df_sample['ip_address'] == v]) > 0:
        nodes_color_map_list.append(type_color_map['ip'])
        continue
    if len(df_sample[df_sample['card_number'] == v]) > 0:
        nodes_color_map_list.append(type_color_map['card'])
        continue
    if len(df_sample[df_sample['email_address'] == v]) > 0:
        nodes_color_map_list.append(type_color_map['email'])
        continue
    raise Exception("Sorry, not matched label")
nodes_color_map_list

['yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'blue',
 'blue',
 'yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'blue',
 'yellow',
 'blue',
 'yellow',
 'blue',
 'yel

In [62]:
#create a larger figure
plt.figure(figsize=(32, 32))

#place nodes according to the force-directed algorithm
pos = nx.spring_layout(G, k=25, iterations=1000, weight=2, scale=5) # output a dict with the country code as key and pos as value
nx.draw_networkx(G, pos=pos, with_labels=False, node_color=nodes_color_map_list, edge_color="lightgray", alpha=0.6, node_size=10)
plt.show()


KeyboardInterrupt



<Figure size 2304x2304 with 0 Axes>