In [1]:
import torch
import pandas as pd
import numpy as np
import networkx as nx

df_sample = pd.read_csv("adyen-dataset.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [78]:
edges1 = df_sample.groupby(["card_number", "ip_address"]).agg(total_transactions=('has_fraudulent_dispute','size'), total_fraud=('has_fraudulent_dispute','sum')).reset_index()
edges1['type_in'] = 'card'
edges1['type_out'] = 'ip'
edges1.columns=["point_in","point_out","total_transactions","total_fraud","type_in","type_out"]

edges2 = df_sample.groupby(["ip_address","email_address"]).agg(total_transactions=('has_fraudulent_dispute','size'), total_fraud=('has_fraudulent_dispute','sum')).reset_index()
edges2['type_in'] = 'ip'
edges2['type_out'] = 'email'
edges2.columns=["point_in","point_out","total_transactions","total_fraud","type_in","type_out"]

edges3 =df_sample.groupby(["email_address","card_number"]).agg(total_transactions=('has_fraudulent_dispute','size'), total_fraud=('has_fraudulent_dispute','sum')).reset_index()
edges3['type_in'] = 'email'
edges3['type_out'] = 'card'
edges3.columns=["point_in","point_out","total_transactions","total_fraud","type_in","type_out"]

In [93]:
df_edges = pd.concat([edges1,edges2,edges3], axis=0)
df_edges = df_edges[df_edges['total_transactions']>0]
df_edges.sort_values(by="point_in")
df_edges

Unnamed: 0,point_in,point_out,total_transactions,total_fraud,type_in,type_out
0,---Lb7c97vCTnUmcZy_IbQ,kCaO4Fh9ANb6DImeer4ZwA,3,0,card,ip
1,--3JjPs6q_dHrf8qXwBRpg,MEQyapFPtk5KUqDp8d8PBw,1,0,card,ip
2,--5EMhocJqROxziBYT2R0Q,qheCkbzaWNW9MRf22bAm2g,4,0,card,ip
3,--7oaSPOkc1Ibv-nerM42Q,VvWBVLvuV1VmwpQzSnMDxQ,1,0,card,ip
4,--93Cfxzu0h3n1oAc90Spw,OXFeETjcJzoREN7jcDjIZg,1,0,card,ip
...,...,...,...,...,...,...
54493,zzl9fCinW9CkKIZtaRIkAA,dvtzLaNWVVDgMVSrUUilBQ,2,0,email,card
54494,zznixqMdaC80Mfrz9SgxRA,0Y5dHu16ZVyLhbBRarCbSA,2,0,email,card
54495,zznixqMdaC80Mfrz9SgxRA,PQhIPrnsTy12h6AxnwH9fg,1,0,email,card
54496,zzuPJQ385bLW-LfOxmkHDA,k19XpWxAkPdjX07LtKcWDw,1,0,email,card


In [115]:
fraud_nodes = df_edges.groupby(["point_in"]).agg(total_transactions = ("total_transactions", "sum"), total_fraud = ("total_fraud", "sum"))
fraud_nodes["fraud_odds"] = fraud_nodes["total_fraud"]/fraud_nodes["total_transactions"]
fraud_nodes = fraud_nodes[["fraud_odds"]].sort_index()
fraud_nodes

Unnamed: 0_level_0,fraud_odds
point_in,Unnamed: 1_level_1
---Lb7c97vCTnUmcZy_IbQ,0.000000
--2E0toluie1TqMY-oXrQg,0.666667
--3JjPs6q_dHrf8qXwBRpg,0.000000
--4uxXYUwFiweL52RP72aQ,0.000000
--58rRRUeyFG-y_jcju51Q,0.000000
...,...
zzvdhyni6XHUDAY3auneAg,0.000000
zzvs1aPirP2a86J_0zWqYA,0.000000
zzwlU4l9eDr51NKJSLr5UA,0.000000
zzyB5mf6IXzsAmx64BVfWg,0.000000


In [117]:
G = nx.from_pandas_edgelist(df_edges.sort_values(by="point_in"), source='point_in', target = 'point_out', edge_attr='total_transactions')

In [118]:
from torch.nn import Linear
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
  def __init__(self):
    super(GCN, self).__init__()
    torch.manual_seed(42)
    self.conv1 = GCNConv(146440, 4)
    self.conv2 = GCNConv(4, 4)
    self.conv3 = GCNConv(4, 2)
    self.classifier = Linear(2, 2)
  def forward(self, x, edge_index):
    h = self.conv1(x, edge_index)
    h = h.tanh()
    h = self.conv2(h, edge_index)
    h = h.tanh()
    h = self.conv3(h, edge_index)
    h = h.tanh()
    out = self.classifier(h)
    return out, h
model = GCN()
print(model)

GCN(
  (conv1): GCNConv(146440, 4)
  (conv2): GCNConv(4, 4)
  (conv3): GCNConv(4, 2)
  (classifier): Linear(in_features=2, out_features=2, bias=True)
)
