In [12]:
import torch_geometric

import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import scipy.sparse as sp
from torch_geometric_temporal.nn.recurrent import EvolveGCNO

import warnings
warnings.filterwarnings("ignore")

# ref: https://medium.com/stanford-cs224w/fraud-detection-with-gat-edac49bda1a0

#### dataset

In [10]:
# import data 
df_features = pd.read_csv('../data/elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("../data/elliptic_txs_edgelist.csv")
df_classes =  pd.read_csv("../data/elliptic_txs_classes.csv")

df_classes['class'] = df_classes['class'].map({'unknown': 2, '1':1, '2':0})

# merging dataframes
df_merge = df_features.merge(df_classes, how='left', right_on="txId", left_on=0)
df_merge.drop(0, axis=1, inplace=True)

# check if there are duplicate txId
# print("Number of duplicate txId: ", df_merge.duplicated(subset=['txId']).sum())

# rename column 0 to time_step
df_merge.rename(columns={1: 'time_step'}, inplace=True)
# display(df_merge.head())
# display(df_edges.shape)
edges = df_edges.copy()

# Setup trans ID to node ID mapping
nodes = df_merge['txId'].values
map_id = {j:i for i,j in enumerate(nodes)} # mapping nodes to indexes

# Map transction IDs to node Ids
edges.txId1 = edges.txId1.map(map_id) #get nodes idx1 from edges list and filtered data
edges.txId2 = edges.txId2.map(map_id)
edges = edges.astype(int)

# Reformat and convert to tensor
edge_index = np.array(edges.values).T 
edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()

# print("shape of edge index is {}".format(edge_index.shape))
node_features = df_merge.drop(['txId'], axis=1).copy()
# print("unique=",node_features["class"].unique())

# Retain known vs unknown IDs
all_classified_idx = node_features['class'].loc[node_features['class']!=2].index # filter on known labels
all_unclassified_idx = node_features['class'].loc[node_features['class']==2].index
all_classified_illicit_idx = node_features['class'].loc[node_features['class']==1].index # filter on illicit labels
all_classified_licit_idx = node_features['class'].loc[node_features['class']==0].index # filter on licit labels

# node_features = node_features.drop(columns=[0, 1, 'class'])
# display(node_features.head())
train_classified_idx = node_features.loc[(node_features['time_step'] <= 34) & (node_features['class'] != 2)].index
test_classified_idx = node_features.loc[(node_features['time_step'] > 34) & (node_features['class'] != 2)].index
# print("train_classified_idx.shape=",train_classified_idx.shape)
# print("test_classified_idx.shape=",test_classified_idx.shape)
# node_features.drop(columns=['time_step'], inplace=True)
node_features.drop(columns=['class'], inplace=True)

# Convert to tensor
node_features_t = torch.tensor(np.array(node_features.values, dtype=np.double), dtype=torch.double)
# Define labels
labels = df_merge['class'].values

#create weights tensor with same shape of edge_index
weights = torch.tensor([1]* edge_index.shape[1] , dtype=torch.double) 

# Do train test split on classified_ids
train_idx = train_classified_idx
test_idx = test_classified_idx

# Create pyG dataset
data_graph = Data(x=node_features_t.float(), edge_index=edge_index, edge_attr=weights, 
                               y=torch.tensor(labels, dtype=torch.long))

# Add in the train and valid idx
data_graph.train_idx = train_idx
data_graph.test_idx = test_idx
data_graph

Data(x=[203769, 166], edge_index=[2, 234355], edge_attr=[234355], y=[203769], train_idx=Int64Index([     3,      9,     10,     11,     16,     17,     25,     27,
                29,     30,
            ...
            136232, 136233, 136234, 136236, 136239, 136241, 136243, 136249,
            136250, 136258],
           dtype='int64', length=29894), test_idx=Int64Index([136276, 136277, 136278, 136279, 136280, 136282, 136285, 136287,
            136288, 136291,
            ...
            203727, 203730, 203736, 203740, 203750, 203752, 203754, 203759,
            203763, 203766],
           dtype='int64', length=16670))

#### EvolveGCN Model and Training

In [26]:
class EvolveGCN(torch.nn.Module):
    def __init__(self, node_features, num_classes):
        super(EvolveGCN, self).__init__()
        self.node_features = node_features
        self.num_classes = num_classes
        self.base_model = EvolveGCNO(self.node_features)
        self.fc = torch.nn.Linear(self.node_features, self.num_classes)

    def forward(self, data):
        out = self.base_model(data.x, data.edge_index, data.edge_attr)
        out = out.float() 
        out = self.fc(out)  
        return out


In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [27]:
def train(model, data, optimizer):
    model.train()
    data = data.to(device)
    optimizer.zero_grad()
    out = model(data)
    # out = out.reshape((data.x.shape[0]))
    # TODO :use weighted cross entropy loss
    loss = F.cross_entropy(out[data.train_idx], data.y[data.train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(model, data):
    model.eval()
    with torch.no_grad():
        data = data.to(device)
        out = model(data)
        pred_scores = out[data.test_idx]
        pred = torch.argmax(pred_scores, dim=1)
        y = data.y[data.test_idx]
        acc = accuracy_score(y.cpu(), pred.cpu())
        f1 = f1_score(y.cpu(), pred.cpu())
        precision = precision_score(y.cpu(), pred.cpu())
        recall = recall_score(y.cpu(), pred.cpu())
        roc = roc_auc_score(y.cpu(), pred.cpu())
        return acc, f1, precision, recall, roc

In [24]:
num_features = data_graph.num_node_features
print("num_features=",num_features)

num_features= 166


In [28]:
model = EvolveGCN(num_features, 2).to(device)
num_epochs = 200
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs+1):
    loss = train(model, data_graph, optimizer)
    acc, f1, precision, recall, roc = test(model, data_graph)
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Accuracy: {acc:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, ROC: {roc:.4f}')

Epoch: 000, Loss: 2.3002, Accuracy: 0.0870, F1: 0.1231, Precision: 0.0656, Recall: 0.9861, ROC: 0.5054
Epoch: 010, Loss: 0.6439, Accuracy: 0.9350, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, ROC: 0.5000
Epoch: 020, Loss: 0.3534, Accuracy: 0.7423, F1: 0.2679, Precision: 0.1643, Recall: 0.7258, ROC: 0.7346
Epoch: 030, Loss: 0.3096, Accuracy: 0.9342, F1: 0.0000, Precision: 0.0000, Recall: 0.0000, ROC: 0.4996
Epoch: 040, Loss: 0.2834, Accuracy: 0.8406, F1: 0.2567, Precision: 0.1841, Recall: 0.4238, ROC: 0.6467
Epoch: 050, Loss: 0.2654, Accuracy: 0.9268, F1: 0.0145, Precision: 0.0577, Recall: 0.0083, ROC: 0.4994
Epoch: 060, Loss: 0.2546, Accuracy: 0.8652, F1: 0.2905, Precision: 0.2207, Recall: 0.4247, ROC: 0.6603
Epoch: 070, Loss: 0.2451, Accuracy: 0.8929, F1: 0.2680, Precision: 0.2410, Recall: 0.3019, ROC: 0.6179
Epoch: 080, Loss: 0.2383, Accuracy: 0.8640, F1: 0.2940, Precision: 0.2218, Recall: 0.4358, ROC: 0.6648
Epoch: 090, Loss: 0.2329, Accuracy: 0.8419, F1: 0.3042, Precision: 0.2130

In [None]:
# test 
acc, f1, precision, recall, roc = test(model, data_graph)
print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, ROC: {roc:.4f}")

Accuracy: 0.8443, F1: 0.3317, Precision: 0.2300, Recall: 0.5946, ROC: 0.7282
