In [1]:
import dgl
import numpy as np
import pandas as pd
import torch

from dgl.data.utils import load_graphs, save_graphs

In [2]:
# Load the serialized DGL graph list from disk and keep the first graph.
loaded_graphs, label_dict = load_graphs('data/tfinance')
graph = loaded_graphs[0]

# Collapse the 2-D label tensor to one class index per node by taking the
# arg-max along dim 1 (presumably the stored labels are one-hot -- TODO confirm).
raw_labels = graph.ndata['label']
graph.ndata['label'] = raw_labels.argmax(1)

In [3]:
# Display the edge list as a (source node IDs, destination node IDs) pair of tensors.
graph.edges()

(tensor([    0,     0,     0,  ..., 39356, 39356, 39356]),
 tensor([20227,  1031, 26251,  ..., 34798, 35855, 36455]))

In [4]:
# Export the DGL graph into plain pandas frames (edge list, node features, node labels).

# Query the edge list once and convert explicitly to NumPy, consistent with the
# feature/label frames below.
src_ids, dst_ids = graph.edges()
edges = pd.DataFrame({'ID1': src_ids.numpy(), 'ID2': dst_ids.numpy()})

# Derive the number of feature columns from the tensor itself instead of
# hard-coding it, so the cell keeps working if the feature width changes.
feature_matrix = graph.ndata['feature'].numpy()
features = pd.DataFrame(
    feature_matrix,
    columns=[f'feature{i+1}' for i in range(feature_matrix.shape[1])],
)

labels = pd.DataFrame(graph.ndata['label'].numpy(), columns=['label'])

#### Convert to PyG

In [5]:
import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Edge list as a (2, num_edges) integer tensor: row 0 = 'ID1', row 1 = 'ID2'.
edge_index = np.array(edges.values).T
edge_index = torch.tensor(edge_index, dtype=torch.long)

print("shape of edge index is {}".format(edge_index.shape))

node_features_t = torch.tensor(np.array(features.values, dtype=np.double), dtype=torch.double)

# Labels must be a 1-D tensor of class indices: `labels.values` is a
# (num_nodes, 1) column, which F.cross_entropy rejects as a multi-target
# tensor, so take the 'label' column directly to get shape (num_nodes,).
node_labels_t = torch.tensor(labels['label'].values, dtype=torch.long)

# Create pyG dataset
data_graph = Data(x=node_features_t.float(), edge_index=edge_index,
                  y=node_labels_t)

shape of edge index is torch.Size([2, 42445086])


In [7]:
from sklearn.model_selection import train_test_split

# Randomly hold out 30% of the node IDs for testing; the fixed random_state
# makes the split repeatable.
node_ids = range(data_graph.num_nodes)
train_idx, test_idx = train_test_split(node_ids, test_size=0.3, random_state=42)

# Despite the `_idx` suffix, the attributes stored on the graph are boolean
# masks over all nodes (True = node belongs to that split).
train_mask = torch.zeros(data_graph.num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
data_graph.train_idx = train_mask

test_mask = torch.zeros(data_graph.num_nodes, dtype=torch.bool)
test_mask[test_idx] = True
data_graph.test_idx = test_mask


In [8]:
# 2-layer GCN
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Number of output classes (size of the logit vector).
        hidden_channels: Width of the hidden representation between the
            two convolutions.
    """

    def __init__(self, num_features, num_classes, hidden_channels=128):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        """Return un-normalised class scores (logits) for every node.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed through to both convolutions.
        """
        # A non-linearity between the layers is required; without it the two
        # convolutions compose into a single linear propagation and the
        # "2-layer" model has no more expressive power than a linear one.
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

In [9]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [10]:
# NOTE: a class-weighted loss was tried earlier (weights [0.7, 0.3] passed via
# the `weight=` argument of F.cross_entropy); it is currently disabled and the
# plain, unweighted cross entropy is used.

def train(model, data, optimizer):
    """Run one full-batch optimisation step on the training nodes.

    Args:
        model: Module called as ``model(data.x, data.edge_index)`` that
            returns per-node class logits.
        data: Graph object providing ``x``, ``edge_index``, ``y`` and the
            boolean node mask ``train_idx``.
        optimizer: Optimizer holding the parameters of ``model``.

    Returns:
        The training loss of this step as a scalar tensor detached from the
        autograd graph.
    """
    model.train()
    data = data.to(device)
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # F.cross_entropy expects a 1-D tensor of class indices; flatten so that
    # labels stored as a (num_nodes, 1) column work as well as 1-D labels.
    target = data.y[data.train_idx].reshape(-1)
    loss = F.cross_entropy(out[data.train_idx], target)
    loss.backward()
    optimizer.step()
    # Detach so the caller can log the value without keeping the graph alive.
    return loss.detach()

def test(model, data):
    """Evaluate the model on the held-out test nodes.

    Args:
        model: Module called as ``model(data.x, data.edge_index)`` that
            returns per-node class logits.
        data: Graph object providing ``x``, ``edge_index``, ``y`` and the
            boolean node mask ``test_idx``.

    Returns:
        Tuple ``(accuracy, f1, precision, recall, roc_auc)``. F1, precision
        and recall use ``average='binary'``, i.e. they are reported for the
        positive class (label 1).
    """
    model.eval()
    with torch.no_grad():
        data = data.to(device)
        out = model(data.x, data.edge_index)
        logits = out[data.test_idx]

        # Move everything to the CPU once; flatten the labels so a
        # (num_nodes, 1) label column is handled like 1-D labels.
        pred = torch.argmax(logits, dim=1).cpu()
        y = data.y[data.test_idx].reshape(-1).cpu()

        # ROC-AUC needs a score that ranks like P(class 1). The raw class-1
        # logit does not (the probability depends on the logit *difference*),
        # so use the softmax probability of the positive class instead.
        positive_prob = torch.softmax(logits, dim=1)[:, 1].cpu()

        # metrics for the positive class (label 1)
        acc = accuracy_score(y, pred)
        f1 = f1_score(y, pred, average='binary')
        precision = precision_score(y, pred, average='binary')
        recall = recall_score(y, pred, average='binary')
        auc = roc_auc_score(y, positive_prob)
        return acc, f1, precision, recall, auc

In [11]:
# Width of the model's input layer: the number of feature columns per node
# as reported by the PyG data object.
num_features = data_graph.num_node_features
print("num_features=", num_features, sep=" ")

num_features= 10


In [12]:
# Seed the RNG before building the model so the weight initialisation, and
# therefore the reported metrics, are reproducible across kernel restarts.
torch.manual_seed(42)

model = GCN(num_features, 2).to(device)
num_epochs = 500
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Epochs are numbered 1..num_epochs so that exactly `num_epochs` optimisation
# steps are performed (the previous `range(num_epochs+1)` ran one extra step).
for epoch in range(1, num_epochs + 1):
    loss = train(model, data_graph, optimizer)
    acc, f1, precision, recall, roc = test(model, data_graph)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Accuracy: {acc:.4f}, F1: {f1:.4f}, Precision: {precision: .4f}, Recall: {recall:.4f}, ROC: {roc:.4f}')