In [2]:
import dgl
import numpy as np
import pandas as pd
import torch

from dgl.data.utils import load_graphs, save_graphs

In [2]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [3]:
graph, label_dict = load_graphs('data/tfinance/tfinance')
graph = graph[0]
graph.ndata['label'] = graph.ndata['label'].argmax(1)

In [4]:
graph.edges()

(tensor([    0,     0,     0,  ..., 39356, 39356, 39356]),
 tensor([20227,  1031, 26251,  ..., 34798, 35855, 36455]))

In [5]:
edges = pd.DataFrame({'ID1': graph.edges()[0], 'ID2': graph.edges()[1]})
features = pd.DataFrame(graph.ndata['feature'].numpy(), columns=[f'feature{i+1}' for i in range(10)])
labels = pd.DataFrame(graph.ndata['label'].numpy(), columns=['label'])

In [123]:
# # edges = df_edges.copy()

# # Reformat and convert to tensor
# edge_index = np.array(edges.values).T 
# edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()

# print("shape of edge index is {}".format(edge_index.shape))

shape of edge index is torch.Size([2, 42445086])


In [6]:
sample_size = 15000
selected_nodes = features.sample(sample_size, random_state=42)
selected_idx = list(selected_nodes.index)
selected_nodes = selected_nodes.reset_index(drop=True)
selected_nodes

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10
0,1296.0,259.0,2.0,12.0,36.0,52.0,14.0,40.0,0.229938,0.083333
1,1943.0,1838.0,312.0,72.0,34.0,491.0,6.0,486.0,0.091096,0.029336
2,2306.0,573.0,53.0,19.0,283.0,172.0,48.0,141.0,0.096704,0.022984
3,14.0,908.0,13.0,0.0,871.0,26.0,1.0,25.0,0.285714,0.071429
4,499.0,530.0,6.0,7.0,404.0,53.0,22.0,32.0,0.232465,0.146293
...,...,...,...,...,...,...,...,...,...,...
14995,367.0,244.0,27.0,17.0,26.0,66.0,7.0,61.0,0.564033,0.564033
14996,79.0,8587.0,3.0,5.0,8578.0,6.0,1.0,5.0,0.101266,0.025316
14997,329.0,1634.0,80.0,19.0,29.0,223.0,10.0,221.0,0.291793,0.188450
14998,133.0,7342.0,16.0,7.0,208.0,124.0,11.0,122.0,0.428571,0.300752


In [7]:
selected_edges = edges[edges['ID1'].isin(selected_idx) & edges['ID2'].isin(selected_idx)]
selected_edges

Unnamed: 0,ID1,ID2
0,0,20227
2,0,26251
4,0,1043
6,0,30363
8,0,28575
...,...,...
42445012,39355,28263
42445013,39355,29191
42445023,39355,33799
42445024,39355,34430


In [8]:
selected_labels = labels.loc[selected_idx]
selected_labels

Unnamed: 0,label
10013,0
15505,0
2351,0
39273,0
1451,0
...,...
34601,0
33792,0
29354,0
28539,0


#### Convert to PyG

In [9]:
import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import scipy.sparse as sp
from torch_geometric.loader import DataLoader as pyg_DataLoader


import warnings
warnings.filterwarnings("ignore")

In [10]:
# Setup trans ID to node ID mapping
nodes = list(features.index)
map_id = {selected_idx[i]:i for i in range(sample_size)} # mapping nodes to indexes

# Map transction IDs to node Ids
selected_edges.ID1 = selected_edges.ID1.map(map_id) #get nodes idx1 from edges list and filtered data
selected_edges.ID2 = selected_edges.ID2.map(map_id)
selected_edges = selected_edges.astype(int)

edge_index = np.array(selected_edges.values).T 
edge_index = torch.tensor(edge_index, dtype=torch.long)
print(edge_index)

print("shape of edge index is {}".format(edge_index.shape))

# node_features_t = torch.tensor(np.array(features.values, dtype=np.double), dtype=torch.double)
node_features_t = torch.tensor(np.array(selected_nodes.values, dtype=np.double), dtype=torch.double)

# Create pyG dataset
data_graph = Data(x=node_features_t.float(), edge_index=edge_index,
                               y=torch.tensor(selected_labels.values.flatten(), dtype=torch.long))

tensor([[12230, 12230, 12230,  ..., 10945, 10945, 10945],
        [ 6579,  8455,   272,  ...,  8153,  6480,  9753]])
shape of edge index is torch.Size([2, 6032438])


In [11]:
from sklearn.model_selection import train_test_split

# Split train, test
train_idx, test_idx = train_test_split(range(sample_size), test_size=0.3, random_state=42)

data_graph.train_idx = torch.zeros(sample_size, dtype=torch.bool)
data_graph.train_idx[train_idx] = 1

data_graph.test_idx = torch.zeros(sample_size, dtype=torch.bool)
data_graph.test_idx[test_idx] = 1

In [51]:
# 2-layer GCN
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes, hidden_channels=128):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.conv2(x, edge_index)
        return x

In [52]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
device

'cpu'

In [53]:
# CLASS_WEIGTHS = [0.7,0.3]

def train(model, data, optimizer):
    model.train()
    data = data.to(device)
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # out = out.reshape((data.x.shape[0]))
    # # use weighted cross entropy loss, weighted cross entropy loss to provide higher importance to the illicit samples.
    # weights = torch.tensor(CLASS_WEIGTHS, dtype=torch.float).to(device)
    # loss = F.cross_entropy(out[data.train_idx], data.y[data.train_idx], weight=weights)
    
    loss = F.cross_entropy(out[data.train_idx], data.y[data.train_idx])
    loss.backward()
    optimizer.step()
    return loss

def test(model, data):
    model.eval()
    with torch.no_grad():
        data = data.to(device)
        out = model(data.x, data.edge_index)
        pred_scores = out[data.test_idx]
        pred = torch.argmax(pred_scores, dim=1)
        y = data.y[data.test_idx]
        # metrics for illicit transactions
        acc = accuracy_score(y.cpu(), pred.cpu())
        f1 = f1_score(y.cpu(), pred.cpu(), average='binary')
        precision = precision_score(y.cpu(), pred.cpu(), average='binary')
        recall = recall_score(y.cpu(), pred.cpu(), average='binary')
        auc = roc_auc_score(y.cpu(), pred_scores[:,1].cpu())
        return acc, f1, precision, recall, auc

In [54]:
num_features = data_graph.num_node_features
print("num_features=",num_features)

num_features= 10


In [55]:
model = GCN(num_features, 2).to(device)
num_epochs = 200
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs+1):
    loss = train(model, data_graph, optimizer)
    acc, f1, precision, recall, roc = test(model, data_graph)
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Accuracy: {acc:.4f}, F1: {f1:.4f}, Precision: {precision: .4f}, Recall: {recall:.4f}, ROC: {roc:.4f}')

Epoch: 000, Loss: 375.1660, Accuracy: 0.1744, F1: 0.0954, Precision:  0.0504, Recall: 0.9074, ROC: 0.8093
Epoch: 010, Loss: 26.8001, Accuracy: 0.9522, F1: 0.0183, Precision:  0.6667, Recall: 0.0093, ROC: 0.7960
Epoch: 020, Loss: 33.8413, Accuracy: 0.9520, F1: 0.0092, Precision:  0.5000, Recall: 0.0046, ROC: 0.7957
Epoch: 030, Loss: 32.5310, Accuracy: 0.9522, F1: 0.0183, Precision:  0.6667, Recall: 0.0093, ROC: 0.7971
Epoch: 040, Loss: 27.5327, Accuracy: 0.9522, F1: 0.0183, Precision:  0.6667, Recall: 0.0093, ROC: 0.7995
Epoch: 050, Loss: 20.6009, Accuracy: 0.9549, F1: 0.1212, Precision:  0.9333, Recall: 0.0648, ROC: 0.8035
Epoch: 060, Loss: 12.4452, Accuracy: 0.9544, F1: 0.1202, Precision:  0.8235, Recall: 0.0648, ROC: 0.8099
Epoch: 070, Loss: 4.1538, Accuracy: 0.8924, F1: 0.0763, Precision:  0.0649, Recall: 0.0926, ROC: 0.8209
Epoch: 080, Loss: 3.2037, Accuracy: 0.9371, F1: 0.1184, Precision:  0.1810, Recall: 0.0880, ROC: 0.8239
Epoch: 090, Loss: 2.1298, Accuracy: 0.9162, F1: 0.3599, 

In [60]:
selected_edges.to_csv('data/tfinance/selected_edges.csv', index=False)
selected_nodes.to_csv('data/tfinance/selected_nodes.csv', index=False)
selected_labels.to_csv('data/tfinance/selected_labels.csv', index=False)

In [64]:
# save the train and test indices as csv, integer
np.savetxt("data/tfinance/index/train_idx.csv", train_idx, delimiter=",", fmt='%d')
np.savetxt("data/tfinance/index/test_idx.csv", test_idx, delimiter=",", fmt='%d')

In [12]:
train_idx = pd.read_csv('data/tfinance/index/train_idx.csv', names=['id'])
test_idx = pd.read_csv('data/tfinance/index/test_idx.csv', names=['id'])

X_train = selected_nodes.loc[train_idx['id'].tolist()]
y_train = selected_labels.reset_index().loc[train_idx['id'].tolist()]

X_test = selected_nodes.loc[test_idx['id'].tolist()]
y_test = selected_labels.reset_index().loc[test_idx['id'].tolist()]

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_features=50, random_state=42)

# Train the classifier
rf.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
y_pred = rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
rocauc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

print("Accuracy: ", accuracy)
print("F1:       ", f1)
print("Precision:", precision)
print("Recall:   ", recall)
print("ROC AUC:  ", rocauc)