In [1]:
import torch_geometric

import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

# ref: https://www.kaggle.com/code/divyareddyyeruva/elliptic-gcn-pyg

#### Import dataset

In [2]:
# import data 
df_features = pd.read_csv('data/elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("data/elliptic_txs_edgelist.csv")
df_classes =  pd.read_csv("data/elliptic_txs_classes.csv")
# map unknown classes to 0
df_classes['class'] = df_classes['class'].apply(lambda x: 0 if x == "unknown" else int(x))

# merging dataframes
df_merge = df_features.merge(df_classes, how='left', right_on="txId", left_on=0)
df_merge.drop(0, axis=1, inplace=True)

# check if there are duplicate txId
print("Number of duplicate txId: ", df_merge.duplicated(subset=['txId']).sum())


Number of duplicate txId:  0


In [3]:
# set txId to index
nodes = df_merge["txId"].values
map_id = {j:i for i,j in enumerate(nodes)} # mapping nodes to indexes

# map nodes to indexes
df_merge["txId"] = df_merge["txId"].map(map_id)

df_edges["txId1"] = df_edges["txId1"].map(map_id)
df_edges["txId2"] = df_edges["txId2"].map(map_id)

display(df_merge.head())
display(df_edges.head())

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,txId,class
0,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,-0.167933,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,0,0
1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,-0.167948,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,1,0
2,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,-0.168576,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,2,0
3,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,3,2
4,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,0.041399,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,4,0


Unnamed: 0,txId1,txId2
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9


#### Split dataset masks

In [4]:
# take time step from 1 to 34 as train data
df_train = df_merge[df_merge[1] <= 34]
# take rest as test data
df_test = df_merge[df_merge[1] > 34]
display(df_train.head())
display(df_test.head())

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,txId,class
0,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,-0.167933,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,0,0
1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,-0.167948,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,1,0
2,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,-0.168576,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,2,0
3,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,3,2
4,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,0.041399,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,4,0


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,txId,class
136265,35,-0.159837,-0.030732,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.150191,-0.156195,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,136265,0
136266,35,-0.165893,-0.029572,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.156388,-0.162304,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,136266,0
136267,35,-0.129693,0.070098,1.573595,-0.12197,0.075226,-0.113002,-0.061584,-0.119348,-0.125783,...,-0.463356,-0.462939,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,136267,0
136268,35,-0.111789,1.29491,1.573595,0.553368,-0.043875,0.641758,-0.061584,-0.159732,-0.158279,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,136268,0
136269,35,-0.172796,-0.081127,-1.201369,-0.046932,-0.043875,-0.02914,-0.061584,-0.163571,-0.169343,...,-0.979074,-0.978556,0.018279,0.854508,2.146417,2.013077,-1.760926,-1.760984,136269,0


In [5]:
num_features = 94

# split train and test features
train_features = df_train.iloc[:, 1:num_features+1].values
test_features = df_test.iloc[:, 1:num_features+1].values

# split train and test labels, label is last column
train_labels = df_train.iloc[:, -1].values
test_labels = df_test.iloc[:, -1].values


print("Train features shape: ", train_features.shape)
print("Train labels shape: ", train_labels.shape)

print("Test features shape: ", test_features.shape)
print("Test labels shape: ", test_labels.shape)

Train features shape:  (136265, 94)
Train labels shape:  (136265,)
Test features shape:  (67504, 94)
Test labels shape:  (67504,)


In [6]:
edges = df_edges.copy()
edges = edges.astype(int)

all_edge_index = np.array(edges.values).T

all_edge_index = torch.tensor(all_edge_index, dtype=torch.long).contiguous()
print(all_edge_index.shape)

torch.Size([2, 234355])


In [7]:
# maske with same time step
train_edge_mask = (edges["txId1"].isin(df_train["txId"].values)) & (edges["txId2"].isin(df_train["txId"].values))
train_edge_index = all_edge_index[:, train_edge_mask.values]
print(train_edge_index.shape)

torch.Size([2, 156843])


In [8]:
# edge mask for test data
test_edge_mask = (edges["txId1"].isin(df_test["txId"].values)) & (edges["txId2"].isin(df_test["txId"].values))
test_edge_index = edges[["txId1", "txId2"]][test_edge_mask.values]

# map test edge index value and test node index
test_nodes = df_test["txId"].values
map_test_nodes = {j:i for i,j in enumerate(test_nodes)}

test_edge_index["txId1"] = test_edge_index["txId1"].map(map_test_nodes)
test_edge_index["txId2"] = test_edge_index["txId2"].map(map_test_nodes)

test_edge_index = np.array(test_edge_index.values).T
test_edge_index = torch.tensor(test_edge_index, dtype=torch.long).contiguous()
print(test_edge_index.shape)

torch.Size([2, 77512])


In [9]:
# construct graph train data and test data
train_graph = Data(x=torch.tensor(train_features, dtype=torch.double), edge_index=train_edge_index, y=torch.tensor(train_labels, dtype=torch.long))
test_graph = Data(x=torch.tensor(test_features, dtype=torch.double), edge_index=test_edge_index, y=torch.tensor(test_labels, dtype=torch.long))
print(train_graph)
print(test_graph)

Data(x=[136265, 94], edge_index=[2, 156843], y=[136265])
Data(x=[67504, 94], edge_index=[2, 77512], y=[67504])


#### GCN Model and Training

In [10]:
# 2-layer GCN
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes, hidden_channels=128):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
        self.dropout = Dropout(0.5)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return x

In [11]:
device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [12]:
def train(model, data, optimizer):
    model.train()
    data = data.to(device)
    optimizer.zero_grad()
    logits = model(data.x, data.edge_index)
    loss = F.cross_entropy(logits, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

def test(model, data):
    model.eval()
    with torch.no_grad():
        data = data.to(device)
        logits = model(data.x, data.edge_index)
        pred = logits.argmax(dim=1)
        y_cpu = data.y.cpu() 
        pred_cpu = pred.cpu() 
        acc = accuracy_score(y_cpu, pred_cpu)
    return acc

In [15]:
model = GCN(num_features, 3).to(device)
num_epochs = 200
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

pbr = tqdm(range(1, num_epochs+1))
for epoch in pbr:
    model = model.double()
    loss = train(model, train_graph, optimizer)
    acc = test(model, test_graph)
    pbr.set_description(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')

Epoch: 200, Loss: 0.5310, Acc: 0.7496: 100%|██████████| 200/200 [00:05<00:00, 34.89it/s]


In [16]:
# test accuracy
test_acc = test(model, test_graph.to(device))
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.7496
