In [16]:
import torch_geometric

import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

# ref: https://www.kaggle.com/code/divyareddyyeruva/elliptic-gcn-pyg

#### Import dataset

In [17]:
# import data 
df_features = pd.read_csv('data/elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("data/elliptic_txs_edgelist.csv")
df_classes =  pd.read_csv("data/elliptic_txs_classes.csv")
# map unknown classes to 0
df_classes['class'] = df_classes['class'].apply(lambda x: 0 if x == "unknown" else int(x))

# merging dataframes
df_merge = df_features.merge(df_classes, how='left', right_on="txId", left_on=0)
df_merge.drop(0, axis=1, inplace=True)

# check if there are duplicate txId
print("Number of duplicate txId: ", df_merge.duplicated(subset=['txId']).sum())


Number of duplicate txId:  0


In [18]:
# remove unknown classes from df_merge
df_merge = df_merge[df_merge["class"] != 0]

display(df_merge.shape)
display(df_edges.shape)

(46564, 168)

(234355, 2)

#### Split dataset masks

In [19]:
# take time step from 1 to 34 as train data
df_train = df_merge[df_merge[1] <= 34]
# take rest as test data
df_test = df_merge[df_merge[1] > 34]
display(df_train)
display(df_test)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,txId,class
3,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792,232438397,2
9,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,-0.144554,...,0.241128,0.241406,0.604120,0.008632,-0.131155,0.333211,-0.120613,-0.119792,232029206,2
10,1,-0.147852,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.137933,-0.144108,...,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,232344069,2
11,1,-0.151357,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.141519,-0.147643,...,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,27553029,2
16,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.029140,0.242712,-0.163640,-0.169115,...,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.054450,-1.760926,-1.760984,3881097,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136241,34,-0.172968,-0.071395,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163627,-0.169442,...,1.461330,1.461369,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,119222131,2
136243,34,-0.172924,-0.107411,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163583,-0.169398,...,0.241128,0.241406,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,77410785,1
136249,34,-0.172897,-0.070152,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163555,-0.169371,...,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,288397726,2
136250,34,-0.155367,-0.081852,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.145619,-0.151686,...,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,227494727,2


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,txId,class
136276,35,-0.172982,-0.055242,-1.201369,-0.121970,-0.024025,-0.113002,-0.061584,-0.163642,-0.169456,...,0.241128,0.241406,-0.216057,-0.125939,-0.131155,-0.269818,-0.120613,-0.119792,54751137,2
136277,35,-0.166832,-0.115508,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.157351,-0.163254,...,-0.979074,-0.978556,0.018279,-0.049041,-0.038193,-0.011377,-1.760926,-1.760984,67576672,2
136278,35,-0.167233,-0.115086,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.157761,-0.163658,...,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399,69767012,2
136279,35,-0.172509,-0.120473,-0.091383,-0.121970,-0.043875,-0.113002,-0.061584,-0.163159,-0.168980,...,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984,70384401,1
136280,35,-0.172805,-0.112290,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163461,-0.169278,...,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,67603017,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203752,49,-0.159293,-0.037276,1.018602,-0.121970,0.035526,-0.113002,-0.061584,-0.149635,-0.155646,...,0.231244,-0.388216,-0.098889,1.931078,3.168259,3.707301,-1.390548,-1.214035,80329479,2
203754,49,-0.172962,-0.126566,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163622,-0.169437,...,0.241128,0.241406,10.914916,1.700384,-0.131155,7.914145,-0.120613,-0.119792,158406298,2
203759,49,-0.170412,-0.078164,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163631,-0.167106,...,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,158375075,1
203763,49,-0.093732,-0.116160,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.082559,-0.089510,...,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,147478192,2


In [20]:
num_features = 94

# split train and test features
train_features = df_train.iloc[:, 1:num_features+1].values
test_features = df_test.iloc[:, 1:num_features+1].values

# split train and test labels, label is last column
train_labels = df_train.iloc[:, -1].values
test_labels = df_test.iloc[:, -1].values


print("Train features shape: ", train_features.shape)
print("Train labels shape: ", train_labels.shape)

print("Test features shape: ", test_features.shape)
print("Test labels shape: ", test_labels.shape)

Train features shape:  (29894, 94)
Train labels shape:  (29894,)
Test features shape:  (16670, 94)
Test labels shape:  (16670,)


In [21]:
edges = df_edges.copy()
edges = edges.astype(int)

# not used
all_edge_index = np.array(edges.values).T
all_edge_index = torch.tensor(all_edge_index, dtype=torch.long).contiguous()
print(all_edge_index.shape)

torch.Size([2, 234355])


In [22]:
# maske with same time step
train_edge_mask = (edges["txId1"].isin(df_train["txId"].values)) & (edges["txId2"].isin(df_train["txId"].values))
train_edge_index = edges[["txId1", "txId2"]][train_edge_mask.values]

train_nodes = df_train["txId"].values
map_train_id = {j:i for i,j in enumerate(train_nodes)}

train_edge_index["txId1"] = train_edge_index["txId1"].map(map_train_id)
train_edge_index["txId2"] = train_edge_index["txId2"].map(map_train_id)

train_edge_index = np.array(train_edge_index.values).T
train_edge_index = torch.tensor(train_edge_index, dtype=torch.long).contiguous()

print(train_edge_index.shape)

torch.Size([2, 22898])


In [23]:
# edge mask for test data
test_edge_mask = (edges["txId1"].isin(df_test["txId"].values)) & (edges["txId2"].isin(df_test["txId"].values))
test_edge_index = edges[["txId1", "txId2"]][test_edge_mask.values]

# map test edge index value and test node index
test_nodes = df_test["txId"].values
map_test_nodes = {j:i for i,j in enumerate(test_nodes)}

test_edge_index["txId1"] = test_edge_index["txId1"].map(map_test_nodes)
test_edge_index["txId2"] = test_edge_index["txId2"].map(map_test_nodes)

test_edge_index = np.array(test_edge_index.values).T
test_edge_index = torch.tensor(test_edge_index, dtype=torch.long).contiguous()
print(test_edge_index.shape)

torch.Size([2, 13726])


In [24]:
# construct graph train data and test data
train_graph = Data(x=torch.tensor(train_features, dtype=torch.double), edge_index=train_edge_index, y=torch.tensor(train_labels, dtype=torch.long))
test_graph = Data(x=torch.tensor(test_features, dtype=torch.double), edge_index=test_edge_index, y=torch.tensor(test_labels, dtype=torch.long))
print(train_graph)
print(test_graph)

Data(x=[29894, 94], edge_index=[2, 22898], y=[29894])
Data(x=[16670, 94], edge_index=[2, 13726], y=[16670])


#### GCN Model and Training

In [25]:
# 2-layer GCN
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes, hidden_channels=128):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
        self.dropout = Dropout(0.5)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return x

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [27]:
def train(model, data, optimizer):
    model.train()
    data = data.to(device)
    optimizer.zero_grad()
    logits = model(data.x, data.edge_index)
    loss = F.cross_entropy(logits, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

def test(model, data):
    model.eval()
    with torch.no_grad():
        data = data.to(device)
        logits = model(data.x, data.edge_index)
        pred = logits.argmax(dim=1)
        y_cpu = data.y.cpu() 
        pred_cpu = pred.cpu() 
        acc = accuracy_score(y_cpu, pred_cpu)
    return acc

In [28]:
model = GCN(num_features, 3).to(device)
num_epochs = 200
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

pbr = tqdm(range(1, num_epochs+1))
for epoch in pbr:
    model = model.double()
    loss = train(model, train_graph, optimizer)
    acc = test(model, test_graph)
    pbr.set_description(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')

Epoch: 200, Loss: 0.2780, Acc: 0.8591: 100%|██████████| 200/200 [00:01<00:00, 112.87it/s]


In [29]:
# test accuracy
test_acc = test(model, test_graph.to(device))
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.8591
