In [212]:
import torch_geometric

import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

# ref: https://www.kaggle.com/code/divyareddyyeruva/elliptic-gcn-pyg

#### Import dataset

In [213]:
# import data 
df_features = pd.read_csv('data/elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("data/elliptic_txs_edgelist.csv")
df_classes =  pd.read_csv("data/elliptic_txs_classes.csv")
# map unknown classes to -1
df_classes['class'] = df_classes['class'].apply(lambda x: -1 if x == "unknown" else int(x))

# merging dataframes
df_merge = df_features.merge(df_classes, how='left', right_on="txId", left_on=0)
display(df_merge.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,txId,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,230425980,-1
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,5530458,-1
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,232022460,-1
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,232438397,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,230460314,-1


#### Split dataset masks

In [214]:
# take time step from 1 to 34 as train data
df_train = df_merge[df_merge[1] <= 34]
# take rest as test data
df_test = df_merge[df_merge[1] > 34]
display(df_train.head())
display(df_test.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,txId,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,230425980,-1
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,5530458,-1
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,232022460,-1
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,232438397,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,230460314,-1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,txId,class
136265,54785412,35,-0.159837,-0.030732,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.150191,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,54785412,-1
136266,69354384,35,-0.165893,-0.029572,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.156388,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,69354384,-1
136267,54775772,35,-0.129693,0.070098,1.573595,-0.12197,0.075226,-0.113002,-0.061584,-0.119348,...,-0.463356,-0.462939,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,54775772,-1
136268,69343934,35,-0.111789,1.29491,1.573595,0.553368,-0.043875,0.641758,-0.061584,-0.159732,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,69343934,-1
136269,70102750,35,-0.172796,-0.081127,-1.201369,-0.046932,-0.043875,-0.02914,-0.061584,-0.163571,...,-0.979074,-0.978556,0.018279,0.854508,2.146417,2.013077,-1.760926,-1.760984,70102750,-1


In [215]:
num_features = 94

# split train and test features
train_features = df_train.iloc[:, 2:2+num_features].values
test_features = df_test.iloc[:, 2:2+num_features].values

# split train and test labels
train_labels = df_train.iloc[:, -1].values
test_labels = df_test.iloc[:, -1].values

print("Train features shape: ", train_features.shape)
print("Train labels shape: ", train_labels.shape)

print("Test features shape: ", test_features.shape)
print("Test labels shape: ", test_labels.shape)

Train features shape:  (136265, 94)
Train labels shape:  (136265,)
Test features shape:  (67504, 94)
Test labels shape:  (67504,)


In [216]:
# all nodes in data
nodes = df_merge[0].values
map_id = {j:i for i,j in enumerate(nodes)} # mapping nodes to indexes

edges = df_edges.copy()
edges.txId1 = edges.txId1.map(map_id)
edges.txId2 = edges.txId2.map(map_id)
edges = edges.astype(int)

edge_index = np.array(edges.values).T

edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()
edge_weight = torch.tensor([1]* edge_index.shape[1] , dtype=torch.double)
print(edge_index.shape)

torch.Size([2, 234355])


In [222]:
# construct graph train data and test data
train_graph = Data(x=torch.tensor(train_features, dtype=torch.float), edge_index=edge_index, edge_weight=edge_weight, y=torch.tensor(train_labels, dtype=torch.double))
test_graph = Data(x=torch.tensor(test_features, dtype=torch.float), edge_index=edge_index, edge_weight=edge_weight, y=torch.tensor(test_labels, dtype=torch.double))
print(train_graph)
print(test_graph)

Data(x=[136265, 94], edge_index=[2, 234355], y=[136265], edge_weight=[234355])
Data(x=[67504, 94], edge_index=[2, 234355], y=[67504], edge_weight=[234355])


#### GCN Model and Training

In [241]:
# 2-layer GCN
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes, hidden_channels=128):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
        
    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weight
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return F.log_softmax(x, dim=1)

In [242]:
def train(model, data, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.cross_entropy(out, data.y)
    loss.backward()
    optimizer.step()
    return loss

def test(model, data):
    model.eval()
    out = model(data)
    pred = out.argmax(dim=1)
    acc = accuracy_score(data.y, pred)
    return acc

In [243]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [244]:
model = GCN(num_features, 3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
num_epochs = 200

for epoch in tqdm(range(1, num_epochs+1)):
    train_loss = train(model, train_graph, optimizer)
    train_acc = test(model, train_graph)
    test_acc = test(model, test_graph)
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, '
              f'Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

  0%|          | 0/200 [00:00<?, ?it/s]


RuntimeError: index 136266 is out of bounds for dimension 0 with size 136265