In [1]:
import torch_geometric

import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

# ref: https://www.kaggle.com/code/divyareddyyeruva/elliptic-gcn-pyg

#### Import dataset

In [2]:
# import data 
df_features = pd.read_csv('data/elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("data/elliptic_txs_edgelist.csv")
df_classes =  pd.read_csv("data/elliptic_txs_classes.csv")
# map unknown classes to -1
df_classes['class'] = df_classes['class'].apply(lambda x: 0 if x == "unknown" else int(x))

# merging dataframes
df_merge = df_features.merge(df_classes, how='left', right_on="txId", left_on=0)
display(df_merge.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,txId,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,230425980,0
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,5530458,0
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,232022460,0
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,232438397,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,230460314,0


#### Split dataset masks

In [3]:
# take time step from 1 to 34 as train data
df_train = df_merge[df_merge[1] <= 34]
# take rest as test data
df_test = df_merge[df_merge[1] > 34]
display(df_train.head())
display(df_test.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,txId,class
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,230425980,0
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,5530458,0
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,232022460,0
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,232438397,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,230460314,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,txId,class
136265,54785412,35,-0.159837,-0.030732,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.150191,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,54785412,0
136266,69354384,35,-0.165893,-0.029572,1.018602,-0.12197,-0.043875,-0.113002,-0.061584,-0.156388,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,69354384,0
136267,54775772,35,-0.129693,0.070098,1.573595,-0.12197,0.075226,-0.113002,-0.061584,-0.119348,...,-0.463356,-0.462939,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,54775772,0
136268,69343934,35,-0.111789,1.29491,1.573595,0.553368,-0.043875,0.641758,-0.061584,-0.159732,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,69343934,0
136269,70102750,35,-0.172796,-0.081127,-1.201369,-0.046932,-0.043875,-0.02914,-0.061584,-0.163571,...,-0.979074,-0.978556,0.018279,0.854508,2.146417,2.013077,-1.760926,-1.760984,70102750,0


In [4]:
num_features = 94

# split train and test features
train_features = df_train.iloc[:, 2:2+num_features].values
test_features = df_test.iloc[:, 2:2+num_features].values

# split train and test labels
train_labels = df_train.iloc[:, -1].values
test_labels = df_test.iloc[:, -1].values

print("Train features shape: ", train_features.shape)
print("Train labels shape: ", train_labels.shape)

print("Test features shape: ", test_features.shape)
print("Test labels shape: ", test_labels.shape)

Train features shape:  (136265, 94)
Train labels shape:  (136265,)
Test features shape:  (67504, 94)
Test labels shape:  (67504,)


In [5]:
# all nodes in data
nodes = df_merge[0].values
map_id = {j:i for i,j in enumerate(nodes)} # mapping nodes to indexes

edges = df_edges.copy()
edges.txId1 = edges.txId1.map(map_id)
edges.txId2 = edges.txId2.map(map_id)
edges = edges.astype(int)

edge_index = np.array(edges.values).T

edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()
edge_weight = torch.tensor([1]* edge_index.shape[1] , dtype=torch.double)
print(edge_index.shape)

torch.Size([2, 234355])


In [6]:
train_nodes = df_train[0].unique()
map_id = {j:i for i,j in enumerate(nodes)}
train_idx = [map_id[node_id] for node_id in train_nodes]

In [7]:
train_edge_mask = edges['txId1'].isin(train_idx) & edges['txId2'].isin(train_idx)
train_edge_index = edge_index[:, train_edge_mask]
train_edge_weight = edge_weight[train_edge_mask]
train_edge_index.shape

torch.Size([2, 156843])

In [8]:
# inverse mapping for test data
test_nodes = df_test[0].unique()
map_id = {j:i for i,j in enumerate(nodes)}
test_idx = [map_id[node_id] for node_id in test_nodes]

test_edge_mask = edges['txId1'].isin(test_idx) & edges['txId2'].isin(test_idx)
test_edge_index = edge_index[:, test_edge_mask]
test_edge_weight = edge_weight[test_edge_mask]
test_edge_index.shape

torch.Size([2, 77512])

In [27]:
new_test_index = test_edge_index - 136265
new_test_index = new_test_index.long()
type(new_test_index)


torch.Tensor

In [28]:
# construct graph train data and test data
train_graph = Data(x=torch.tensor(train_features, dtype=torch.double), edge_index=train_edge_index, edge_weight=train_edge_weight, y=torch.tensor(train_labels, dtype=torch.long))
# train_graph = Data(x=torch.tensor(train_features, dtype=torch.float), edge_index=edge_index, edge_weight=edge_weight, y=torch.tensor(train_labels, dtype=torch.double))
test_graph = Data(x=torch.tensor(test_features, dtype=torch.double), edge_index=new_test_index, edge_weight=test_edge_weight, y=torch.tensor(test_labels, dtype=torch.long))
print(train_graph)
print(test_graph)

Data(x=[136265, 94], edge_index=[2, 156843], y=[136265], edge_weight=[156843])
Data(x=[67504, 94], edge_index=[2], y=[67504], edge_weight=[77512])


#### GCN Model and Training

In [10]:
# 2-layer GCN
class GCN(torch.nn.Module):
    def __init__(self, num_features, num_classes, hidden_channels=128):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
        
    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_weight
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return F.log_softmax(x, dim=1)

In [11]:
device = torch.device('cpu')

device(type='cpu')

In [12]:
def train(model, data, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.cross_entropy(out, data.y)
    loss.backward()
    optimizer.step()
    return loss

def test(model, data):
    model.eval()
    with torch.no_grad():
        # logits = model(data.x, data.edge_index)
        logits = model(data)
        preds = logits.argmax(dim=1)
        acc = accuracy_score(data.y.cpu(), preds.cpu())
    return acc

In [13]:
model = GCN(num_features, 3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
num_epochs = 200
print(model)

# for epoch in tqdm(range(1, num_epochs+1)):
for epoch in range(1, num_epochs+1):
    model = model.double()
    train_loss = train(model, train_graph.to(device), optimizer)
    # train_acc = test(model, train_graph)
    # test_acc = test(model, test_graph)
    # if epoch % 10 == 0:
    print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, ')

GCN(
  (conv1): GCNConv(94, 128)
  (conv2): GCNConv(128, 3)
)
Epoch: 1, Train Loss: 1.2579, 
Epoch: 2, Train Loss: 0.7184, 
Epoch: 3, Train Loss: 0.6313, 
Epoch: 4, Train Loss: 0.6341, 
Epoch: 5, Train Loss: 0.6375, 
Epoch: 6, Train Loss: 0.6326, 
Epoch: 7, Train Loss: 0.6004, 
Epoch: 8, Train Loss: 0.5883, 
Epoch: 9, Train Loss: 0.5670, 
Epoch: 10, Train Loss: 0.5450, 
Epoch: 11, Train Loss: 0.5433, 
Epoch: 12, Train Loss: 0.5336, 
Epoch: 13, Train Loss: 0.5286, 
Epoch: 14, Train Loss: 0.5244, 
Epoch: 15, Train Loss: 0.5150, 
Epoch: 16, Train Loss: 0.5104, 
Epoch: 17, Train Loss: 0.5072, 
Epoch: 18, Train Loss: 0.5038, 
Epoch: 19, Train Loss: 0.4998, 
Epoch: 20, Train Loss: 0.4959, 
Epoch: 21, Train Loss: 0.4949, 
Epoch: 22, Train Loss: 0.4921, 
Epoch: 23, Train Loss: 0.4889, 
Epoch: 24, Train Loss: 0.4873, 
Epoch: 25, Train Loss: 0.4890, 
Epoch: 26, Train Loss: 0.4830, 
Epoch: 27, Train Loss: 0.4799, 
Epoch: 28, Train Loss: 0.4800, 
Epoch: 29, Train Loss: 0.4768, 
Epoch: 30, Train Lo

In [21]:
# test accuracy
test_acc = test(model, test_graph.to(device))
print(f"Test Accuracy: {test_acc:.4f}")

ValueError: `MessagePassing.propagate` only supports integer tensors of shape `[2, num_messages]`, `torch_sparse.SparseTensor` or `torch.sparse.Tensor` for argument `edge_index`.