# Base Models for BTCGraphGuard

**Authors: Xuhui Zhan, Tianhao Qu, Siyu Yang**


## Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import networkx as nx
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.nn import SAGEConv
import time



In [2]:
# Prepare data: read the three Elliptic CSV files found under `data_root`.
data_root = './elliptic_bitcoin_dataset'
features_csv = os.path.join(data_root, 'elliptic_txs_features.csv')
edgelist_csv = os.path.join(data_root, 'elliptic_txs_edgelist.csv')
classes_csv = os.path.join(data_root, 'elliptic_txs_classes.csv')

# The features file is read with header=None, so its columns are named below.
elliptic_txs_features = pd.read_csv(features_csv, header=None)
elliptic_txs_edgelist = pd.read_csv(edgelist_csv)
elliptic_txs_classes = pd.read_csv(classes_csv)

# Column 0 is named 'txId'; the remaining 166 columns become V1 .. V166.
elliptic_txs_features.columns = ['txId', *(f'V{i}' for i in range(1, 167))]


In [3]:
# Show (rows, columns) of the features, edge list and classes tables, in that order.
for _frame in (elliptic_txs_features, elliptic_txs_edgelist, elliptic_txs_classes):
    print(_frame.shape)


(203769, 167)
(234355, 2)
(203769, 2)


In [4]:
# Readable names for the two labelled classes; any other value in 'class'
# is left unchanged by replace().
class_name_by_code = {'1': 'illicit', '2': 'licit'}
elliptic_txs_classes['class_mapped'] = elliptic_txs_classes['class'].replace(class_name_by_code)

In [5]:
# Create Graph: build a NetworkX graph from the txId1 / txId2 columns of the edge list.
G = nx.from_pandas_edgelist(elliptic_txs_edgelist, source='txId1', target='txId2')

## Random seed settings

In [6]:
# Seed handed to the torch / numpy / random seeding helpers in this notebook.
RANDOM_STATE = 42
# NOTE(review): the GAT and GraphSAGE training loops visible in this notebook set
# their own num_epochs = 200 and do not read NUM_EPOCHS — confirm it is still needed.
NUM_EPOCHS = 30

In [7]:
def set_seed_for_torch(seed):
    """Seed PyTorch's random generators and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed forwarded to the torch seeding functions.
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed the current GPU first, then every GPU (multi-GPU setups).
        for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            seed_cuda(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
def set_seed_for_numpy(seed):
    """Seed NumPy's global random number generator with `seed`."""
    np.random.seed(seed=seed)
    
def set_seed_for_random(seed):
    """Seed the standard-library `random` module with `seed`."""
    random.seed(a=seed)

In [8]:
# Apply the same seed to torch, numpy and random, in that order.
for _seed_fn in (set_seed_for_torch, set_seed_for_numpy, set_seed_for_random):
    _seed_fn(RANDOM_STATE)

## EDA

In [9]:
# Placeholder for EDA

## Preprocess Data

In [10]:
# Map every transaction id to its row position in the feature table; that
# position is the node index used when the edge tensor is built.
tx_id_mapping = {tx_id: idx for idx, tx_id in enumerate(elliptic_txs_features['txId'])}

# Keep only edges whose two endpoints both have a feature row. The id column is
# handed to isin() directly rather than materialising list(tx_id_mapping.keys())
# twice; it contains exactly the keys of tx_id_mapping.
known_tx_ids = elliptic_txs_features['txId']
has_both_endpoints = (
    elliptic_txs_edgelist['txId1'].isin(known_tx_ids)
    & elliptic_txs_edgelist['txId2'].isin(known_tx_ids)
)
# .copy() yields an independent frame, so the new columns below are not
# written into a slice of the original edge list.
edges_with_features = elliptic_txs_edgelist[has_both_endpoints].copy()

# Translate both endpoints from transaction ids to node indices.
edges_with_features['Id1'] = edges_with_features['txId1'].map(tx_id_mapping)
edges_with_features['Id2'] = edges_with_features['txId2'].map(tx_id_mapping)

In [11]:
# Edge tensor: the (Id1, Id2) pairs are transposed so that row 0 holds Id1 and
# row 1 holds Id2.
edge_pairs = edges_with_features[['Id1', 'Id2']].to_numpy()
edge_index = torch.tensor(edge_pairs.T, dtype=torch.long)

# Node feature matrix: every column of the feature table except the id.
feature_matrix = elliptic_txs_features.drop(columns=['txId']).to_numpy()
node_features = torch.tensor(feature_matrix, dtype=torch.float)

In [12]:
# Encode the raw class values as integer codes and keep them as a long tensor.
# NOTE(review): labels follow the row order of the classes table while node
# features follow the row order of the features table — presumably both files
# list transactions in the same order; confirm, or align the two on txId.
raw_classes = elliptic_txs_classes['class']
le = LabelEncoder()
class_labels = le.fit_transform(raw_classes)
node_labels = torch.tensor(class_labels, dtype=torch.long)
# Decode the integer codes back to the original class values.
original_labels = le.inverse_transform(class_labels)

In [13]:
# Bundle node features, edges and labels into one PyTorch Geometric Data object.
data = Data(x=node_features, edge_index=edge_index, y=node_labels)

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [15]:
# Label codes 0 and 1 are the two known classes (licit / illicit).
known_mask = data.y.eq(0) | data.y.eq(1)
# Label code 2 marks nodes whose class is unknown.
unknown_mask = data.y.eq(2)

In [16]:
# Split the labelled nodes 80 / 10 / 10; the test split takes whatever remains
# after the truncated train and validation sizes.
num_known_nodes = int(known_mask.sum())
train_size = int(num_known_nodes * 0.8)
val_size = int(num_known_nodes * 0.1)
test_size = num_known_nodes - (train_size + val_size)

# Random ordering of the labelled nodes, sliced later into the three splits.
permutations = torch.randperm(num_known_nodes)

total = np.sum((train_size, val_size, test_size))

split_report = f"""Number of observations per split
    Training   : {train_size:10,} ({100*train_size/total:0.2f} %)
    Validation : {val_size:10,} ({100*val_size/total:0.2f} %)
    Testing    : {test_size:10,} ({100*test_size/total:0.2f} %)
"""
print(split_report)

Number of observations per split
    Training   :     37,251 (80.00 %)
    Validation :      4,656 (10.00 %)
    Testing    :      4,657 (10.00 %)



In [17]:
def _mask_from_indices(node_indices):
    """Return a boolean mask over all nodes that is True at `node_indices`."""
    mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    mask[node_indices] = True
    return mask


# Positions (in the full node list) of the nodes that carry a known label.
known_indices = torch.nonzero(known_mask, as_tuple=True)[0]
val_end = train_size + val_size

# Slice the shuffled order into consecutive train / validation / test chunks.
train_indices = known_indices[permutations[:train_size]]
val_indices = known_indices[permutations[train_size:val_end]]
test_indices = known_indices[permutations[val_end:]]

data.train_mask = _mask_from_indices(train_indices)
data.val_mask = _mask_from_indices(val_indices)
data.test_mask = _mask_from_indices(test_indices)

print(len(data.train_mask))

203769


## Graph attention network (GAT)

In [21]:
# Move the graph data object to the selected device.
data = data.to(device)

# Model checkpoints are written under this folder; create it if it is missing.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

class GAT(torch.nn.Module):
    """Two-layer graph attention network producing per-node class scores.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of each attention head in the first layer.
        output_dim: Number of scores produced per node by the second layer.
        heads: Number of attention heads in the first layer. The second layer
            takes hidden_dim * heads input features.
        dropout: Dropout probability used both inside the two GATConv layers
            and on the features fed into each of them. Defaults to 0.6, the
            value that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, heads=8, dropout=0.6):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=dropout)

    def forward(self, data):
        """Return the second layer's raw output for every node in `data`.

        Args:
            data: Object exposing `x` (node features) and `edge_index`.
        """
        x, edge_index = data.x, data.edge_index
        # Feature dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)

# Build the GAT and place it on the same device as the graph data.
gat_model = GAT(input_dim, hidden_dim, output_dim)
gat_model = gat_model.to(device)

# Adam with weight decay for the GAT parameters.
optimizer_gat = torch.optim.Adam(
    gat_model.parameters(),
    lr=0.005,
    weight_decay=5e-4,
)

# Cross-entropy between the model's raw outputs and the integer labels.
criterion = torch.nn.CrossEntropyLoss()

def train_gat():
    """Run one optimisation step of the GAT on the training nodes.

    The model is evaluated on the whole graph, but the loss only uses the
    nodes selected by `data.train_mask`.

    Returns:
        The training loss of this step as a Python float.
    """
    gat_model.train()
    optimizer_gat.zero_grad()

    train_nodes = data.train_mask
    logits = gat_model(data)
    step_loss = criterion(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer_gat.step()
    return step_loss.item()

def evaluate_gat(mask):
    """Return the GAT's accuracy on the nodes selected by `mask`.

    Args:
        mask: Boolean node mask (e.g. `data.val_mask`) choosing the nodes to score.

    Returns:
        Fraction of selected nodes whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If `mask` selects no nodes.
    """
    gat_model.eval()
    with torch.no_grad():
        predictions = gat_model(data).argmax(dim=1)
        num_hits = (predictions[mask] == data.y[mask]).sum()
        accuracy = int(num_hits) / int(mask.sum())
    return accuracy

# Train the GAT for a fixed number of epochs, keep the weights from the epoch
# with the best validation accuracy, then report test accuracy with them.
num_epochs = 200
best_val_acc_gat = 0
best_model_gat = None

start_time = time.time()
for epoch in range(1, num_epochs + 1):
    loss = train_gat()
    train_acc = evaluate_gat(data.train_mask)
    val_acc = evaluate_gat(data.val_mask)
    if val_acc > best_val_acc_gat:
        best_val_acc_gat = val_acc
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place. Clone them so this snapshot
        # really holds the best-epoch weights instead of drifting to the
        # final-epoch weights.
        best_model_gat = {
            name: tensor.detach().clone()
            for name, tensor in gat_model.state_dict().items()
        }
        torch.save(best_model_gat, os.path.join(output_dir, "gat_best_model.pt"))
    if epoch % 10 == 0:
        print(f'GAT Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
# The measured time covers training plus per-epoch evaluation and checkpoint writes.
gat_training_time = time.time() - start_time
# best_model_gat stays None if validation accuracy never rose above 0; in that
# case the final weights are evaluated as-is.
if best_model_gat is not None:
    gat_model.load_state_dict(best_model_gat)
test_acc_gat = evaluate_gat(data.test_mask)
print(f'GAT Test Accuracy: {test_acc_gat:.4f}')
print(f'GAT Training Time: {gat_training_time:.2f} seconds')



GAT Epoch: 010, Loss: 1.1814, Train Acc: 0.5571, Val Acc: 0.5533
GAT Epoch: 020, Loss: 0.8016, Train Acc: 0.7138, Val Acc: 0.7083
GAT Epoch: 030, Loss: 0.6742, Train Acc: 0.7323, Val Acc: 0.7249
GAT Epoch: 040, Loss: 0.5796, Train Acc: 0.8281, Val Acc: 0.8250
GAT Epoch: 050, Loss: 0.5292, Train Acc: 0.8432, Val Acc: 0.8393
GAT Epoch: 060, Loss: 0.4952, Train Acc: 0.8567, Val Acc: 0.8501
GAT Epoch: 070, Loss: 0.4713, Train Acc: 0.8655, Val Acc: 0.8540
GAT Epoch: 080, Loss: 0.4548, Train Acc: 0.8658, Val Acc: 0.8542
GAT Epoch: 090, Loss: 0.4361, Train Acc: 0.8837, Val Acc: 0.8752
GAT Epoch: 100, Loss: 0.4261, Train Acc: 0.8617, Val Acc: 0.8512
GAT Epoch: 110, Loss: 0.4141, Train Acc: 0.8918, Val Acc: 0.8829
GAT Epoch: 120, Loss: 0.4030, Train Acc: 0.8801, Val Acc: 0.8709
GAT Epoch: 130, Loss: 0.3898, Train Acc: 0.8982, Val Acc: 0.8922
GAT Epoch: 140, Loss: 0.3816, Train Acc: 0.9193, Val Acc: 0.9098
GAT Epoch: 150, Loss: 0.3727, Train Acc: 0.9036, Val Acc: 0.8984
GAT Epoch: 160, Loss: 0.3

## GraphSAGE

In [22]:
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network producing per-node class scores.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of the first SAGEConv layer.
        output_dim: Number of scores produced per node by the second layer.
        dropout: Dropout probability applied between the two layers.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the second layer's raw output for every node in `data`.

        Args:
            data: Object exposing `x` (node features) and `edge_index`.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)

# Build the GraphSAGE model and place it on the same device as the graph data.
sage_model = GraphSAGE(input_dim, hidden_dim, output_dim)
sage_model = sage_model.to(device)

# Adam with weight decay for the GraphSAGE parameters.
optimizer_sage = torch.optim.Adam(
    sage_model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

# Cross-entropy between the model's raw outputs and the integer labels.
criterion = torch.nn.CrossEntropyLoss()

def train_sage():
    """Run one optimisation step of GraphSAGE on the training nodes.

    The model is evaluated on the whole graph, but the loss only uses the
    nodes selected by `data.train_mask`.

    Returns:
        The training loss of this step as a Python float.
    """
    sage_model.train()
    optimizer_sage.zero_grad()

    train_nodes = data.train_mask
    logits = sage_model(data)
    step_loss = criterion(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer_sage.step()
    return step_loss.item()

def evaluate_sage(mask):
    """Return GraphSAGE's accuracy on the nodes selected by `mask`.

    Args:
        mask: Boolean node mask (e.g. `data.val_mask`) choosing the nodes to score.

    Returns:
        Fraction of selected nodes whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If `mask` selects no nodes.
    """
    sage_model.eval()
    with torch.no_grad():
        predictions = sage_model(data).argmax(dim=1)
        num_hits = (predictions[mask] == data.y[mask]).sum()
        accuracy = int(num_hits) / int(mask.sum())
    return accuracy

# Train GraphSAGE for a fixed number of epochs, keep the weights from the epoch
# with the best validation accuracy, then report test accuracy with them.
num_epochs = 200
best_val_acc_sage = 0
best_model_sage = None

start_time = time.time()
for epoch in range(1, num_epochs + 1):
    loss = train_sage()
    train_acc = evaluate_sage(data.train_mask)
    val_acc = evaluate_sage(data.val_mask)
    if val_acc > best_val_acc_sage:
        best_val_acc_sage = val_acc
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place. Clone them so this snapshot
        # really holds the best-epoch weights instead of drifting to the
        # final-epoch weights.
        best_model_sage = {
            name: tensor.detach().clone()
            for name, tensor in sage_model.state_dict().items()
        }
        # Save checkpoint in output folder
        torch.save(best_model_sage, os.path.join(output_dir, "sage_best_model.pt"))
    if epoch % 10 == 0:
        print(f'GraphSAGE Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
# The measured time covers training plus per-epoch evaluation and checkpoint writes.
sage_training_time = time.time() - start_time
# best_model_sage stays None if validation accuracy never rose above 0; in that
# case the final weights are evaluated as-is.
if best_model_sage is not None:
    sage_model.load_state_dict(best_model_sage)
test_acc_sage = evaluate_sage(data.test_mask)
print(f'GraphSAGE Test Accuracy: {test_acc_sage:.4f}')
print(f'GraphSAGE Training Time: {sage_training_time:.2f} seconds')

GraphSAGE Epoch: 010, Loss: 0.2360, Train Acc: 0.9227, Val Acc: 0.9180
GraphSAGE Epoch: 020, Loss: 0.1739, Train Acc: 0.9411, Val Acc: 0.9369
GraphSAGE Epoch: 030, Loss: 0.1480, Train Acc: 0.9621, Val Acc: 0.9620
GraphSAGE Epoch: 040, Loss: 0.1335, Train Acc: 0.9653, Val Acc: 0.9637
GraphSAGE Epoch: 050, Loss: 0.1222, Train Acc: 0.9691, Val Acc: 0.9693
GraphSAGE Epoch: 060, Loss: 0.1146, Train Acc: 0.9725, Val Acc: 0.9729
GraphSAGE Epoch: 070, Loss: 0.1061, Train Acc: 0.9748, Val Acc: 0.9747
GraphSAGE Epoch: 080, Loss: 0.1012, Train Acc: 0.9763, Val Acc: 0.9762
GraphSAGE Epoch: 090, Loss: 0.0970, Train Acc: 0.9771, Val Acc: 0.9766
GraphSAGE Epoch: 100, Loss: 0.0916, Train Acc: 0.9788, Val Acc: 0.9770
GraphSAGE Epoch: 110, Loss: 0.0906, Train Acc: 0.9788, Val Acc: 0.9770
GraphSAGE Epoch: 120, Loss: 0.0874, Train Acc: 0.9794, Val Acc: 0.9770
GraphSAGE Epoch: 130, Loss: 0.0857, Train Acc: 0.9802, Val Acc: 0.9785
GraphSAGE Epoch: 140, Loss: 0.0823, Train Acc: 0.9805, Val Acc: 0.9796
GraphS