# Base Models for BTCGraphGuard

**Authors: Xuhui Zhan, Tianhao Qu, Siyu Yang**


## Import Libraries

In [28]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import networkx as nx
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import from_networkx
import time

In [3]:
# Prepare Data
data_root = 'data/data/elliptic_bitcoin_dataset'
elliptic_txs_features = pd.read_csv(os.path.join(data_root, 'elliptic_txs_features.csv'), header=None)
elliptic_txs_edgelist = pd.read_csv(os.path.join(data_root, 'elliptic_txs_edgelist.csv'))
elliptic_txs_classes = pd.read_csv(os.path.join(data_root, 'elliptic_txs_classes.csv'))

elliptic_txs_features.columns = ['txId'] + [f'V{i}' for i in range(1, 167)]


In [4]:
print(elliptic_txs_features.shape)
print(elliptic_txs_edgelist.shape)
print(elliptic_txs_classes.shape)


(203769, 167)
(234355, 2)
(203769, 2)


In [7]:
elliptic_txs_classes['class_mapped'] = elliptic_txs_classes['class'].replace({'1': 'illicit', '2': 'licit'})

In [8]:
# Create Graph
G = nx.from_pandas_edgelist(elliptic_txs_edgelist, 'txId1', 'txId2')

## Random seed settings

In [9]:
RANDOM_STATE = 42
NUM_EPOCHS = 30

In [10]:
def set_seed_for_torch(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)      # For single-GPU.
        torch.cuda.manual_seed_all(seed)  # For multi-GPU.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
def set_seed_for_numpy(seed):
    np.random.seed(seed) 
    
def set_seed_for_random(seed):
    random.seed(seed)  

In [11]:
set_seed_for_torch(RANDOM_STATE)
set_seed_for_numpy(RANDOM_STATE)
set_seed_for_random(RANDOM_STATE)

## EDA

In [9]:
# Spaceholders for EDA

## Preprocess Data

In [17]:
tx_id_mapping = {tx_id: idx for idx, tx_id in enumerate(elliptic_txs_features['txId'])}

# Create an explicit copy of the filtered DataFrame
edges_with_features = elliptic_txs_edgelist[elliptic_txs_edgelist['txId1'].isin(list(tx_id_mapping.keys())) & 
                                           elliptic_txs_edgelist['txId2'].isin(list(tx_id_mapping.keys()))].copy()

# Now use loc to set values (though with copy() above, direct assignment would also work)
edges_with_features.loc[:, 'Id1'] = edges_with_features['txId1'].map(tx_id_mapping)
edges_with_features.loc[:, 'Id2'] = edges_with_features['txId2'].map(tx_id_mapping)

In [18]:
edge_index = torch.tensor(edges_with_features[['Id1', 'Id2']].values.T, dtype=torch.long)
node_features = torch.tensor(elliptic_txs_features.drop(columns=['txId']).values, 
                             dtype=torch.float)

In [19]:
le = LabelEncoder()
class_labels = le.fit_transform(elliptic_txs_classes['class'])
node_labels = torch.tensor(class_labels, dtype=torch.long)
original_labels = le.inverse_transform(class_labels)

In [20]:
data = Data(x=node_features, 
            edge_index=edge_index, 
            y=node_labels)

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
known_mask   = (data.y == 0) | (data.y == 1)  # Only nodes with known labels licit or illicit
unknown_mask = data.y == 2                    # Nodes with unknown labels

In [23]:
num_known_nodes = known_mask.sum().item()
permutations = torch.randperm(num_known_nodes)
train_size = int(0.8 * num_known_nodes)
val_size = int(0.1 * num_known_nodes)
test_size = num_known_nodes - train_size - val_size

total = np.sum([train_size, val_size, test_size])

print(f"""Number of observations per split
    Training   : {train_size:10,} ({100*train_size/total:0.2f} %)
    Validation : {val_size:10,} ({100*val_size/total:0.2f} %)
    Testing    : {test_size:10,} ({100*test_size/total:0.2f} %)
""")

Number of observations per split
    Training   :     37,251 (80.00 %)
    Validation :      4,656 (10.00 %)
    Testing    :      4,657 (10.00 %)


In [24]:
data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

train_indices = known_mask.nonzero(as_tuple=True)[0][permutations[:train_size]]
val_indices = known_mask.nonzero(as_tuple=True)[0][permutations[train_size:train_size + val_size]]
test_indices = known_mask.nonzero(as_tuple=True)[0][permutations[train_size + val_size:]]

data.train_mask[train_indices] = True
data.val_mask[val_indices] = True
data.test_mask[test_indices] = True

print(len(data.train_mask))

203769


## Graph attention network (GAT)

In [26]:
# Create output directory if it doesn't exist
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
data = data.to(device)

class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=8):
        super(GAT, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=0.6)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return x
    
def infer_model_dimensions(features_df, classes_df):
    input_dim = features_df.shape[1] - 1

    hidden_dim = 64

    output_dim = classes_df['class'].dropna().nunique()

    return input_dim, hidden_dim, output_dim
features_df = pd.read_csv(os.path.join(data_root, 'elliptic_txs_features.csv'), header=None)
classes_df = pd.read_csv(os.path.join(data_root, 'elliptic_txs_classes.csv'))

input_dim, hidden_dim, output_dim = infer_model_dimensions(features_df,classes_df)
gat_model = GAT(input_dim, hidden_dim, output_dim).to(device)
optimizer_gat = torch.optim.Adam(gat_model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train_gat():
    gat_model.train()
    optimizer_gat.zero_grad()
    out = gat_model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer_gat.step()
    return loss.item()

def evaluate_gat(mask):
    gat_model.eval()
    with torch.no_grad():
        out = gat_model(data)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data.y[mask]
        acc = int(correct.sum()) / int(mask.sum())
    return acc

num_epochs = 200
best_val_acc_gat = 0
best_model_gat = None

start_time = time.time()
for epoch in range(1, num_epochs + 1):
    loss = train_gat()
    train_acc = evaluate_gat(data.train_mask)
    val_acc = evaluate_gat(data.val_mask)
    if val_acc > best_val_acc_gat:
        best_val_acc_gat = val_acc
        best_model_gat = gat_model.state_dict()
        torch.save(best_model_gat, os.path.join(output_dir, "gat_best_model.pt"))
    if epoch % 10 == 0:
        print(f'GAT Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
gat_training_time = time.time() - start_time
gat_model.load_state_dict(best_model_gat)
test_acc_gat = evaluate_gat(data.test_mask)
print(f'GAT Test Accuracy: {test_acc_gat:.4f}')
print(f'GAT Training Time: {gat_training_time:.2f} seconds')



GAT Epoch: 010, Loss: 1.2484, Train Acc: 0.8938, Val Acc: 0.8868
GAT Epoch: 020, Loss: 0.9801, Train Acc: 0.5166, Val Acc: 0.5174
GAT Epoch: 030, Loss: 0.8122, Train Acc: 0.6572, Val Acc: 0.6486
GAT Epoch: 040, Loss: 0.6785, Train Acc: 0.7021, Val Acc: 0.7012
GAT Epoch: 050, Loss: 0.6059, Train Acc: 0.7289, Val Acc: 0.7262
GAT Epoch: 060, Loss: 0.5523, Train Acc: 0.7662, Val Acc: 0.7625
GAT Epoch: 070, Loss: 0.5170, Train Acc: 0.7904, Val Acc: 0.7872
GAT Epoch: 080, Loss: 0.4868, Train Acc: 0.8026, Val Acc: 0.8015
GAT Epoch: 090, Loss: 0.4698, Train Acc: 0.8352, Val Acc: 0.8295
GAT Epoch: 100, Loss: 0.4497, Train Acc: 0.8654, Val Acc: 0.8570
GAT Epoch: 110, Loss: 0.4336, Train Acc: 0.8882, Val Acc: 0.8802
GAT Epoch: 120, Loss: 0.4179, Train Acc: 0.8761, Val Acc: 0.8632
GAT Epoch: 130, Loss: 0.4059, Train Acc: 0.9075, Val Acc: 0.8988
GAT Epoch: 140, Loss: 0.3913, Train Acc: 0.9165, Val Acc: 0.9068
GAT Epoch: 150, Loss: 0.3875, Train Acc: 0.9171, Val Acc: 0.9087
GAT Epoch: 160, Loss: 0.3

## GraphSAGE

In [27]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

sage_model = GraphSAGE(input_dim, hidden_dim, output_dim).to(device)
optimizer_sage = torch.optim.Adam(sage_model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train_sage():
    sage_model.train()
    optimizer_sage.zero_grad()
    out = sage_model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer_sage.step()
    return loss.item()

def evaluate_sage(mask):
    sage_model.eval()
    with torch.no_grad():
        out = sage_model(data)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data.y[mask]
        acc = int(correct.sum()) / int(mask.sum())
    return acc

num_epochs = 200
best_val_acc_sage = 0
best_model_sage = None

start_time = time.time()
for epoch in range(1, num_epochs + 1):
    loss = train_sage()
    train_acc = evaluate_sage(data.train_mask)
    val_acc = evaluate_sage(data.val_mask)
    if val_acc > best_val_acc_sage:
        best_val_acc_sage = val_acc
        best_model_sage = sage_model.state_dict()
        # Save checkpoint in output folder
        torch.save(best_model_sage, os.path.join(output_dir, "sage_best_model.pt"))
    if epoch % 10 == 0:
        print(f'GraphSAGE Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
sage_training_time = time.time() - start_time
sage_model.load_state_dict(best_model_sage)
test_acc_sage = evaluate_sage(data.test_mask)
print(f'GraphSAGE Test Accuracy: {test_acc_sage:.4f}')
print(f'GraphSAGE Training Time: {sage_training_time:.2f} seconds')

GraphSAGE Epoch: 010, Loss: 0.2511, Train Acc: 0.9148, Val Acc: 0.9055
GraphSAGE Epoch: 020, Loss: 0.1781, Train Acc: 0.9390, Val Acc: 0.9317
GraphSAGE Epoch: 030, Loss: 0.1513, Train Acc: 0.9582, Val Acc: 0.9570
GraphSAGE Epoch: 040, Loss: 0.1333, Train Acc: 0.9658, Val Acc: 0.9669
GraphSAGE Epoch: 050, Loss: 0.1209, Train Acc: 0.9695, Val Acc: 0.9712
GraphSAGE Epoch: 060, Loss: 0.1124, Train Acc: 0.9723, Val Acc: 0.9727
GraphSAGE Epoch: 070, Loss: 0.1062, Train Acc: 0.9739, Val Acc: 0.9749
GraphSAGE Epoch: 080, Loss: 0.1007, Train Acc: 0.9757, Val Acc: 0.9757
GraphSAGE Epoch: 090, Loss: 0.0956, Train Acc: 0.9768, Val Acc: 0.9768
GraphSAGE Epoch: 100, Loss: 0.0914, Train Acc: 0.9778, Val Acc: 0.9779
GraphSAGE Epoch: 110, Loss: 0.0899, Train Acc: 0.9788, Val Acc: 0.9779
GraphSAGE Epoch: 120, Loss: 0.0861, Train Acc: 0.9798, Val Acc: 0.9792
GraphSAGE Epoch: 130, Loss: 0.0841, Train Acc: 0.9803, Val Acc: 0.9787
GraphSAGE Epoch: 140, Loss: 0.0796, Train Acc: 0.9806, Val Acc: 0.9792
GraphS

# Subgraph Analysis Pipeline

In [None]:
# ----------- Subgraph Analysis Pipeline (Based on model output) -----------
import pandas as pd
import torch
from torch_geometric.utils import to_networkx
import networkx as nx

# Subgraph property analysis function
def analyze_subgraph_properties(subgraph_nodes, model_output):
    preds = model_output[subgraph_nodes].argmax(dim=1).cpu()

    illicit_ratio = (preds == 1).sum().item() / len(preds)
    licit_ratio = (preds == 2).sum().item() / len(preds)

    return {
        'num_nodes': len(subgraph_nodes),
        'illicit_ratio': illicit_ratio,
        'licit_ratio': licit_ratio
    }

# Full pipeline execution based on model output
def pipeline(data, model, device='cpu', min_nodes=5):
    model.eval()
    data = data.to(device)

    # Run model on full data
    with torch.no_grad():
        output = model(data)

    # Convert to NetworkX graph
    G_nx = to_networkx(data, to_undirected=True)

    # Identify connected components
    subgraphs = list(nx.connected_components(G_nx))

    results = []
    for nodes in subgraphs:
        if len(nodes) < min_nodes:
            continue
        try:
            node_indices = torch.tensor(list(nodes), dtype=torch.long)
            res = analyze_subgraph_properties(node_indices, output)
            results.append(res)
        except Exception as e:
            print(f"Skipping subgraph due to error: {e}")

    return pd.DataFrame(results)


## Analysis for GAT

In [32]:
results_df_GAT = pipeline(data, gat_model, device=device)

# -------- Investigate Subgraph Patterns --------
print("\nSubgraph Statistics Summary:\n")
print("Total subgraphs:", len(results_df_GAT))
print("Average illicit ratio:", results_df_GAT['illicit_ratio'].mean())
print("Average licit ratio:", results_df_GAT['licit_ratio'].mean())
print("Average subgraph size:", results_df_GAT['num_nodes'].mean())

print("\nDistribution of subgraph sizes:")
print(results_df_GAT['num_nodes'].describe())

print("\nTop 5 subgraphs with highest illicit ratio:")
print(results_df_GAT.sort_values('illicit_ratio', ascending=False).head())

print("\nTop 5 subgraphs with largest size:")
print(results_df_GAT.sort_values('num_nodes', ascending=False).head())


Subgraph Statistics Summary:

Total subgraphs: 49
Average illicit ratio: 0.9851612683412899
Average licit ratio: 0.0
Average subgraph size: 4158.551020408163

Distribution of subgraph sizes:
count      49.000000
mean     4158.551020
std      1592.470592
min      1089.000000
25%      2891.000000
50%      4291.000000
75%      5121.000000
max      7880.000000
Name: num_nodes, dtype: float64

Top 5 subgraphs with highest illicit ratio:
    num_nodes  illicit_ratio  licit_ratio
2        6621       0.999849          0.0
0        7880       0.999492          0.0
3        5693       0.999297          0.0
1        4544       0.999120          0.0
15       2975       0.998655          0.0

Top 5 subgraphs with largest size:
    num_nodes  illicit_ratio  licit_ratio
0        7880       0.999492          0.0
41       7140       0.981933          0.0
4        6803       0.996619          0.0
9        6727       0.996284          0.0
2        6621       0.999849          0.0


Across the 49 connected subgraphs analyzed, the average illicit ratio is an exceptionally high 98.5%, with the licit ratio remaining at 0%. This indicates that nearly every node across all subgraphs is being classified as illicit by the GAT model. The average subgraph contains about 4,158 nodes, with the largest one having 7,880 nodes and the smallest 1,089. The top five subgraphs with the highest illicit ratios are all above 99.9%, suggesting near-total classification confidence by the model in labeling nodes as illicit. Notably, these top subgraphs are also among the largest by size, further amplifying their influence on the model’s output distribution.

This extreme prediction skew implies that the GAT model has likely overfit to the illicit class, potentially due to training data imbalance or lack of expressive power to separate classes effectively. It may also reflect graph structural patterns where large components have highly correlated node labels. However, the complete absence of licit classifications is a red flag for model generalization and interpretability, and it calls for a deeper audit of the model’s training process, loss weighting strategy, and evaluation on balanced test data.

## Analysis for Sega Graph


In [31]:
results_df_sage = pipeline(data, sage_model, device=device)

# -------- Investigate Subgraph Patterns --------
print("\nSubgraph Statistics Summary:\n")
print("Total subgraphs:", len(results_df_sage))
print("Average illicit ratio:", results_df_sage['illicit_ratio'].mean())
print("Average licit ratio:", results_df_sage['licit_ratio'].mean())
print("Average subgraph size:", results_df_sage['num_nodes'].mean())

print("\nDistribution of subgraph sizes:")
print(results_df_sage['num_nodes'].describe())

print("\nTop 5 subgraphs with highest illicit ratio:")
print(results_df_sage.sort_values('illicit_ratio', ascending=False).head())

print("\nTop 5 subgraphs with largest size:")
print(results_df_sage.sort_values('num_nodes', ascending=False).head())


Subgraph Statistics Summary:

Total subgraphs: 49
Average illicit ratio: 0.9254265861926002
Average licit ratio: 0.0
Average subgraph size: 4158.551020408163

Distribution of subgraph sizes:
count      49.000000
mean     4158.551020
std      1592.470592
min      1089.000000
25%      2891.000000
50%      4291.000000
75%      5121.000000
max      7880.000000
Name: num_nodes, dtype: float64

Top 5 subgraphs with highest illicit ratio:
    num_nodes  illicit_ratio  licit_ratio
2        6621       0.994261          0.0
0        7880       0.994036          0.0
1        4544       0.989217          0.0
3        5693       0.988582          0.0
44       5598       0.987853          0.0

Top 5 subgraphs with largest size:
    num_nodes  illicit_ratio  licit_ratio
0        7880       0.994036          0.0
41       7140       0.935994          0.0
4        6803       0.982361          0.0
9        6727       0.931768          0.0
2        6621       0.994261          0.0


The analysis of the subgraph statistics reveals several notable patterns. Among the 49 connected subgraphs extracted from the graph, the average subgraph size is approximately 4,158 nodes, with the largest subgraph containing 7,880 nodes and the smallest having 1,089. Most strikingly, the average illicit ratio across all subgraphs is extremely high at approximately 92.5%, while the average licit ratio is effectively 0%. This indicates a strong imbalance in the model's output predictions, heavily skewed toward the "illicit" class. The top five subgraphs with the highest illicit ratios all exceed 98%, and they are also among the largest in terms of node count. This suggests that the largest components are not only dominant in size but are also classified almost entirely as illicit. Such a distribution might reflect the structure of the training data, potential class imbalance, or an overfitting tendency of the model to predict the illicit class. Further investigation is needed to verify whether this is due to real-world network patterns or modeling bias.