In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df_wallets_features = pd.read_csv("./dataSet/wallets_features.csv")
print("\nActors features: \n",df_wallets_features.head())

df_wallets_classes = pd.read_csv("./dataSet/wallets_classes.csv")
print("\nActors Classes: \n",df_wallets_classes.head())

df_AddrAddr_edgelist = pd.read_csv("./dataSet/AddrAddr_edgelist.csv")
print("\nAddress-Address edgelist: \n",df_AddrAddr_edgelist.head())

df_AddrTx_edgelist = pd.read_csv("./dataSet/AddrTx_edgelist.csv")
print("\nAddress-Transaction edgelist: \n",df_AddrTx_edgelist.head())

df_TxAddr_edgelist = pd.read_csv("./dataSet/TxAddr_edgelist.csv")
print("\nTransaction-Address edgelist: \n",df_TxAddr_edgelist.head())

print("\n")
df_classes = pd.read_csv("./dataSet/txs_classes.csv")
df_edges = pd.read_csv("./dataSet/txs_edgelist.csv")
df_features = pd.read_csv("./dataSet/txs_features.csv", header=None)
df_wallets_features_classes_combined = pd.read_csv("./dataSet/wallets_features_classes_combined.csv")


Actors features: 
                              address  Time step  num_txs_as_sender  \
0  111112TykSw72ztDN2WJger4cynzWYC5w         25                0.0   
1  1111DAYXhoxZx2tsRnzimfozo783x1yC2         25                0.0   
2  1111DAYXhoxZx2tsRnzimfozo783x1yC2         29                0.0   
3  1111DAYXhoxZx2tsRnzimfozo783x1yC2         39                0.0   
4  1111DAYXhoxZx2tsRnzimfozo783x1yC2         39                0.0   

   num_txs_as receiver  first_block_appeared_in  last_block_appeared_in  \
0                  1.0                 439586.0                439586.0   
1                  8.0                 439589.0                485959.0   
2                  8.0                 439589.0                485959.0   
3                  8.0                 439589.0                485959.0   
4                  8.0                 439589.0                485959.0   

   lifetime_in_blocks  total_txs  first_sent_block  first_received_block  ...  \
0                 0.0      

  df_features = pd.read_csv("./dataSet/txs_features.csv", header=None)


In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import networkx as nx
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
# Merge class labels with features
df_wallets_classes.loc[df_wallets_classes['class'] == 'unknown', 'class'] = 3
df_class_feature = pd.merge(df_wallets_classes, df_wallets_features, on='address')

# Prepare data for Graph Construction (using wallet interactions from Address-Address edgelist)
# We'll only select wallets from the class that are not 'unknown'
selected_wallets = df_class_feature.loc[(df_class_feature['class'] != 3), 'address']
df_AddrAddr_edgelist_selected = df_AddrAddr_edgelist.loc[
    df_AddrAddr_edgelist['input_address'].isin(selected_wallets) &
    df_AddrAddr_edgelist['output_address'].isin(selected_wallets)
]

# Select wallet classes and features
df_wallets_classes_selected = df_wallets_classes.loc[df_wallets_classes['address'].isin(selected_wallets)]
df_wallets_features_selected = df_wallets_features.loc[df_wallets_features['address'].isin(selected_wallets)]
# Merge class and features for selected wallets
df_class_feature_selected = pd.merge(df_wallets_classes_selected, df_wallets_features_selected, on='address')

# Ensure class labels are correctly mapped (0 for licit, 1 for illicit)
df_class_feature_selected['class'] = df_class_feature_selected['class'].apply(lambda x: 0 if x == 2 else 1)



# Prepare feature matrix X and labels y as before
scaler = StandardScaler()
X = df_class_feature_selected.drop(columns=['address', 'class', 'Time step']).values
X = scaler.fit_transform(X)  # Normalize the feature columns

y = df_class_feature_selected['class'].values

# --- Apply SMOTE to handle class imbalance ---
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# --- Compute class weights for loss function ---
class_weights = compute_class_weight('balanced', classes=np.unique(y_resampled), y=y_resampled)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# --- Convert to PyTorch tensors ---
x = torch.tensor(X_resampled, dtype=torch.float)
y = torch.tensor(y_resampled, dtype=torch.long)

# --- Graph construction and data preparation ---
graph = nx.from_pandas_edgelist(df_AddrAddr_edgelist_selected, source='input_address', target='output_address', create_using=nx.DiGraph())

# Create a mapping from address to node index
address_to_index = {address: idx for idx, address in enumerate(df_class_feature_selected['address'].unique())}

# Map the addresses in the edges to their corresponding indices
edges_as_indices = [(address_to_index[row[0]], address_to_index[row[1]]) for row in graph.edges]

# Convert edge indices to PyTorch tensor
edge_index = torch.tensor(np.array(edges_as_indices).T, dtype=torch.long)

# Prepare data for PyTorch Geometric
data = Data(x=x, edge_index=edge_index, y=y)

In [5]:
data

Data(x=[677742, 55], edge_index=[2, 1092323], y=[677742])

In [6]:
# --- Split data into train and test ---
train_mask, test_mask = train_test_split(range(data.num_nodes), test_size=0.25, random_state=15)
data.train_mask = torch.tensor(train_mask, dtype=torch.long)
data.test_mask = torch.tensor(test_mask, dtype=torch.long)

# --- Define the Graph Neural Network (GNN) model ---
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GraphConv(input_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# --- Initialize model, optimizer, and loss function ---
input_dim = X_resampled.shape[1]
hidden_dim = 16
output_dim = 2  # Binary classification: 0 for licit, 1 for illicit

model = GNN(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# --- Training function ---
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask], weight=class_weights_tensor)
    loss.backward()
    optimizer.step()
    return loss.item()

# --- Test function ---
def test():
    model.eval()
    _, pred = model(data).max(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    test_acc = int(test_correct.sum()) / len(data.test_mask)
    return test_acc, pred

In [7]:
# --- Training loop ---
losses = []
accuracies = []
for epoch in range(1, 101):
    loss = train()
    losses.append(loss)
    if epoch % 10 == 0:
        acc, _ = test()
        accuracies.append(acc)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {acc:.4f}')

Epoch: 010, Loss: 0.5414, Test Acc: 0.7823
Epoch: 020, Loss: 0.4769, Test Acc: 0.8108
Epoch: 030, Loss: 0.4550, Test Acc: 0.8224
Epoch: 040, Loss: 0.4349, Test Acc: 0.8303
Epoch: 050, Loss: 0.4220, Test Acc: 0.8366
Epoch: 060, Loss: 0.4132, Test Acc: 0.8415
Epoch: 070, Loss: 0.4050, Test Acc: 0.8476
Epoch: 080, Loss: 0.3987, Test Acc: 0.8531
Epoch: 090, Loss: 0.3920, Test Acc: 0.8592
Epoch: 100, Loss: 0.3877, Test Acc: 0.8625


In [9]:
# --- Evaluate metrics ---
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def evaluate_metrics():
    model.eval()
    _, pred = model(data).max(dim=1)
    pred = pred[data.test_mask].detach().numpy()
    true_labels = data.y[data.test_mask].detach().numpy()

    accuracy = accuracy_score(true_labels, pred)
    precision = precision_score(true_labels, pred)
    recall = recall_score(true_labels, pred)
    f1 = f1_score(true_labels, pred)
    auc = roc_auc_score(true_labels, pred)
    cm = confusion_matrix(true_labels, pred)

    return accuracy, precision, recall, f1, auc, cm

accuracy, precision, recall, f1, auc, cm = evaluate_metrics()

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")

Accuracy: 0.8625
Precision: 0.8537
Recall: 0.8740
F1 Score: 0.8637
AUC Score: 0.8625
Confusion Matrix:
[[72316 12656]
 [10639 73825]]


In [10]:
pip install optuna


Note: you may need to restart the kernel to use updated packages.


In [11]:
import optuna
import torch.optim as optim

# Modify the GNN model class to accept variable hyperparameters
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate):
        super(GNN, self).__init__()
        self.conv1 = GraphConv(input_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, output_dim)
        self.dropout_rate = dropout_rate

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout_rate, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Define the objective function for hyperparameter tuning
def objective(trial):
    # Suggest hyperparameters using optuna
    hidden_dim = trial.suggest_int("hidden_dim", 8, 128)
    dropout_rate = trial.suggest_float("dropout_rate", 0.2, 0.6)
    lr = trial.suggest_loguniform("lr", 1e-5, 1e-2)
    weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-2)

    # Initialize model with the suggested hyperparameters
    model = GNN(input_dim=X_resampled.shape[1], hidden_dim=hidden_dim, output_dim=2, dropout_rate=dropout_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Training loop
    for epoch in range(1, 101):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask], weight=class_weights_tensor)
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    accuracy, _ = test()

    # Return the accuracy (higher is better)
    return accuracy

# Perform hyperparameter tuning with optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print the best hyperparameters found
print("Best hyperparameters: ", study.best_params)


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-12-09 07:25:11,545] A new study created in memory with name: no-name-a7f33b07-3b89-4e29-a954-8a8ca0925535
  lr = trial.suggest_loguniform("lr", 1e-5, 1e-2)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-2)
[I 2024-12-09 07:29:59,782] Trial 0 finished with value: 0.8625144597370098 and parameters: {'hidden_dim': 114, 'dropout_rate': 0.4742848523027484, 'lr': 0.001129013407098167, 'weight_decay': 5.291107556865043e-05}. Best is trial 0 with value: 0.8625144597370098.
[I 2024-12-09 07:32:28,249] Trial 1 finished with value: 0.8625144597370098 and parameters: {'hidden_dim': 47, 'dropout_rate': 0.3498727412264184, 'lr': 0.007154787321428317, 'weight_decay': 0.0003776179421294593}. Best is trial 0 with value: 0.8625144597370098.
[I 2024-12-09 07:34:31,739] Trial 2 finished with value: 0.8625144597370098 and parameters: {'hidden_dim': 39, 'dropout_rate': 0.4851871918068645, 'lr': 0.0001132081062855218, 'weight_decay

Best hyperparameters:  {'hidden_dim': 114, 'dropout_rate': 0.4742848523027484, 'lr': 0.001129013407098167, 'weight_decay': 5.291107556865043e-05}
