In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df_wallets_features = pd.read_csv("DL_Elliptic_Dataset/wallets_features.csv")
print("\nActors features: \n",df_wallets_features.head())

df_wallets_classes = pd.read_csv("DL_Elliptic_Dataset/wallets_classes.csv")
print("\nActors Classes: \n",df_wallets_classes.head())

df_AddrAddr_edgelist = pd.read_csv("DL_Elliptic_Dataset/AddrAddr_edgelist.csv")
print("\nAddress-Address edgelist: \n",df_AddrAddr_edgelist.head())

df_AddrTx_edgelist = pd.read_csv("DL_Elliptic_Dataset/AddrTx_edgelist.csv")
print("\nAddress-Transaction edgelist: \n",df_AddrTx_edgelist.head())

df_TxAddr_edgelist = pd.read_csv("DL_Elliptic_Dataset/TxAddr_edgelist.csv")
print("\nTransaction-Address edgelist: \n",df_TxAddr_edgelist.head())

print("\n")
df_classes = pd.read_csv("DL_Elliptic_Dataset/txs_classes.csv")
df_edges = pd.read_csv("DL_Elliptic_Dataset/txs_edgelist.csv")
df_features = pd.read_csv("DL_Elliptic_Dataset/txs_features.csv", header=None)
df_wallets_features_classes_combined = pd.read_csv("DL_Elliptic_Dataset/wallets_features_classes_combined.csv")


Actors features: 
                              address  Time step  num_txs_as_sender  \
0  111112TykSw72ztDN2WJger4cynzWYC5w         25                0.0   
1  1111DAYXhoxZx2tsRnzimfozo783x1yC2         25                0.0   
2  1111DAYXhoxZx2tsRnzimfozo783x1yC2         29                0.0   
3  1111DAYXhoxZx2tsRnzimfozo783x1yC2         39                0.0   
4  1111DAYXhoxZx2tsRnzimfozo783x1yC2         39                0.0   

   num_txs_as receiver  first_block_appeared_in  last_block_appeared_in  \
0                  1.0                 439586.0                439586.0   
1                  8.0                 439589.0                485959.0   
2                  8.0                 439589.0                485959.0   
3                  8.0                 439589.0                485959.0   
4                  8.0                 439589.0                485959.0   

   lifetime_in_blocks  total_txs  first_sent_block  first_received_block  ...  \
0                 0.0      

  df_features = pd.read_csv("DL_Elliptic_Dataset/txs_features.csv", header=None)


In [35]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import networkx as nx
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

# Merge class labels with features
df_wallets_classes.loc[df_wallets_classes['class'] == 'unknown', 'class'] = 3
df_class_feature = pd.merge(df_wallets_classes, df_wallets_features, on='address')

# Prepare data for Graph Construction (using wallet interactions from Address-Address edgelist)
# We'll only select wallets from the class that are not 'unknown'
selected_wallets = df_class_feature.loc[(df_class_feature['class'] != 3), 'address']
df_AddrAddr_edgelist_selected = df_AddrAddr_edgelist.loc[
    df_AddrAddr_edgelist['input_address'].isin(selected_wallets) &
    df_AddrAddr_edgelist['output_address'].isin(selected_wallets)
]

# Select wallet classes and features
df_wallets_classes_selected = df_wallets_classes.loc[df_wallets_classes['address'].isin(selected_wallets)]
df_wallets_features_selected = df_wallets_features.loc[df_wallets_features['address'].isin(selected_wallets)]
# Merge class and features for selected wallets
df_class_feature_selected = pd.merge(df_wallets_classes_selected, df_wallets_features_selected, on='address')

# Ensure class labels are correctly mapped (0 for licit, 1 for illicit)
df_class_feature_selected['class'] = df_class_feature_selected['class'].apply(lambda x: 0 if x == 2 else 1)

# Feature normalization using StandardScaler
scaler = StandardScaler()
X = df_class_feature_selected.drop(columns=['address', 'class', 'Time step']).values
X = scaler.fit_transform(X)  # Normalize the feature columns

# Extract labels (y)
y = df_class_feature_selected['class'].values

# Compute class weights for imbalanced classes
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Convert to PyTorch tensors
x = torch.tensor(X, dtype=torch.float)
y = torch.tensor(y, dtype=torch.long)

# Prepare edge list for graph (convert addresses to node indices)
graph = nx.from_pandas_edgelist(df_AddrAddr_edgelist_selected, source='input_address', target='output_address', create_using=nx.DiGraph())

# Create a mapping from address to node index
address_to_index = {address: idx for idx, address in enumerate(df_class_feature_selected['address'].unique())}

# Map the addresses in the edges to their corresponding indices
edges_as_indices = [(address_to_index[row[0]], address_to_index[row[1]]) for row in graph.edges]

# Convert edge indices to PyTorch tensor
edge_index = torch.tensor(np.array(edges_as_indices).T, dtype=torch.long)

# Prepare data for PyTorch Geometric
data = Data(x=x, edge_index=edge_index, y=y)

In [36]:

# Split data into train and test
train_mask, test_mask = train_test_split(range(data.num_nodes), test_size=0.25, random_state=15)
data.train_mask = torch.tensor(train_mask, dtype=torch.long)
data.test_mask = torch.tensor(test_mask, dtype=torch.long)

# Define the Graph Neural Network (GNN) model
class GNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GraphConv(input_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize model, optimizer, and loss function
input_dim = X.shape[1]
hidden_dim = 16
output_dim = 2  # Binary classification: 0 for licit, 1 for illicit

model = GNN(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Training function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask], weight=class_weights_tensor)
    loss.backward()
    optimizer.step()
    return loss.item()

# Test function
def test():
    model.eval()
    _, pred = model(data).max(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    test_acc = int(test_correct.sum()) / len(data.test_mask)
    return test_acc, pred

In [37]:

# Training loop
losses = []
accuracies = []
for epoch in range(1, 101):
    loss = train()
    losses.append(loss)
    if epoch % 10 == 0:
        acc, _ = test()
        accuracies.append(acc)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {acc:.4f}')

Epoch: 010, Loss: 1.1956, Test Acc: 0.5744
Epoch: 020, Loss: 0.6215, Test Acc: 0.7505
Epoch: 030, Loss: 0.5645, Test Acc: 0.7937
Epoch: 040, Loss: 0.5373, Test Acc: 0.8143
Epoch: 050, Loss: 0.5203, Test Acc: 0.8255
Epoch: 060, Loss: 0.5055, Test Acc: 0.8286
Epoch: 070, Loss: 0.4991, Test Acc: 0.8344
Epoch: 080, Loss: 0.4902, Test Acc: 0.8349
Epoch: 090, Loss: 0.4831, Test Acc: 0.8342
Epoch: 100, Loss: 0.4788, Test Acc: 0.8376


In [38]:

# Evaluate metrics: accuracy, precision, recall, F1, confusion matrix
def evaluate_metrics():
    model.eval()
    _, pred = model(data).max(dim=1)
    pred = pred[data.test_mask].detach().numpy()
    true_labels = data.y[data.test_mask].detach().numpy()

    accuracy = accuracy_score(true_labels, pred)
    precision = precision_score(true_labels, pred)
    recall = recall_score(true_labels, pred)
    f1 = f1_score(true_labels, pred)
    auc = roc_auc_score(true_labels, pred)
    cm = confusion_matrix(true_labels, pred)

    return accuracy, precision, recall, f1, auc, cm

# Get evaluation metrics
accuracy, precision, recall, f1, auc, cm = evaluate_metrics()

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")


Accuracy: 0.8376
Precision: 0.2960
Recall: 0.7757
F1 Score: 0.4285
AUC Score: 0.8093
Confusion Matrix:
[[71353 13305]
 [ 1617  5593]]
