In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df_wallets_features = pd.read_csv("DL_Elliptic_Dataset/wallets_features.csv")
print("\nActors features: \n",df_wallets_features.head())

df_wallets_classes = pd.read_csv("DL_Elliptic_Dataset/wallets_classes.csv")
print("\nActors Classes: \n",df_wallets_classes.head())

df_AddrAddr_edgelist = pd.read_csv("DL_Elliptic_Dataset/AddrAddr_edgelist.csv")
print("\nAddress-Address edgelist: \n",df_AddrAddr_edgelist.head())

df_AddrTx_edgelist = pd.read_csv("DL_Elliptic_Dataset/AddrTx_edgelist.csv")
print("\nAddress-Transaction edgelist: \n",df_AddrTx_edgelist.head())

df_TxAddr_edgelist = pd.read_csv("DL_Elliptic_Dataset/TxAddr_edgelist.csv")
print("\nTransaction-Address edgelist: \n",df_TxAddr_edgelist.head())

print("\n")
df_classes = pd.read_csv("DL_Elliptic_Dataset/txs_classes.csv")
df_edges = pd.read_csv("DL_Elliptic_Dataset/txs_edgelist.csv")
df_features = pd.read_csv("DL_Elliptic_Dataset/txs_features.csv", header=None)
df_wallets_features_classes_combined = pd.read_csv("DL_Elliptic_Dataset/wallets_features_classes_combined.csv")


Actors features: 
                              address  Time step  num_txs_as_sender  \
0  111112TykSw72ztDN2WJger4cynzWYC5w         25                0.0   
1  1111DAYXhoxZx2tsRnzimfozo783x1yC2         25                0.0   
2  1111DAYXhoxZx2tsRnzimfozo783x1yC2         29                0.0   
3  1111DAYXhoxZx2tsRnzimfozo783x1yC2         39                0.0   
4  1111DAYXhoxZx2tsRnzimfozo783x1yC2         39                0.0   

   num_txs_as receiver  first_block_appeared_in  last_block_appeared_in  \
0                  1.0                 439586.0                439586.0   
1                  8.0                 439589.0                485959.0   
2                  8.0                 439589.0                485959.0   
3                  8.0                 439589.0                485959.0   
4                  8.0                 439589.0                485959.0   

   lifetime_in_blocks  total_txs  first_sent_block  first_received_block  ...  \
0                 0.0      

  df_features = pd.read_csv("DL_Elliptic_Dataset/txs_features.csv", header=None)


In [2]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import networkx as nx
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

# Merge class labels with features
df_wallets_classes.loc[df_wallets_classes['class'] == 'unknown', 'class'] = 3
df_class_feature = pd.merge(df_wallets_classes, df_wallets_features, on='address')

# Prepare data for Graph Construction (using wallet interactions from Address-Address edgelist)
# We'll only select wallets from the class that are not 'unknown'
selected_wallets = df_class_feature.loc[(df_class_feature['class'] != 3), 'address']
df_AddrAddr_edgelist_selected = df_AddrAddr_edgelist.loc[
    df_AddrAddr_edgelist['input_address'].isin(selected_wallets) &
    df_AddrAddr_edgelist['output_address'].isin(selected_wallets)
]

# Select wallet classes and features
df_wallets_classes_selected = df_wallets_classes.loc[df_wallets_classes['address'].isin(selected_wallets)]
df_wallets_features_selected = df_wallets_features.loc[df_wallets_features['address'].isin(selected_wallets)]
# Merge class and features for selected wallets
df_class_feature_selected = pd.merge(df_wallets_classes_selected, df_wallets_features_selected, on='address')

# Ensure class labels are correctly mapped (0 for licit, 1 for illicit)
df_class_feature_selected['class'] = df_class_feature_selected['class'].apply(lambda x: 0 if x == 2 else 1)

# --- Load and preprocess the dataset ---
# Prepare feature matrix X and labels y as before
scaler = StandardScaler()
X = df_class_feature_selected.drop(columns=['address', 'class', 'Time step']).values
X = scaler.fit_transform(X)  # Normalize the feature columns

y = df_class_feature_selected['class'].values

# --- Apply SMOTE to handle class imbalance ---
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# --- Compute class weights for loss function ---
class_weights = compute_class_weight('balanced', classes=np.unique(y_resampled), y=y_resampled)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# --- Convert to PyTorch tensors ---
x = torch.tensor(X_resampled, dtype=torch.float)
y = torch.tensor(y_resampled, dtype=torch.long)

# --- Graph construction and data preparation ---
graph = nx.from_pandas_edgelist(df_AddrAddr_edgelist_selected, source='input_address', target='output_address', create_using=nx.DiGraph())

# Create a mapping from address to node index
address_to_index = {address: idx for idx, address in enumerate(df_class_feature_selected['address'].unique())}

# Map the addresses in the edges to their corresponding indices
edges_as_indices = [(address_to_index[row[0]], address_to_index[row[1]]) for row in graph.edges]

# Convert edge indices to PyTorch tensor
edge_index = torch.tensor(np.array(edges_as_indices).T, dtype=torch.long)

# --- Prepare data for PyTorch Geometric ---
data = Data(x=x, edge_index=edge_index, y=y)

In [3]:

# --- Split data into train and test ---
train_mask, test_mask = train_test_split(range(data.num_nodes), test_size=0.25, random_state=15)
data.train_mask = torch.tensor(train_mask, dtype=torch.long)
data.test_mask = torch.tensor(test_mask, dtype=torch.long)

# --- Define the LSTM model ---
class LSTM_Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super(LSTM_Model, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Define LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Assuming x has shape (batch_size, seq_length, input_dim)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)

        # Forward pass through LSTM layer
        out, _ = self.lstm(x, (h0, c0))

        # We take the output of the last time step (batch_size, hidden_dim)
        out = out[:, -1, :]

        # Pass through the fully connected layer
        out = self.fc(out)
        return out

# --- Initialize the LSTM model, optimizer, and loss function ---
input_dim = X_resampled.shape[1]
hidden_dim = 128
output_dim = 2  # Binary classification: 0 for licit, 1 for illicit

model = LSTM_Model(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# --- Training function ---
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x.unsqueeze(1))  # Unsqueeze to add sequence dimension (batch_size, seq_len, input_dim)
    loss = nn.CrossEntropyLoss()(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# --- Test function ---
def test():
    model.eval()
    with torch.no_grad():
        out = model(data.x.unsqueeze(1))
        _, pred = out.max(dim=1)
        test_correct = pred[data.test_mask] == data.y[data.test_mask]
        test_acc = int(test_correct.sum()) / len(data.test_mask)
    return test_acc, pred

In [4]:

# --- Training loop ---
losses = []
accuracies = []
for epoch in range(1, 101):
    loss = train()
    losses.append(loss)
    if epoch % 10 == 0:
        acc, _ = test()
        accuracies.append(acc)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {acc:.4f}')

Epoch: 010, Loss: 0.5604, Test Acc: 0.7376
Epoch: 020, Loss: 0.4517, Test Acc: 0.8023
Epoch: 030, Loss: 0.4282, Test Acc: 0.8141
Epoch: 040, Loss: 0.4101, Test Acc: 0.8111
Epoch: 050, Loss: 0.3886, Test Acc: 0.8191
Epoch: 060, Loss: 0.3757, Test Acc: 0.8273
Epoch: 070, Loss: 0.3673, Test Acc: 0.8328
Epoch: 080, Loss: 0.3585, Test Acc: 0.8424
Epoch: 090, Loss: 0.3512, Test Acc: 0.8493
Epoch: 100, Loss: 0.3461, Test Acc: 0.8526


In [5]:

# --- Evaluate metrics: accuracy, precision, recall, F1, confusion matrix ---
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def evaluate_metrics():
    model.eval()
    with torch.no_grad():
        out = model(data.x.unsqueeze(1))
        _, pred = out.max(dim=1)
        pred = pred[data.test_mask].detach().numpy()
        true_labels = data.y[data.test_mask].detach().numpy()

        accuracy = accuracy_score(true_labels, pred)
        precision = precision_score(true_labels, pred)
        recall = recall_score(true_labels, pred)
        f1 = f1_score(true_labels, pred)
        auc = roc_auc_score(true_labels, pred)
        cm = confusion_matrix(true_labels, pred)

    return accuracy, precision, recall, f1, auc, cm

# Get evaluation metrics
accuracy, precision, recall, f1, auc, cm = evaluate_metrics()

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC Score: {auc:.4f}")
print(f"Confusion Matrix:\n{cm}")


Accuracy: 0.8526
Precision: 0.8293
Recall: 0.8870
F1 Score: 0.8572
AUC Score: 0.8527
Confusion Matrix:
[[69551 15421]
 [ 9548 74916]]
