In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import math

# Read the raw transaction table (adjust the path as needed).
df = pd.read_csv('data.csv')

# Replace every categorical column with integer codes. The fitted encoders are
# kept in `label_encoders` so codes can be mapped back to the original values.
categorical_cols = [
    'Payment_type',
    'Sender_bank_location',
    'Receiver_bank_location',
    'Payment_currency',
    'Received_currency',
    'Laundering_type',
]
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder().fit(df[col])
    label_encoders[col] = le
    df[col] = le.transform(df[col])

# Give every account that appears as sender or receiver a dense integer id,
# assigned in sorted order of the account identifiers.
all_accounts = sorted(set(df['Sender_account']).union(df['Receiver_account']))
account_to_id = dict(zip(all_accounts, range(len(all_accounts))))
n = len(all_accounts)

# Account-level target: 1 when the account is on either side of at least one
# transaction flagged with Is_laundering == 1, otherwise 0.
flagged = df[df['Is_laundering'] == 1]
laundering_accounts = set(flagged['Sender_account']) | set(flagged['Receiver_account'])
labels = torch.tensor(
    [int(acc in laundering_accounts) for acc in all_accounts],
    dtype=torch.long,
)

# Build the dense adjacency matrix `adj` (n x n) and the dense edge-feature
# tensor `E` (n x n x 8) from the transaction table.
#
# Only attributes that are observable at prediction time are used as edge
# features:
#   [Amount, Payment_type, Sender_bank_location, Receiver_bank_location,
#    Payment_currency, Received_currency]
# `Laundering_type` and `Is_laundering` are deliberately NOT used: the account
# labels are derived from `Is_laundering`, so feeding either column to the
# model leaks the target (for test accounts as well).
from collections import defaultdict

EDGE_FEATURE_COLS = [
    'Amount',
    'Payment_type',
    'Sender_bank_location',
    'Receiver_bank_location',
    'Payment_currency',
    'Received_currency',
]
# The tensor width stays at 8 so the model's default edge_dim=8 still matches;
# the last two slots (formerly the label-derived columns) are left at zero.
EDGE_DIM = 8

# (sender_id, receiver_id) -> list of per-transaction feature lists.
edge_data = defaultdict(list)

# itertuples over just the needed columns yields plain tuples and avoids the
# per-row Series construction that DataFrame.iterrows performs.
edge_rows = df[['Sender_account', 'Receiver_account'] + EDGE_FEATURE_COLS].itertuples(
    index=False, name=None
)
for sender, receiver, *features in edge_rows:
    i = account_to_id[sender]
    j = account_to_id[receiver]
    edge_data[(i, j)].append(features)

# Aggregate parallel transactions between the same ordered pair by averaging
# every feature (simplification: categorical codes are averaged too).
E = torch.zeros(n, n, EDGE_DIM)
adj = torch.zeros(n, n)
num_edge_feats = len(EDGE_FEATURE_COLS)

for (i, j), feats_list in edge_data.items():
    adj[i, j] = 1
    feats = np.mean(feats_list, axis=0)
    E[i, j, :num_edge_feats] = torch.tensor(feats, dtype=torch.float)

# Standardise edge features using statistics of the real edges only. Taking
# the mean/std over all n*n slots would let the (mostly zero) non-edges
# dominate both statistics. Non-edge slots stay exactly zero.
edge_mask = adj.bool()
if edge_mask.any():
    edge_feats = E[edge_mask]  # (num_edges, EDGE_DIM)
    edge_mean = edge_feats.mean(dim=0, keepdim=True)
    # Population std (unbiased=False) so a graph with a single edge yields 0
    # rather than NaN.
    edge_std = edge_feats.std(dim=0, unbiased=False, keepdim=True)
    E[edge_mask] = (edge_feats - edge_mean) / (edge_std + 1e-6)

# Per-account input features, width 10. Only columns 0-5 are populated; the
# remaining columns stay zero:
#   0: total amount sent              1: total amount received
#   2: number of sent transactions    3: number of received transactions
#   4: summed Payment_type codes over sent transactions
#   5: summed Payment_type codes over received transactions
node_feat = torch.zeros(n, 10)
for (src, dst), txs in edge_data.items():
    tx_amounts = [tx[0] for tx in txs]
    tx_count = len(tx_amounts)
    amount_sum = sum(tx_amounts)
    # mean * count gives the summed payment-type code for this edge; it is
    # computed once and credited to both endpoints.
    ptype_sum = np.mean([tx[1] for tx in txs]) * tx_count

    # Sender side.
    node_feat[src, 0] += amount_sum
    node_feat[src, 2] += tx_count
    node_feat[src, 4] += ptype_sum

    # Receiver side.
    node_feat[dst, 1] += amount_sum
    node_feat[dst, 3] += tx_count
    node_feat[dst, 5] += ptype_sum

# Column-wise standardisation; the epsilon keeps all-zero columns at zero
# instead of dividing by zero.
col_mean = node_feat.mean(0)
col_std = node_feat.std(0)
node_feat = (node_feat - col_mean) / (col_std + 1e-6)

# For large graphs, convert to sparse if needed

# Graph Transformer Layer (updated for edge_dim=8)
class GTLayer(nn.Module):
    """Graph-transformer layer with edge-conditioned attention.

    Edge features enter in two places: a per-head additive bias on the
    attention logits, and a sigmoid gate applied to the value vectors of each
    (receiver-of-attention, neighbour) pair. Attention is restricted to the
    positions where ``adj`` is non-zero. The attention block and the
    feed-forward block are each followed by a residual connection and
    LayerNorm.

    Args:
        d_model: Width of the node representation. Must be divisible by
            ``nhead``.
        nhead: Number of attention heads.
        edge_dim: Width of the last dimension of the edge-feature tensor.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``nhead``.
    """

    def __init__(self, d_model, nhead, edge_dim=8):
        super().__init__()
        if d_model % nhead != 0:
            # Fail early with a clear message instead of a shape error inside
            # forward() when the projections are split into heads.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by nhead ({nhead})"
            )
        self.nhead = nhead
        self.d_head = d_model // nhead
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.Wo = nn.Linear(d_model, d_model)
        self.bias_proj = nn.Linear(edge_dim, nhead)
        self.gate_proj = nn.Linear(edge_dim, d_model)
        self.ffn = nn.Sequential(nn.Linear(d_model, d_model * 4), nn.GELU(), nn.Linear(d_model * 4, d_model))
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, h, E, adj):
        """Apply one round of masked, edge-aware attention plus the FFN.

        Args:
            h: Node representations, shape ``(n, d_model)``.
            E: Edge features, shape ``(n, n, edge_dim)``.
            adj: Adjacency, shape ``(n, n)``; node ``i`` attends to ``j`` only
                where ``adj[i, j] != 0``.

        Returns:
            Updated node representations, shape ``(n, d_model)``. Nodes whose
            adjacency row is entirely zero receive no attention message (only
            the output-projection bias, the residual and the FFN apply).
        """
        n = h.size(0)
        bias = self.bias_proj(E)  # (n, n, nhead)
        gate = torch.sigmoid(self.gate_proj(E)).view(n, n, self.nhead, self.d_head)
        q = self.Wq(h).view(n, self.nhead, self.d_head)
        k = self.Wk(h).view(n, self.nhead, self.d_head)
        v = self.Wv(h).view(n, self.nhead, self.d_head)

        # Scaled dot-product logits, (n, nhead, n), plus the per-head edge bias.
        att_logit = torch.einsum('nhd,mhd->nhm', q, k) / math.sqrt(self.d_head)
        att_logit = att_logit + bias.permute(0, 2, 1)

        mask = (adj == 0).unsqueeze(1)             # (n, 1, n): True where no edge
        isolated = mask.all(dim=-1, keepdim=True)  # (n, 1, 1): rows with no neighbour

        # A row that is masked everywhere would be softmax(all -inf) = NaN,
        # which then poisons every later layer and the loss. Such rows are
        # left unmasked for the softmax and zeroed out afterwards instead.
        att_logit = att_logit.masked_fill(mask & ~isolated, float('-inf'))
        att = F.softmax(att_logit, dim=-1)
        att = att.masked_fill(mask, 0.0)

        # Attention-weighted sum of gated neighbour values, per head.
        out_head = torch.einsum('nhj,jhd,njhd->nhd', att, v, gate)
        out = out_head.contiguous().view(n, -1)
        out = self.Wo(out)
        h = self.norm1(h + out)
        ffn_out = self.ffn(h)
        h = self.norm2(h + ffn_out)
        return h

class SARPredictor(nn.Module):
    """Per-node binary classifier built on a stack of ``GTLayer`` blocks.

    Pipeline: linear embedding of the node features -> ``num_layers``
    ``GTLayer`` blocks that all see the same edge features and adjacency ->
    linear head producing two logits per node.
    """

    def __init__(self, node_dim=10, edge_dim=8, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        # Sub-modules are created in embed -> layers -> head order so that
        # seeded parameter initialisation is reproducible.
        self.node_embed = nn.Linear(node_dim, d_model)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(GTLayer(d_model, nhead, edge_dim))
        self.classifier = nn.Linear(d_model, 2)

    def forward(self, node_feat, E, adj):
        """Return class logits of shape ``(num_nodes, 2)``."""
        hidden = self.node_embed(node_feat)
        for gt_layer in self.layers:
            hidden = gt_layer(hidden, E, adj)
        return self.classifier(hidden)

# Seed torch so weight initialisation is repeatable, in line with the fixed
# random_state used for the train/test split below.
torch.manual_seed(42)

# Take the input widths from the tensors themselves so the model always
# matches the features that were actually built.
model = SARPredictor(node_dim=node_feat.size(1), edge_dim=E.size(-1))

# Stratified 80/20 split over node indices, expressed as boolean masks.
indices = np.arange(n)
train_idx, test_idx = train_test_split(indices, test_size=0.2, stratify=labels.numpy(), random_state=42)
train_mask = torch.zeros(n, dtype=torch.bool)
test_mask = torch.zeros(n, dtype=torch.bool)
train_mask[train_idx] = True
test_mask[test_idx] = True

# Class weights are derived from the training labels only, so nothing about
# the held-out nodes influences the loss. The positive class is up-weighted by
# the negative/positive ratio; with no positives both classes get weight 1.
train_labels = labels[train_mask]
num_pos = int((train_labels == 1).sum())
num_neg = int((train_labels == 0).sum())
pos_weight = num_neg / num_pos if num_pos > 0 else 1.0
weights = torch.tensor([1.0, pos_weight])
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

# Full-batch training: every epoch runs the whole graph through the model and
# back-propagates the loss of the training nodes only.
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    logit = model(node_feat, E, adj)
    loss = criterion(logit[train_mask], train_labels)
    loss.backward()
    optimizer.step()
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# Evaluate on the held-out nodes.
model.eval()
with torch.no_grad():
    logit = model(node_feat, E, adj)
    pred = logit.argmax(1)
# scikit-learn metrics operate on array-likes; hand them NumPy arrays rather
# than relying on implicit tensor conversion.
y_true = labels[test_mask].numpy()
y_pred = pred[test_mask].numpy()
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print(f'Test Accuracy: {acc:.4f}, F1-Score: {f1:.4f}')


KeyError: 'Payment_Type'