# Anti Money Laundering Detection

In [2]:
from typing import Callable, Optional
import pandas as pd
from sklearn import preprocessing
import numpy as np
import torch

from torch_geometric.data import (
    Data,
    InMemoryDataset
)

pd.set_option('display.max_columns', None)
path = 'HI-Small_Trans.csv'
df = pd.read_csv(path)

# Data Preprocessing

In [3]:
print(df.head())

          Timestamp  From Bank    Account  To Bank  Account.1  \
0  2022/09/01 00:20         10  8000EBD30       10  8000EBD30   
1  2022/09/01 00:20       3208  8000F4580        1  8000F5340   
2  2022/09/01 00:00       3209  8000F4670     3209  8000F4670   
3  2022/09/01 00:02         12  8000F5030       12  8000F5030   
4  2022/09/01 00:06         10  8000F5200       10  8000F5200   

   Amount Received Receiving Currency  Amount Paid Payment Currency  \
0          3697.34          US Dollar      3697.34        US Dollar   
1             0.01          US Dollar         0.01        US Dollar   
2         14675.57          US Dollar     14675.57        US Dollar   
3          2806.97          US Dollar      2806.97        US Dollar   
4         36682.97          US Dollar     36682.97        US Dollar   

  Payment Format  Is Laundering  
0   Reinvestment              0  
1         Cheque              0  
2   Reinvestment              0  
3   Reinvestment              0  
4   Reinvest

In [4]:
print(df.dtypes)

Timestamp              object
From Bank               int64
Account                object
To Bank                 int64
Account.1              object
Amount Received       float64
Receiving Currency     object
Amount Paid           float64
Payment Currency       object
Payment Format         object
Is Laundering           int64
dtype: object


In [5]:
print(df.isnull().sum())

Timestamp             0
From Bank             0
Account               0
To Bank               0
Account.1             0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0
dtype: int64


In [6]:
def df_label_encoder(df, columns):
        le = preprocessing.LabelEncoder()
        for i in columns:
            df[i] = le.fit_transform(df[i].astype(str))
        return df

def preprocess(df):
        # Label the 'Payment Format', 'Payment Currency', 'Receiving Currency' by classes with sklearn LabelEncoder
        df = df_label_encoder(df,['Payment Format', 'Payment Currency', 'Receiving Currency'])
        # Transform the Timestamp with min max normalization.  
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df['Timestamp'] = df['Timestamp'].apply(lambda x: x.value)
        df['Timestamp'] = (df['Timestamp']-df['Timestamp'].min())/(df['Timestamp'].max()-df['Timestamp'].min())
        # Create unique ID for each account by adding bank code with account number.  
        df['Account'] = df['From Bank'].astype(str) + '_' + df['Account']
        df['Account.1'] = df['To Bank'].astype(str) + '_' + df['Account.1']
        df = df.sort_values(by=['Account'])
        # Create receiving_df with the information of receiving accounts, received amount and currency
        receiving_df = df[['Account.1', 'Amount Received', 'Receiving Currency']]
        # Create paying_df with the information of payer accounts, paid amount and currency
        paying_df = df[['Account', 'Amount Paid', 'Payment Currency']]
        receiving_df = receiving_df.rename({'Account.1': 'Account'}, axis=1)
        # Create a list of currency used among all transactions
        currency_ls = sorted(df['Receiving Currency'].unique())

        return df, receiving_df, paying_df, currency_ls

In [7]:
df, receiving_df, paying_df, currency_ls = preprocess(df = df)
print(df.head())

         Timestamp  From Bank          Account  To Bank        Account.1  \
4278714   0.456320      10057  10057_803A115E0    29467  29467_803E020C0   
2798190   0.285018      10057  10057_803A115E0    29467  29467_803E020C0   
2798191   0.284233      10057  10057_803A115E0    29467  29467_803E020C0   
3918769   0.417079      10057  10057_803A115E0    29467  29467_803E020C0   
213094    0.000746      10057  10057_803A115E0    10057  10057_803A115E0   

         Amount Received  Receiving Currency  Amount Paid  Payment Currency  \
4278714        787197.11                  13    787197.11                13   
2798190        787197.11                  13    787197.11                13   
2798191        681262.19                  13    681262.19                13   
3918769        681262.19                  13    681262.19                13   
213094         146954.27                  13    146954.27                13   

         Payment Format  Is Laundering  
4278714               3    

In [8]:
print(receiving_df.head())
print(paying_df.head())

                 Account  Amount Received  Receiving Currency
4278714  29467_803E020C0        787197.11                  13
2798190  29467_803E020C0        787197.11                  13
2798191  29467_803E020C0        681262.19                  13
3918769  29467_803E020C0        681262.19                  13
213094   10057_803A115E0        146954.27                  13
                 Account  Amount Paid  Payment Currency
4278714  10057_803A115E0    787197.11                13
2798190  10057_803A115E0    787197.11                13
2798191  10057_803A115E0    681262.19                13
3918769  10057_803A115E0    681262.19                13
213094   10057_803A115E0    146954.27                13


In [9]:
print(currency_ls)

[np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14)]


In [10]:
# We would like to extract all unique accounts from payer and receiver as node of our graph. It includes the unique account ID, Bank code and the label of 'Is Laundering'.  
# In this section, we consider both payer and receiver involved in a illicit transaction as suspicious accounts, we will label both accounts with 'Is Laundering' == 1.
def get_all_account(df):
        ldf = df[['Account', 'From Bank']]
        rdf = df[['Account.1', 'To Bank']]
        suspicious = df[df['Is Laundering']==1]
        s1 = suspicious[['Account', 'Is Laundering']]
        s2 = suspicious[['Account.1', 'Is Laundering']]
        s2 = s2.rename({'Account.1': 'Account'}, axis=1)
        suspicious = pd.concat([s1, s2], join='outer')
        suspicious = suspicious.drop_duplicates()

        ldf = ldf.rename({'From Bank': 'Bank'}, axis=1)
        rdf = rdf.rename({'Account.1': 'Account', 'To Bank': 'Bank'}, axis=1)
        df = pd.concat([ldf, rdf], join='outer')
        df = df.drop_duplicates()

        df['Is Laundering'] = 0
        df.set_index('Account', inplace=True)
        df.update(suspicious.set_index('Account'))
        df = df.reset_index()
        return df

In [11]:
accounts = get_all_account(df)
print(accounts.head())

           Account   Bank  Is Laundering
0  10057_803A115E0  10057              0
1  10057_803AA8E90  10057              0
2  10057_803AAB430  10057              0
3  10057_803AACE20  10057              0
4  10057_803AB4F70  10057              0


# Create Node

In [12]:
# For node features, we would like to aggregate the mean of paid and received amount with different types of currency as the new features of each node. 
def paid_currency_aggregate(currency_ls, paying_df, accounts):
        for i in currency_ls:
            temp = paying_df[paying_df['Payment Currency'] == i]
            accounts['avg paid '+str(i)] = temp['Amount Paid'].groupby(temp['Account']).transform('mean')
        return accounts

def received_currency_aggregate(currency_ls, receiving_df, accounts):
    for i in currency_ls:
        temp = receiving_df[receiving_df['Receiving Currency'] == i]
        accounts['avg received '+str(i)] = temp['Amount Received'].groupby(temp['Account']).transform('mean')
    accounts = accounts.fillna(0)
    return accounts
# Now we can define the node attributes by the bank code and the mean of paid and received amount with different types of currency.
def get_node_attr(currency_ls, paying_df,receiving_df, accounts):
        node_df = paid_currency_aggregate(currency_ls, paying_df, accounts)
        node_df = received_currency_aggregate(currency_ls, receiving_df, node_df)
        node_label = torch.from_numpy(node_df['Is Laundering'].values).to(torch.float)
        node_df = node_df.drop(['Account', 'Is Laundering'], axis=1)
        node_df = df_label_encoder(node_df,['Bank'])
        return node_df, node_label

In [13]:
node_df, node_label = get_node_attr(currency_ls, paying_df,receiving_df, accounts)
print(node_df.head())

   Bank  avg paid 0  avg paid 1  avg paid 2  avg paid 3  avg paid 4  \
0     2         0.0         0.0         0.0         0.0         0.0   
1     2         0.0         0.0         0.0         0.0         0.0   
2     2         0.0         0.0         0.0         0.0         0.0   
3     2         0.0         0.0         0.0         0.0         0.0   
4     2         0.0         0.0         0.0         0.0         0.0   

   avg paid 5  avg paid 6  avg paid 7  avg paid 8  avg paid 9  avg paid 10  \
0         0.0         0.0         0.0         0.0         0.0          0.0   
1         0.0         0.0         0.0         0.0         0.0          0.0   
2         0.0         0.0         0.0         0.0         0.0          0.0   
3         0.0         0.0         0.0         0.0         0.0          0.0   
4         0.0         0.0         0.0         0.0         0.0          0.0   

   avg paid 11   avg paid 12  avg paid 13  avg paid 14  avg received 0  \
0          0.0   1922.000000  

# Create Edge



In [14]:
# In terms of edge features, we would like to conside each transcation as edges.  
# For edge index, we replace all account with index and stack into a list with size of [2, num of transcation]  
# For edge attributes, we used 'Timestamp', 'Amount Received', 'Receiving Currency', 'Amount Paid', 'Payment Currency' and 'Payment Format'

def get_edge_df(accounts, df):
        accounts = accounts.reset_index(drop=True)
        accounts['ID'] = accounts.index
        mapping_dict = dict(zip(accounts['Account'], accounts['ID']))
        df['From'] = df['Account'].map(mapping_dict)
        df['To'] = df['Account.1'].map(mapping_dict)
        df = df.drop(['Account', 'Account.1', 'From Bank', 'To Bank'], axis=1)

        edge_index = torch.stack([torch.from_numpy(df['From'].values), torch.from_numpy(df['To'].values)], dim=0)

        df = df.drop(['Is Laundering', 'From', 'To'], axis=1)
        edge_attr = df  # for visualization
        return edge_attr, edge_index

In [15]:
edge_attr, edge_index = get_edge_df(accounts, df)
print(edge_attr.head())

         Timestamp  Amount Received  Receiving Currency  Amount Paid  \
4278714   0.456320        787197.11                  13    787197.11   
2798190   0.285018        787197.11                  13    787197.11   
2798191   0.284233        681262.19                  13    681262.19   
3918769   0.417079        681262.19                  13    681262.19   
213094    0.000746        146954.27                  13    146954.27   

         Payment Currency  Payment Format  
4278714                13               3  
2798190                13               3  
2798191                13               4  
3918769                13               4  
213094                 13               5  


In [16]:
print(edge_index)

tensor([[     0,      0,      0,  ..., 496997, 496997, 496998],
        [299458, 299458, 299458,  ..., 496997, 496997, 496998]])


# Model Architecture

# EvolveGCN

In [17]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv

class EvolveGCNH(nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels):
        super(EvolveGCNH, self).__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.rnn = nn.GRUCell(in_channels, hidden_channels)
        self.gcn = GCNConv(hidden_channels, out_channels)
        self.hidden_state = None

    def forward(self, x, edge_index, edge_attr=None):
        if self.hidden_state is None or self.hidden_state.size(0) != x.size(0):
            self.hidden_state = x.new_zeros(x.size(0), self.hidden_channels)
        self.hidden_state = self.hidden_state.detach()
        self.hidden_state = self.rnn(x, self.hidden_state)
        x = self.gcn(self.hidden_state, edge_index)
        return x



# PyG InMemoryDataset

In [18]:
class AMLtoGraph(InMemoryDataset):

    def __init__(self, root: str, edge_window_size: int = 10,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        self.edge_window_size = edge_window_size
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> str:
        return 'HI-Small_Trans.csv'

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    @property
    def num_nodes(self) -> int:
        return self._data.edge_index.max().item() + 1

    def df_label_encoder(self, df, columns):
        le = preprocessing.LabelEncoder()
        for i in columns:
            df[i] = le.fit_transform(df[i].astype(str))
        return df


    def preprocess(self, df):
        df = self.df_label_encoder(df,['Payment Format', 'Payment Currency', 'Receiving Currency'])
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df['Timestamp'] = df['Timestamp'].apply(lambda x: x.value)
        df['Timestamp'] = (df['Timestamp']-df['Timestamp'].min())/(df['Timestamp'].max()-df['Timestamp'].min())

        df['Account'] = df['From Bank'].astype(str) + '_' + df['Account']
        df['Account.1'] = df['To Bank'].astype(str) + '_' + df['Account.1']
        df = df.sort_values(by=['Account'])
        receiving_df = df[['Account.1', 'Amount Received', 'Receiving Currency']]
        paying_df = df[['Account', 'Amount Paid', 'Payment Currency']]
        receiving_df = receiving_df.rename({'Account.1': 'Account'}, axis=1)
        currency_ls = sorted(df['Receiving Currency'].unique())

        return df, receiving_df, paying_df, currency_ls

    def get_all_account(self, df):
        ldf = df[['Account', 'From Bank']]
        rdf = df[['Account.1', 'To Bank']]
        suspicious = df[df['Is Laundering']==1]
        s1 = suspicious[['Account', 'Is Laundering']]
        s2 = suspicious[['Account.1', 'Is Laundering']]
        s2 = s2.rename({'Account.1': 'Account'}, axis=1)
        suspicious = pd.concat([s1, s2], join='outer')
        suspicious = suspicious.drop_duplicates()

        ldf = ldf.rename({'From Bank': 'Bank'}, axis=1)
        rdf = rdf.rename({'Account.1': 'Account', 'To Bank': 'Bank'}, axis=1)
        df = pd.concat([ldf, rdf], join='outer')
        df = df.drop_duplicates()

        df['Is Laundering'] = 0
        df.set_index('Account', inplace=True)
        df.update(suspicious.set_index('Account'))
        df = df.reset_index()
        return df
    
    def paid_currency_aggregate(self, currency_ls, paying_df, accounts):
        for i in currency_ls:
            temp = paying_df[paying_df['Payment Currency'] == i]
            accounts['avg paid '+str(i)] = temp['Amount Paid'].groupby(temp['Account']).transform('mean')
        return accounts

    def received_currency_aggregate(self, currency_ls, receiving_df, accounts):
        for i in currency_ls:
            temp = receiving_df[receiving_df['Receiving Currency'] == i]
            accounts['avg received '+str(i)] = temp['Amount Received'].groupby(temp['Account']).transform('mean')
        accounts = accounts.fillna(0)
        return accounts

    def get_edge_df(self, accounts, df):
        accounts = accounts.reset_index(drop=True)
        accounts['ID'] = accounts.index
        mapping_dict = dict(zip(accounts['Account'], accounts['ID']))
        
        df['From'] = df['Account'].map(mapping_dict)
        df['To'] = df['Account.1'].map(mapping_dict)

        edge_index = torch.tensor(
            [df['From'].values, df['To'].values],
            dtype=torch.long
        )
        
        df = df.drop(['From', 'To', 'Account', 'Account.1'], axis=1)
        edge_attr = torch.tensor(df.values, dtype=torch.float)

        return edge_attr, edge_index


    def get_node_attr(self, currency_ls, paying_df,receiving_df, accounts):
        node_df = self.paid_currency_aggregate(currency_ls, paying_df, accounts)
        node_df = self.received_currency_aggregate(currency_ls, receiving_df, node_df)
        node_label = torch.from_numpy(node_df['Is Laundering'].values).to(torch.float)
        node_df = node_df.drop(['Account', 'Is Laundering'], axis=1)
        node_df = self.df_label_encoder(node_df,['Bank'])
        node_df = torch.from_numpy(node_df.values).to(torch.float)
        return node_df, node_label

    def process(self):
        df = pd.read_csv(self.raw_paths[0])
        df, receiving_df, paying_df, currency_ls = self.preprocess(df)
        accounts = self.get_all_account(df)
        node_attr, node_label = self.get_node_attr(currency_ls, paying_df, receiving_df, accounts)
        edge_attr, edge_index = self.get_edge_df(accounts, df)

        data = Data(
            x=node_attr,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=node_label
        )

        # Add train/val/test masks
        num_nodes = data.x.size(0)
        data.train_mask = torch.rand(num_nodes) < 0.8
        data.val_mask = ~data.train_mask

        # Save processed data
        data_list = [data]
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])



# Model Training 

In [19]:
import torch
import torch.nn.functional as F
from torch_geometric.loader import NeighborLoader
from torch_geometric.transforms import RandomNodeSplit

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load dataset
dataset = AMLtoGraph('')
data = dataset[0]
data = RandomNodeSplit(split='train_rest', num_val=0.1)(data)

# Data loaders
train_loader = NeighborLoader(
    data,
    num_neighbors=[30] * 2,
    batch_size=256,
    input_nodes=data.train_mask,
)

val_loader = NeighborLoader(
    data,
    num_neighbors=[30] * 2,
    batch_size=256,
    input_nodes=data.val_mask,
)

model = EvolveGCNH(
    in_channels=data.num_features,  # Number of input features per node
    out_channels=1,                 # Binary classification (1 output feature)
    hidden_channels=16              # Number of hidden units
).to(device)


criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
epochs = 20

for epoch in range(20):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch.to(device)
        pred = model(batch.x, batch.edge_index)
        pred = torch.sigmoid(pred)
        loss = criterion(pred, batch.y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


    if (epoch + 1) % 5 == 0 or epoch == epochs - 1:
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                batch.to(device)
                pred = model(batch.x, batch.edge_index, batch.edge_attr)
                pred = torch.sigmoid(pred).squeeze()
                pred_labels = (pred > 0.5).float()
                ground_truth = batch.y

                correct += (pred_labels == ground_truth).sum().item()
                total += len(ground_truth)

        accuracy = correct / total
        print(f"Epoch {epoch + 1:02d}/{epochs}, Loss: {total_loss:.4f}, Val Accuracy: {accuracy:.4f}")

  self.data, self.slices = torch.load(self.processed_paths[0])


Epoch 1, Loss: 723.1804
Epoch 2, Loss: 434.7213
Epoch 3, Loss: 313.7039
Epoch 4, Loss: 247.0510
Epoch 5, Loss: 203.8094
Epoch 05/20, Loss: 203.8094, Val Accuracy: 0.9709
Epoch 6, Loss: 172.6424
Epoch 7, Loss: 153.6973
Epoch 8, Loss: 139.7013
Epoch 9, Loss: 129.3076
Epoch 10, Loss: 117.6553
Epoch 10/20, Loss: 117.6553, Val Accuracy: 0.9927
Epoch 11, Loss: 110.6357
Epoch 12, Loss: 105.5695
Epoch 13, Loss: 101.1098
Epoch 14, Loss: 97.1739
Epoch 15, Loss: 93.3803
Epoch 15/20, Loss: 93.3803, Val Accuracy: 0.9928
Epoch 16, Loss: 90.2055
Epoch 17, Loss: 87.2068
Epoch 18, Loss: 84.3601
Epoch 19, Loss: 81.7867
Epoch 20, Loss: 79.4491
Epoch 20/20, Loss: 79.4491, Val Accuracy: 0.9928
