In [1]:
import pandas as pd

# Read the raw SAML-D transaction export; point csv_path at your downloaded copy.
csv_path = "data.csv"
df = pd.read_csv(csv_path)

# Sanity check of the load: first rows, then the column index, one per line.
print(df.head(), df.columns, sep="\n")

       Time        Date  Sender_account  Receiver_account    Amount  \
0  10:35:19  2022-10-07      8724731955        2769355426   1459.15   
1  10:35:20  2022-10-07      1491989064        8401255335   6019.64   
2  10:35:20  2022-10-07       287305149        4404767002  14328.44   
3  10:35:21  2022-10-07      5376652437        9600420220  11895.00   
4  10:35:21  2022-10-07      9614186178        3803336972    115.25   

  Payment_currency Received_currency Sender_bank_location  \
0        UK pounds         UK pounds                   UK   
1        UK pounds            Dirham                   UK   
2        UK pounds         UK pounds                   UK   
3        UK pounds         UK pounds                   UK   
4        UK pounds         UK pounds                   UK   

  Receiver_bank_location  Payment_type  Is_laundering       Laundering_type  
0                     UK  Cash Deposit              0  Normal_Cash_Deposits  
1                    UAE  Cross-border            

In [2]:
# Report the (n_rows, n_columns) tuple of the loaded frame as the cell output.
df.shape

(9504852, 12)

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Model inputs: the numeric amount followed by the categorical transaction
# descriptors. feat_cols fixes the feature order used downstream.
num_cols = ['Amount']
cat_cols = ['Payment_currency', 'Received_currency',
            'Sender_bank_location', 'Receiver_bank_location', 'Payment_type']
feat_cols = num_cols + cat_cols

# Replace every categorical column with integer codes. The fitted encoder for
# each column is kept in `encoders` so the mapping stays available afterwards.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# Standardize the amount (zero mean, unit variance over the whole frame).
# NOTE(review): the scaler is fitted on every row; if a train/test split
# happens later, confirm this is intended.
scaler = StandardScaler()
scaler.fit(df[['Amount']])
df['Amount'] = scaler.transform(df[['Amount']]).ravel()

# Combine the Date and Time text columns into one timestamp, then order the
# rows by sender and, within each sender, chronologically.
timestamp_text = df['Date'] + ' ' + df['Time']
df['DateTime'] = pd.to_datetime(timestamp_text)
df = df.sort_values(by=['Sender_account', 'DateTime'])


In [5]:
# Display the frame after preprocessing: the recorded output shows integer-coded
# categorical columns, a standardized Amount and rows ordered by Sender_account.
# NOTE(review): this cell carries execution count 5 but sits before cell 4 in
# the file; confirm the notebook still reproduces with Restart & Run All.
df

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,DateTime
8572082,09:51:28,2023-07-22,9018,2388293593,-0.212529,10,2,16,3,5,0,Normal_Foward,2023-07-22 09:51:28
3210514,23:28:15,2023-01-24,28511,3072405466,-0.093372,10,10,16,16,6,0,Normal_Plus_Mutual,2023-01-24 23:28:15
4191567,23:31:38,2023-02-24,28511,3072405466,-0.190708,10,10,16,16,6,0,Normal_Plus_Mutual,2023-02-24 23:31:38
5018226,20:51:59,2023-03-24,28511,3072405466,-0.181654,10,10,16,16,6,0,Normal_Plus_Mutual,2023-03-24 20:51:59
5938108,19:38:10,2023-04-24,28511,3072405466,-0.063064,10,10,16,16,6,0,Normal_Plus_Mutual,2023-04-24 19:38:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7483420,18:33:08,2023-06-15,9999986843,2812722257,-0.083091,10,10,16,16,6,0,Normal_Fan_In,2023-06-15 18:33:08
7485577,19:46:17,2023-06-15,9999986843,2812722257,-0.081807,10,10,16,16,0,0,Normal_Fan_In,2023-06-15 19:46:17
7487475,20:48:31,2023-06-15,9999986843,2812722257,-0.081641,10,10,16,16,6,0,Normal_Fan_In,2023-06-15 20:48:31
7488709,21:29:01,2023-06-15,9999986843,2812722257,-0.083146,10,10,16,16,3,0,Normal_Fan_In,2023-06-15 21:29:01


In [4]:
MAX_SEQ_LENGTH = 64


def build_sequence(grp, feature_columns=None, max_seq_length=None):
    """Convert one account's transactions into a fixed-length training sample.

    The sequence is LEFT-padded: when the group has fewer rows than the target
    length, all-zero rows are placed in front of the real transactions. When it
    has more, only the last ``max_seq_length`` rows are kept. The mask follows
    the same layout, so ``mask[i] == 1`` exactly when ``seq[i]`` is a real
    transaction. (Previously the mask put its ones first and zeros last, which
    flagged the leading padding as valid and the trailing real rows as padding.)

    Args:
        grp: DataFrame holding the rows of a single group. Must contain the
            feature columns and an ``Is_laundering`` column. Rows are used in
            the order given; nothing is re-sorted here.
        feature_columns: Columns to extract, in order. Defaults to the
            notebook-level ``feat_cols``.
        max_seq_length: Target sequence length. Defaults to the notebook-level
            ``MAX_SEQ_LENGTH``.

    Returns:
        Tuple ``(seq, mask, label)``:
            seq   -- list of ``max_seq_length`` rows, each a list of feature values.
            mask  -- list of ``max_seq_length`` ints, 0 for padding and 1 for real rows.
            label -- 1 if any row of ``grp`` has ``Is_laundering`` set, else 0.
                     This looks at the whole group, including rows truncated
                     out of ``seq``.

    Raises:
        ValueError: if the resolved sequence length is smaller than 1.
    """
    cols = list(feat_cols if feature_columns is None else feature_columns)
    max_len = MAX_SEQ_LENGTH if max_seq_length is None else max_seq_length
    if max_len < 1:
        # A slice of [-0:] would silently return every row instead of none.
        raise ValueError("max_seq_length must be a positive integer")

    # Keep at most the last `max_len` rows of the group.
    seq = grp[cols].values[-max_len:].tolist()
    n_valid = len(seq)
    n_pad = max_len - n_valid

    # Build independent zero rows; `[[0] * k] * n` would alias one shared list.
    # NOTE(review): 0 is also a legitimate label-encoded category value, so
    # consumers must rely on the mask to tell padding from data.
    pad = [[0] * len(cols) for _ in range(n_pad)]
    seq = pad + seq

    # Mask mirrors the left-padded layout: zeros over padding, ones over data.
    mask = [0] * n_pad + [1] * n_valid

    label = int(grp['Is_laundering'].max())   # 1 if any transaction in the group is laundering
    return seq, mask, label

import numpy as np

# One sample per sending account: build_sequence returns a
# (sequence, mask, label) triple for each group of that account's rows.
grouped = df.groupby('Sender_account')
samples = [build_sequence(account_rows) for _, account_rows in grouped]

# Split the triples into three parallel lists.
all_seqs = [sample[0] for sample in samples]
all_masks = [sample[1] for sample in samples]
all_labels = [sample[2] for sample in samples]

# Stack into arrays for the model.
X = np.array(all_seqs)                    # [n_samples, seq_len, n_features]
M = np.array(all_masks)                   # [n_samples, seq_len]
y = np.asarray(all_labels, dtype=float)   # [n_samples]
