In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv('./data/formatted_transactions.csv')
df.columns = df.columns.str.replace(" ", "_")

# split data into X and y
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [2]:
df.head()

Unnamed: 0,EdgeID,from_id,to_id,Timestamp,Amount_Sent,Sent_Currency,Amount_Received,Received_Currency,Payment_Format,Is_Laundering
0,2,3,3,10,14675.57,0,14675.57,0,0,0
1,17,24,24,10,897.37,0,897.37,0,0,0
2,158,163,163,10,99986.94,0,99986.94,0,0,0
3,218,215,215,10,16.08,0,16.08,0,0,0
4,281,265,265,10,10.3,0,10.3,0,0,0


In [5]:
import pandas as pd
import numpy as np
import itertools

# === Setup ===
# x_df must contain a 'timestamp' column
# y_df must be aligned (same index) and contain one column like 'label'

n_days = (X['Timestamp'].max() - X['Timestamp'].min()) // (24 * 3600) + 1
base_ts = X['Timestamp'].min()

daily_irs = []
weighted_daily_irs = []
daily_inds = []
daily_trans = []

# === Step 1: Get daily stats from x_df and y_df ===
for day in range(n_days):
    l = base_ts + day * 24 * 3600
    r = base_ts + (day + 1) * 24 * 3600
    day_inds = X[(X['Timestamp'] >= l) & (X['Timestamp'] < r)].index
    daily_inds.append(day_inds)
    
    if not day_inds.empty:
        ir = y.loc[day_inds].mean()
        daily_irs.append(ir)
        weighted_daily_irs.append(ir * len(day_inds) / len(X))
        daily_trans.append(len(day_inds))
    else:
        daily_irs.append(0)
        weighted_daily_irs.append(0)
        daily_trans.append(0)

# === Step 2: Find best (i, j) day split based on transaction count ===
split_per = [0.6, 0.2, 0.2]
daily_totals = np.array(daily_trans)
I = list(range(len(daily_totals)))
split_scores = {}

for i, j in itertools.combinations(I, 2):
    if j >= i:
        split_totals = [daily_totals[:i].sum(), daily_totals[i:j].sum(), daily_totals[j:].sum()]
        split_sum = sum(split_totals)
        if split_sum == 0:
            continue
        split_props = [v / split_sum for v in split_totals]
        split_error = [abs(v - t) / t for v, t in zip(split_props, split_per)]
        score = max(split_error)
        split_scores[(i, j)] = score

i, j = min(split_scores, key=split_scores.get)
split = [list(range(i)), list(range(i, j)), list(range(j, n_days))]

print(f"\n→ Day split indices:\nTrain: {split[0][:5]}...\nVal: {split[1][:5]}...\nTest: {split[2][:5]}...")

# === Step 3: Split x_df and y_df ===
split_x = {0: [], 1: [], 2: []}
split_y = {0: [], 1: [], 2: []}

for k in range(3):
    for day in split[k]:
        split_x[k].append(X.loc[daily_inds[day]])
        split_y[k].append(y.loc[daily_inds[day]])

X_train = pd.concat(split_x[0])
X_val   = pd.concat(split_x[1])
X_test  = pd.concat(split_x[2])

y_train = pd.concat(split_y[0])
y_val   = pd.concat(split_y[1])
y_test  = pd.concat(split_y[2])

# === Step 4: Print final stats ===
print(f"\nTrain set: {len(X_train)} samples ({len(X_train)/len(X)*100:.2f}%)")
print(f"Illicit Ratio: {y_train.mean() * 100:.2f}%")

print(f"\nValidation set: {len(X_val)} samples ({len(X_val)/len(X)*100:.2f}%)")
print(f"Illicit Ratio: {y_val.mean() * 100:.2f}%")

print(f"\nTest set: {len(X_test)} samples ({len(X_test)/len(X)*100:.2f}%)")
print(f"Illicit Ratio: {y_test.mean() * 100:.2f}%")



→ Day split indices:
Train: [0, 1, 2, 3, 4]...
Val: [5, 6]...
Test: [7, 8, 9, 10, 11]...

Train set: 2766832 samples (56.19%)
Illicit Ratio: 0.07%

Validation set: 964840 samples (19.59%)
Illicit Ratio: 0.11%

Test set: 1192575 samples (24.22%)
Illicit Ratio: 0.15%


### XGBoost

In [None]:
!pip list | grep xgboost

In [6]:
import numpy as np

xgb_params = {
    "n_estimators": 100,  # num_round
    "max_depth": np.random.randint(1, 16),
    "learning_rate": 10 ** np.random.uniform(-2.5, -1),
    "reg_lambda": 10 ** np.random.uniform(-2, 2),
    "scale_pos_weight": np.random.uniform(1, 10),
    "colsample_bytree": np.random.uniform(0.5, 1.0),
    "subsample": np.random.uniform(0.5, 1.0),
    # "use_label_encoder": False,
    "eval_metric": "logloss",
    "device": "cuda:0",
    "tree_method": "hist"
}

print(xgb_params)

{'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.008949439451514224, 'reg_lambda': 0.17681136818379248, 'scale_pos_weight': 9.129018800251673, 'colsample_bytree': 0.6540503980068026, 'subsample': 0.9210649530595751, 'eval_metric': 'logloss'}


In [19]:
# fit model no training data
xgb_model = XGBClassifier(**params)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

[0]	validation_0-logloss:0.61883
[1]	validation_0-logloss:0.55523
[2]	validation_0-logloss:0.50035
[3]	validation_0-logloss:0.45214
[4]	validation_0-logloss:0.40973
[5]	validation_0-logloss:0.37215
[6]	validation_0-logloss:0.33871
[7]	validation_0-logloss:0.30893
[8]	validation_0-logloss:0.28231
[9]	validation_0-logloss:0.25851
[10]	validation_0-logloss:0.23685
[11]	validation_0-logloss:0.21779
[12]	validation_0-logloss:0.20025
[13]	validation_0-logloss:0.18399
[14]	validation_0-logloss:0.16924
[15]	validation_0-logloss:0.15598
[16]	validation_0-logloss:0.14375
[17]	validation_0-logloss:0.13263
[18]	validation_0-logloss:0.12237
[19]	validation_0-logloss:0.11300
[20]	validation_0-logloss:0.10449
[21]	validation_0-logloss:0.09678
[22]	validation_0-logloss:0.08952
[23]	validation_0-logloss:0.08320
[24]	validation_0-logloss:0.07692
[25]	validation_0-logloss:0.07154
[26]	validation_0-logloss:0.06626
[27]	validation_0-logloss:0.06152
[28]	validation_0-logloss:0.05715
[29]	validation_0-loglos

In [20]:
y_pred = xgb_model.predict(X_test)

In [22]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)*100
print(f1)

10.847814639283834


### LightGBM

In [16]:
!pip install lightgbm --install-option=--gpu


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: --install-option


In [None]:
!pip list | grep lightgbm

In [14]:
import numpy as np

lgbm_params = {
    'n_estimators': 100,
    'num_leaves': np.random.randint(32, 256),
    'learning_rate': 10 ** np.random.uniform(-2.5, -1),  # (0.00316 to 0.1)
    'reg_lambda': 10 ** np.random.uniform(-2, 2),        # lambda_l2: (0.01 to 100)
    'reg_alpha': 10 ** np.random.uniform(0.01, 0.5),     # lambda_l1: (1.02 to ~3.16)
    'scale_pos_weight': np.random.uniform(1, 10),        # class imbalance handling
    'force_col_wise': True,
    'device': 'cuda:0'
}

print(lgbm_params)

{'n_estimators': 100, 'num_leaves': 151, 'learning_rate': 0.005406818158646124, 'reg_lambda': 0.11657290823283885, 'reg_alpha': 1.4133499580844153, 'scale_pos_weight': 9.975800657094542, 'force_col_wise': True, 'device': 'cuda:0'}


In [15]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

lgbm_model = LGBMClassifier(objective='binary', boosting_type='gbdt', random_state=42, **lgbm_params)

lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[
        early_stopping(stopping_rounds=20),
        log_evaluation(1)  # set to 1 for logs every round
    ]
)

[LightGBM] [Fatal] Unknown device type cuda:0


LightGBMError: Unknown device type cuda:0

In [10]:
y_pred = lgbm_model.predict(X_test)

In [11]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)*100
print(f1)

11.382925611582625
