## This code below is only for running on Kaggle

In [1]:
import torch

print(torch.cuda.is_available())

True


In [2]:
!nvidia-smi

Sat Apr 19 16:54:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                      

In [3]:
!apt-get install g++




g++ is already the newest version (4:11.2.0-1ubuntu1).
g++ set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 129 not upgraded.


In [4]:
!pip install cupy-cuda12x



### Use formatted_transctions

In [5]:
import pandas as pd

df = pd.read_csv('/kaggle/input/formatted-transactions/formatted_transactions.csv')

# split data into X and y
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [6]:
for (idx, col) in enumerate(X.columns):
    print(idx, col)

0 EdgeID
1 from_id
2 to_id
3 Timestamp
4 Amount Sent
5 Sent Currency
6 Amount Received
7 Received Currency
8 Payment Format


In [7]:
# split data
import pandas as pd
import numpy as np
import itertools

# === Setup ===
# x_df must contain a 'timestamp' column
# y_df must be aligned (same index) and contain one column like 'label'

n_days = (X['Timestamp'].max() - X['Timestamp'].min()) // (24 * 3600) + 1
base_ts = X['Timestamp'].min()

daily_irs = []
weighted_daily_irs = []
daily_inds = []
daily_trans = []

# === Step 1: Get daily stats from x_df and y_df ===
for day in range(n_days):
    l = base_ts + day * 24 * 3600
    r = base_ts + (day + 1) * 24 * 3600
    day_inds = X[(X['Timestamp'] >= l) & (X['Timestamp'] < r)].index
    daily_inds.append(day_inds)
    
    if not day_inds.empty:
        ir = y.loc[day_inds].mean()
        daily_irs.append(ir)
        weighted_daily_irs.append(ir * len(day_inds) / len(X))
        daily_trans.append(len(day_inds))
    else:
        daily_irs.append(0)
        weighted_daily_irs.append(0)
        daily_trans.append(0)

# === Step 2: Find best (i, j) day split based on transaction count ===
split_per = [0.6, 0.2, 0.2]
daily_totals = np.array(daily_trans)
I = list(range(len(daily_totals)))
split_scores = {}

for i, j in itertools.combinations(I, 2):
    if j >= i:
        split_totals = [daily_totals[:i].sum(), daily_totals[i:j].sum(), daily_totals[j:].sum()]
        split_sum = sum(split_totals)
        if split_sum == 0:
            continue
        split_props = [v / split_sum for v in split_totals]
        split_error = [abs(v - t) / t for v, t in zip(split_props, split_per)]
        score = max(split_error)
        split_scores[(i, j)] = score

i, j = min(split_scores, key=split_scores.get)
split = [list(range(i)), list(range(i, j)), list(range(j, n_days))]

print(f"\n→ Day split indices:\nTrain: {split[0][:5]}...\nVal: {split[1][:5]}...\nTest: {split[2][:5]}...")

# === Step 3: Split x_df and y_df ===
split_x = {0: [], 1: [], 2: []}
split_y = {0: [], 1: [], 2: []}

for k in range(3):
    for day in split[k]:
        split_x[k].append(X.loc[daily_inds[day]])
        split_y[k].append(y.loc[daily_inds[day]])

X_train = pd.concat(split_x[0])
X_val   = pd.concat(split_x[1])
X_test  = pd.concat(split_x[2])

y_train = pd.concat(split_y[0])
y_val   = pd.concat(split_y[1])
y_test  = pd.concat(split_y[2])

# === Step 4: Print final stats ===
print(f"\nTrain set: {len(X_train)} samples ({len(X_train)/len(X)*100:.2f}%)")
print(f"Illicit Ratio: {y_train.mean() * 100:.2f}%")

print(f"\nValidation set: {len(X_val)} samples ({len(X_val)/len(X)*100:.2f}%)")
print(f"Illicit Ratio: {y_val.mean() * 100:.2f}%")

print(f"\nTest set: {len(X_test)} samples ({len(X_test)/len(X)*100:.2f}%)")
print(f"Illicit Ratio: {y_test.mean() * 100:.2f}%")



→ Day split indices:
Train: [0, 1, 2, 3, 4]...
Val: [5, 6]...
Test: [7, 8, 9, 10, 11]...

Train set: 2766832 samples (56.19%)
Illicit Ratio: 0.07%

Validation set: 964840 samples (19.59%)
Illicit Ratio: 0.11%

Test set: 1192575 samples (24.22%)
Illicit Ratio: 0.15%


### Graph Features Processor (GFP)

In [8]:
!pip list | grep snapml

In [9]:
!pip install snapml

Collecting snapml
  Downloading snapml-1.16.3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Downloading snapml-1.16.3-cp310-cp310-manylinux_2_28_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: snapml
Successfully installed snapml-1.16.3


In [10]:
# import pandas as pd
# import numpy as np
# import networkx as nx

# def compute_graph_params(csv_path="/kaggle/input/formatted-transactions/formatted_transactions.csv"):
#     df = pd.read_csv(csv_path)

#     # Giữ nguyên Timestamp dạng giây
#     df["timestamp"] = df["Timestamp"]
#     df["account_out"] = df["from_id"].astype(str)
#     df["account_in"] = df["to_id"].astype(str)
#     df["value"] = pd.to_numeric(df["Amount Sent"], errors="coerce").fillna(0)

#     # --- A. Fan bins ---
#     fan_out = df.groupby("account_out")["account_in"].nunique()
#     fan_in = df.groupby("account_in")["account_out"].nunique()
#     fan_total = fan_out.add(fan_in, fill_value=0)
#     fan_bins = np.quantile(fan_total, [0.33, 0.66]).astype(int).tolist() if len(fan_total) >= 3 else []

#     # --- B. Degree bins ---
#     deg_out = df.groupby("account_out").size()
#     deg_in = df.groupby("account_in").size()
#     deg_total = deg_out.add(deg_in, fill_value=0)
#     degree_bins = np.quantile(deg_total, [0.33, 0.66]).astype(int).tolist() if len(deg_total) >= 3 else []

#     # --- C. Scatter-Gather bins ---
#     sg_scatter = df.groupby("account_out")["account_in"].nunique()
#     sg_gather = df.groupby("account_in")["account_out"].nunique()
#     sg_total = sg_scatter.add(sg_gather, fill_value=0)
#     scatter_gather_bins = np.quantile(sg_total, [0.33, 0.66]).astype(int).tolist() if len(sg_total) >= 3 else []

#     # --- D. Temporal cycle bins ---
#     df["pair"] = df["account_out"] + "->" + df["account_in"]
#     pair_counts = df.groupby("pair")["timestamp"].count()
#     pair_counts_filtered = pair_counts[pair_counts > 1]
#     temp_cycle_bins = (
#         np.quantile(pair_counts_filtered, [0.33, 0.66]).astype(int).tolist()
#         if len(pair_counts_filtered) >= 3 else []
#     )

#     # --- E. Length-constrained cycle bins ---
#     G = nx.from_pandas_edgelist(df, source="account_out", target="account_in", create_using=nx.DiGraph)
#     cycle_count = {node: 0 for node in G.nodes}
#     for c in nx.simple_cycles(G):
#         if len(c) <= 10:
#             for node in c:
#                 cycle_count[node] += 1
#     cycle_series = pd.Series(cycle_count)
#     cycle_series_filtered = cycle_series[cycle_series > 0]
#     lc_cycle_bins = (
#         np.quantile(cycle_series_filtered, [0.33, 0.66]).astype(int).tolist()
#         if len(cycle_series_filtered) >= 3 else []
#     )

#     # --- Final config ---
#     params = {
#         "num_threads": 4,
#         "time_window": 86400,  # 1 day in seconds

#         "vertex_stats": True,
#         "vertex_stats_cols": [df.columns.get_loc("Amount Sent")],
#         "vertex_stats_feats": [0, 1, 3, 4, 8, 9],

#         "fan": True,
#         "fan_tw": 86400,  # 1 day
#         "fan_bins": fan_bins,

#         "degree": True,
#         "degree_tw": 86400,
#         "degree_bins": degree_bins,

#         "scatter-gather": True,
#         "scatter-gather_tw": 21600,  # 6 hours
#         "scatter-gather_bins": scatter_gather_bins,

#         "temp-cycle": True,
#         "temp-cycle_tw": 86400,
#         "temp-cycle_bins": temp_cycle_bins,

#         "lc-cycle": True,
#         "lc-cycle_tw": 86400,
#         "lc-cycle_len": 10,
#         "lc-cycle_bins": lc_cycle_bins,
#     }

#     return params

# params = compute_graph_params()
# print(params)


In [11]:
# sample params config
params = {
    "num_threads": 4,                # Sử dụng 4 luồng để xử lý song song (tùy máy, có thể tăng)
    "time_window": 24,              # Mặc định: 1 ngày nếu không có chỉ định cụ thể

    # Vertex statistics
    "vertex_stats": True,
    "vertex_stats_cols": [4, 3],    # Cột 4 = Amount Paid, cột 3 = Timestamp_float
    "vertex_stats_feats": [0, 1, 2, 3, 4, 8, 9, 10],  # fan, degree, ratio, avg, sum, var, skew, kurtosis

    # Fan-in/out
    "fan": True,
    "fan_tw": 24,                   # 1 ngày
    "fan_bins": [2, 3, 5, 7, 13, 17, 23],             # Placeholder – cần tinh chỉnh sau phân tích dữ liệu

    # In/out degree
    "degree": True,
    "degree_tw": 24,                # 1 ngày
    "degree_bins": [2, 3, 5, 7, 13, 17, 23],          # Placeholder

    # Scatter-Gather pattern
    "scatter-gather": True,
    "scatter-gather_tw": 6,         # 6 giờ như mô tả paper
    "scatter-gather_bins": [2, 3, 5, 7, 13, 17, 23],  # Placeholder

    # Temporal cycle
    "temp-cycle": True,
    "temp-cycle_tw": 24,            # 1 ngày
    "temp-cycle_bins": [2, 3, 5, 7, 13],      # Placeholder

    # Simple cycle (length-constrained)
    "lc-cycle": True,
    "lc-cycle_tw": 24,              # 1 ngày
    "lc-cycle_len": 10,             # Chu trình tối đa độ dài 10
    "lc-cycle_bins": [2, 3, 5, 7, 11],        # Placeholder
}

In [12]:
from snapml import GraphFeaturePreprocessor

gfp = GraphFeaturePreprocessor()

gfp.set_params(params)

In [13]:
print(gfp.get_params())

{'num_threads': 4, 'time_window': 24, 'max_no_edges': -1, 'vertex_stats': True, 'vertex_stats_tw': 1728000, 'vertex_stats_cols': [4, 3], 'vertex_stats_feats': [0, 1, 2, 3, 4, 8, 9, 10], 'fan': True, 'fan_tw': 24, 'fan_bins': [2, 3, 5, 7, 13, 17, 23], 'degree': True, 'degree_tw': 24, 'degree_bins': [2, 3, 5, 7, 13, 17, 23], 'scatter-gather': True, 'scatter-gather_tw': 6, 'scatter-gather_bins': [2, 3, 5, 7, 13, 17, 23], 'temp-cycle': True, 'temp-cycle_tw': 24, 'temp-cycle_bins': [2, 3, 5, 7, 13], 'lc-cycle': True, 'lc-cycle_tw': 24, 'lc-cycle_len': 10, 'lc-cycle_bins': [2, 3, 5, 7, 11]}


In [14]:
# use numpy --> this can cause the problem of not using GPU
X_train = X_train.to_numpy()
X_val = X_val.to_numpy()
X_test = X_test.to_numpy()
# y_train = y_train.to_numpy()

In [15]:
X_train_enriched = gfp.fit_transform(X_train)
X_val_enriched = gfp.fit_transform(X_val)
X_test_enriched = gfp.fit_transform(X_test)

In [16]:
# check length
print(len(X_train_enriched[0]), len(X_val_enriched[0]), len(X_test_enriched[0]))

106 106 106


### Train GBT models

In [17]:
import numpy as np

xgb_params = {
    "n_estimators": 100,  # num_round
    "max_depth": np.random.randint(10, 16),
    "learning_rate": 10 ** np.random.uniform(-2.5, -1),
    "reg_lambda": 10 ** np.random.uniform(-2, 2),
    "scale_pos_weight": np.random.uniform(1, 10),
    "colsample_bytree": np.random.uniform(0.5, 1.0),
    "subsample": np.random.uniform(0.5, 1.0),
    # "use_label_encoder": False,
    "eval_metric": "logloss",
    "device": "cuda:0",
    "tree_method": "hist"
}

print(xgb_params)

{'n_estimators': 100, 'max_depth': 14, 'learning_rate': 0.014214574630449237, 'reg_lambda': 51.51230440733439, 'scale_pos_weight': 8.64664535499335, 'colsample_bytree': 0.8392640136139451, 'subsample': 0.5178863314753541, 'eval_metric': 'logloss', 'device': 'cuda:0', 'tree_method': 'hist'}


In [18]:
# fit model no training data
from xgboost import XGBClassifier

xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_enriched, y_train, eval_set=[(X_val_enriched, y_val)])

[0]	validation_0-logloss:0.13091
[1]	validation_0-logloss:0.12992
[2]	validation_0-logloss:0.13109
[3]	validation_0-logloss:0.13146
[4]	validation_0-logloss:0.13001
[5]	validation_0-logloss:0.12902
[6]	validation_0-logloss:0.12776
[7]	validation_0-logloss:0.12669
[8]	validation_0-logloss:0.12673
[9]	validation_0-logloss:0.12710
[10]	validation_0-logloss:0.12676
[11]	validation_0-logloss:0.12605
[12]	validation_0-logloss:0.12484
[13]	validation_0-logloss:0.12400
[14]	validation_0-logloss:0.12440
[15]	validation_0-logloss:0.12304
[16]	validation_0-logloss:0.12300
[17]	validation_0-logloss:0.12467
[18]	validation_0-logloss:0.12372
[19]	validation_0-logloss:0.12297
[20]	validation_0-logloss:0.12219
[21]	validation_0-logloss:0.12131
[22]	validation_0-logloss:0.12072
[23]	validation_0-logloss:0.12011
[24]	validation_0-logloss:0.11896
[25]	validation_0-logloss:0.11764
[26]	validation_0-logloss:0.11683
[27]	validation_0-logloss:0.11651
[28]	validation_0-logloss:0.11543
[29]	validation_0-loglos

In [19]:
import cupy as cp

X_test_enriched = cp.asarray(X_test_enriched)

y_pred = xgb_model.predict(X_test_enriched)

In [20]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)*100
print(f1)

3.882418191902385
