```
Copyright 2022 IBM Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

# Graph Features Extraction for Anti-Money Laudering

The Snap ML GraphFeaturePreprocessor is a scikit-learn compatible preprocessor that enables scalable and real-time feature extraction from graph-structured data. It provides utilities for creating and updating in-memory graphs as well as extracting new features from these graphs. The goal of this example is to show how to use the API of this preprocessor. As input, we will use a synthethic dataset in tabular format where each row represents a financial transaction. For each transaction 4 features are available: transaction ID, source account ID, target accound ID and transaction timestamp. 

In [1]:
# Import the Graph Feature Preprocessor from Snap ML
from snapml import GraphFeaturePreprocessor

# Import other libraries
import numpy as np
import time
import json
import pandas as pd
from IPython.display import display
import datetime

pd.options.display.max_columns = None

In [2]:
mon_laun_path = "../datasets/mon_laun_dataset/HI-Small_Trans.csv"
"""
0+10
01+0
"""
mon_laun_df = pd.read_csv(mon_laun_path)

mon_laun_df["transactionID"] = mon_laun_df.index.astype(float)

mon_laun_df["sourceAccount"] = (mon_laun_df["From Bank"].astype(str) + mon_laun_df["Account"])
mon_laun_df["targetAccount"] = mon_laun_df["To Bank"].astype(str) + mon_laun_df["Account.1"] 

unique_id_df = mon_laun_df[["sourceAccount","targetAccount"]]

x = unique_id_df.stack()
x[:] = x.factorize()[0]
mon_laun_df = mon_laun_df.join(x.unstack().add_suffix('ID'))

display(mon_laun_df)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,transactionID,sourceAccount,targetAccount,sourceAccountID,targetAccountID
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.340000,US Dollar,3697.340000,US Dollar,Reinvestment,0,0.0,108000EBD30,108000EBD30,0,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.010000,US Dollar,0.010000,US Dollar,Cheque,0,1.0,32088000F4580,18000F5340,1,2
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.570000,US Dollar,14675.570000,US Dollar,Reinvestment,0,2.0,32098000F4670,32098000F4670,3,3
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.970000,US Dollar,2806.970000,US Dollar,Reinvestment,0,3.0,128000F5030,128000F5030,4,4
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.970000,US Dollar,36682.970000,US Dollar,Reinvestment,0,4.0,108000F5200,108000F5200,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5078340,2022/09/10 23:57,54219,8148A6631,256398,8148A8711,0.154978,Bitcoin,0.154978,Bitcoin,Bitcoin,0,5078340.0,542198148A6631,2563988148A8711,501188,264859
5078341,2022/09/10 23:35,15,8148A8671,256398,8148A8711,0.108128,Bitcoin,0.108128,Bitcoin,Bitcoin,0,5078341.0,158148A8671,2563988148A8711,368073,264859
5078342,2022/09/10 23:52,154365,8148A6771,256398,8148A8711,0.004988,Bitcoin,0.004988,Bitcoin,Bitcoin,0,5078342.0,1543658148A6771,2563988148A8711,264858,264859
5078343,2022/09/10 23:46,256398,8148A6311,256398,8148A8711,0.038417,Bitcoin,0.038417,Bitcoin,Bitcoin,0,5078343.0,2563988148A6311,2563988148A8711,298200,264859


In [3]:



"""mon_laun_df = mon_laun_df[mon_laun_df["sourceAccountID"] != mon_laun_df["targetAccountID"]]"""



mon_laun_df["Timestamp"] = pd.to_datetime(mon_laun_df["Timestamp"], format='%Y/%m/%d %H:%M')
mon_laun_df["timestamp"] = (mon_laun_df["Timestamp"].dt.hour*60+mon_laun_df["Timestamp"].dt.minute).astype(float)
mon_laun_df = mon_laun_df[mon_laun_df["transactionID"] < 5000.0]


display(mon_laun_df)

mon_laun_temp = mon_laun_df
mon_laun_df = mon_laun_df.drop(columns = ['Timestamp', "From Bank", "Account", "To Bank", "sourceAccount", "targetAccount", \
                            "Account.1","Amount Received", "Receiving Currency", "Amount Paid","Payment Currency", "Payment Format", "Is Laundering"])



mon_laun_arr = mon_laun_df.to_numpy().astype(float)
display(mon_laun_arr)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,transactionID,sourceAccount,targetAccount,sourceAccountID,targetAccountID,timestamp
0,2022-09-01 00:20:00,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0,0.0,108000EBD30,108000EBD30,0,0,20.0
1,2022-09-01 00:20:00,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0,1.0,32088000F4580,18000F5340,1,2,20.0
2,2022-09-01 00:00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0,2.0,32098000F4670,32098000F4670,3,3,0.0
3,2022-09-01 00:02:00,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0,3.0,128000F5030,128000F5030,4,4,2.0
4,2022-09-01 00:06:00,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0,4.0,108000F5200,108000F5200,5,5,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2022-09-01 00:02:00,1674,80087D1B0,1674,80087D1B0,12.19,US Dollar,12.19,US Dollar,Reinvestment,0,4995.0,167480087D1B0,167480087D1B0,4184,4184,2.0
4996,2022-09-01 00:19:00,220,80087EA60,220,80087EA60,2075.73,US Dollar,2075.73,US Dollar,Reinvestment,0,4996.0,22080087EA60,22080087EA60,4185,4185,19.0
4997,2022-09-01 00:19:00,2843,8008789D0,2843,8008789D0,604.51,US Dollar,604.51,US Dollar,Reinvestment,0,4997.0,28438008789D0,28438008789D0,4186,4186,19.0
4998,2022-09-01 00:29:00,1674,80087AC10,1674,80087AC10,13.93,US Dollar,13.93,US Dollar,Reinvestment,0,4998.0,167480087AC10,167480087AC10,4187,4187,29.0


array([[0.000e+00, 0.000e+00, 0.000e+00, 2.000e+01],
       [1.000e+00, 1.000e+00, 2.000e+00, 2.000e+01],
       [2.000e+00, 3.000e+00, 3.000e+00, 0.000e+00],
       ...,
       [4.997e+03, 4.186e+03, 4.186e+03, 1.900e+01],
       [4.998e+03, 4.187e+03, 4.187e+03, 2.900e+01],
       [4.999e+03, 4.188e+03, 4.188e+03, 7.000e+00]])

Here we assume that the user has access to a set of (labeled) transactions with raw features which could be used to train a machine learning (ML) model, e.g., for fraud detection. The user will extract graph features using the Graph Features Preprocessor which will be added to the initial raw features present in the transactions. The enriched set of features will be used to train an ML model. The main steps associated with this use case are shown below:

<div> <img src="img/gfp-use-case1.png" width="1000"> </div>


In [4]:
# Path to the file that contains financial transactions, e.g., used for training ML models
train_graph_path = "../datasets/graph_feature_preprocessor/aml_custom_train.txt"

print("Loading the transactions ")
"""X_train = np.loadtxt(train_graph_path, dtype=np.float64, delimiter=" ", comments="#", usecols=range(4))"""
X_train = mon_laun_arr
print("Input dataset shape: ", X_train.shape)

df = pd.DataFrame(X_train, columns=['transactionID', 'sourceAccountID', 'targetAccountID', 'timestamp'])
display(df)
df.dtypes

Loading the transactions 
Input dataset shape:  (5000, 4)


Unnamed: 0,transactionID,sourceAccountID,targetAccountID,timestamp
0,0.0,0.0,0.0,20.0
1,1.0,1.0,2.0,20.0
2,2.0,3.0,3.0,0.0
3,3.0,4.0,4.0,2.0
4,4.0,5.0,5.0,6.0
...,...,...,...,...
4995,4995.0,4184.0,4184.0,2.0
4996,4996.0,4185.0,4185.0,19.0
4997,4997.0,4186.0,4186.0,19.0
4998,4998.0,4187.0,4187.0,29.0


transactionID      float64
sourceAccountID    float64
targetAccountID    float64
timestamp          float64
dtype: object

In [5]:
# The following dictionary defines the configuration parameters of the Graph Feature Preprocessor

params = {
    "num_threads": 4,             # number of software threads to be used (important for performance)
    "time_window": 16,            # time window used if no pattern was specified
    
    "vertex_stats": True,         # produce vertex statistics
    "vertex_stats_cols": [3],     # produce vertex statistics using the selected input columns
    
    # features: 0:fan,1:deg,2:ratio,3:avg,4:sum,5:min,6:max,7:median,8:var,9:skew,10:kurtosis
    "vertex_stats_feats": [0, 1, 2, 3, 4, 8, 9, 10],  # fan,deg,ratio,avg,sum,var,skew,kurtosis
    
    # fan in/out parameters
    "fan": True,
    "fan_tw": 16,
    "fan_bins": [y+2 for y in range(2)],
    
    # in/out degree parameters
    "degree": True,
    "degree_tw": 16,
    "degree_bins": [y+2 for y in range(2)],
    
    # scatter gather parameters
    "scatter-gather": True,
    "scatter-gather_tw": 16,
    "scatter-gather_bins": [y+2 for y in range(2)],
    
    # temporal cycle parameters
    "temp-cycle": True,
    "temp-cycle_tw": 16,
    "temp-cycle_bins": [y+2 for y in range(2)],
    
    # length-constrained simple cycle parameters
    "lc-cycle": False,
    "lc-cycle_tw": 16,
    "lc-cycle_len": 8,
    "lc-cycle_bins": [y+2 for y in range(2)],
}

In [6]:
# Create a Graph Feature Preprocessor, set its configuration using the above dictionary and verify it

print("Creating a graph feature preprocessor ")
gp = GraphFeaturePreprocessor()

print("Setting the parameters of the graph feature preprocessor ")
gp.set_params(params)

print("Graph feature preprocessor parameters: ", json.dumps(gp.get_params(), indent=4))

Creating a graph feature preprocessor 
Setting the parameters of the graph feature preprocessor 
Graph feature preprocessor parameters:  {
    "num_threads": 4,
    "time_window": 16,
    "max_no_edges": -1,
    "vertex_stats": true,
    "vertex_stats_tw": 1728000,
    "vertex_stats_cols": [
        3
    ],
    "vertex_stats_feats": [
        0,
        1,
        2,
        3,
        4,
        8,
        9,
        10
    ],
    "fan": true,
    "fan_tw": 16,
    "fan_bins": [
        2,
        3
    ],
    "degree": true,
    "degree_tw": 16,
    "degree_bins": [
        2,
        3
    ],
    "scatter-gather": true,
    "scatter-gather_tw": 16,
    "scatter-gather_bins": [
        2,
        3
    ],
    "temp-cycle": true,
    "temp-cycle_tw": 16,
    "temp-cycle_bins": [
        2,
        3
    ],
    "lc-cycle": false,
    "lc-cycle_tw": 16,
    "lc-cycle_len": 8,
    "lc-cycle_bins": [
        2,
        3
    ]
}


In [7]:
print(X_train)
display(X_train.shape)
X_train.dtype

[[0.000e+00 0.000e+00 0.000e+00 2.000e+01]
 [1.000e+00 1.000e+00 2.000e+00 2.000e+01]
 [2.000e+00 3.000e+00 3.000e+00 0.000e+00]
 ...
 [4.997e+03 4.186e+03 4.186e+03 1.900e+01]
 [4.998e+03 4.187e+03 4.187e+03 2.900e+01]
 [4.999e+03 4.188e+03 4.188e+03 7.000e+00]]


(5000, 4)

dtype('float64')

In [8]:
print("Enriching the transactions with new graph features ")
print("Raw dataset shape: ", X_train.shape)

# the fit_transform and transform functions are equivalent
# these functions can run on single transactions or on batches of transactions
X_train = np.ascontiguousarray(X_train)
X_train_enriched = gp.fit_transform(X_train.astype("float64")) 

print("Enriched dataset shape: ", X_train_enriched.shape)

Enriching the transactions with new graph features 
Raw dataset shape:  (5000, 4)
Enriched dataset shape:  (5000, 48)


We define a helper function to inspect the newly generated graph-based features for a given transaction:

In [9]:
print(X_train_enriched)

[[0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [1.000e+00 1.000e+00 2.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.000e+00 3.000e+00 3.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [4.997e+03 4.186e+03 4.186e+03 ... 0.000e+00 0.000e+00 0.000e+00]
 [4.998e+03 4.187e+03 4.187e+03 ... 0.000e+00 0.000e+00 0.000e+00]
 [4.999e+03 4.188e+03 4.188e+03 ... 0.000e+00 0.000e+00 0.000e+00]]


In [10]:
def print_enriched_transaction(transaction, params):
    colnames = []
    
    # add raw features names
    colnames.append("transactionID")
    colnames.append("sourceAccountID")
    colnames.append("targetAccountID")
    colnames.append("timestamp")
    
    # add features names for the graph patterns
    for pattern in ['fan', 'degree', 'scatter-gather', 'temp-cycle', 'lc-cycle']:
        if pattern in params:
            if params[pattern]:
                bins = len(params[pattern +'_bins'])
                if pattern in ['fan', 'degree']:
                    for i in range(bins-1):
                        colnames.append(pattern+"_in_bins_"+str(params[pattern +'_bins'][i])+"-"+str(params[pattern +'_bins'][i+1]))
                    colnames.append(pattern+"_in_bins_"+str(params[pattern +'_bins'][i+1])+"-inf")
                    for i in range(bins-1):
                        colnames.append(pattern+"_out_bins_"+str(params[pattern +'_bins'][i])+"-"+str(params[pattern +'_bins'][i+1]))
                    colnames.append(pattern+"_out_bins_"+str(params[pattern +'_bins'][i+1])+"-inf")
                else:
                    for i in range(bins-1):
                        colnames.append(pattern+"_bins_"+str(params[pattern +'_bins'][i])+"-"+str(params[pattern +'_bins'][i+1]))
                    colnames.append(pattern+"_bins_"+str(params[pattern +'_bins'][i+1])+"-inf")

    vert_feat_names = ["fan","deg","ratio","avg","sum","min","max","median","var","skew","kurtosis"]

    # add features names for the vertex statistics
    for orig in ['source', 'dest']:
        for direction in ['out', 'in']:
            # add fan, deg, and ratio features
            for k in [0, 1, 2]:
                if k in params["vertex_stats_feats"]:
                    feat_name = orig + "_" + vert_feat_names[k] + "_" + direction
                    colnames.append(feat_name)
            for col in params["vertex_stats_cols"]:
                # add avg, sum, min, max, median, var, skew, and kurtosis features
                for k in [3, 4, 5, 6, 7, 8, 9, 10]:
                    if k in params["vertex_stats_feats"]:
                        feat_name = orig + "_" + vert_feat_names[k] + "_col" + str(col) + "_" + direction
                        colnames.append(feat_name)

    df = pd.DataFrame(transaction, columns=colnames)
    return df

In [11]:
print("Enriched transactions: ")
df_enriched = print_enriched_transaction(X_train_enriched, gp.get_params())
mon_laun_temp = mon_laun_temp.drop(columns = ['sourceAccountID', 'targetAccountID', 'timestamp'])


display(df_enriched)


Enriched transactions: 


Unnamed: 0,transactionID,sourceAccountID,targetAccountID,timestamp,fan_in_bins_2-3,fan_in_bins_3-inf,fan_out_bins_2-3,fan_out_bins_3-inf,degree_in_bins_2-3,degree_in_bins_3-inf,degree_out_bins_2-3,degree_out_bins_3-inf,scatter-gather_bins_2-3,scatter-gather_bins_3-inf,temp-cycle_bins_2-3,temp-cycle_bins_3-inf,source_fan_out,source_deg_out,source_ratio_out,source_avg_col3_out,source_sum_col3_out,source_var_col3_out,source_skew_col3_out,source_kurtosis_col3_out,source_fan_in,source_deg_in,source_ratio_in,source_avg_col3_in,source_sum_col3_in,source_var_col3_in,source_skew_col3_in,source_kurtosis_col3_in,dest_fan_out,dest_deg_out,dest_ratio_out,dest_avg_col3_out,dest_sum_col3_out,dest_var_col3_out,dest_skew_col3_out,dest_kurtosis_col3_out,dest_fan_in,dest_deg_in,dest_ratio_in,dest_avg_col3_in,dest_sum_col3_in,dest_var_col3_in,dest_skew_col3_in,dest_kurtosis_col3_in
0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0
1,1.0,1.0,2.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0
2,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
3,3.0,4.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0
4,4.0,5.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995.0,4184.0,4184.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0
4996,4996.0,4185.0,4185.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0
4997,4997.0,4186.0,4186.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0
4998,4998.0,4187.0,4187.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0


In [12]:
display(mon_laun_temp)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,transactionID,sourceAccount,targetAccount
0,2022-09-01 00:20:00,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0,0.0,108000EBD30,108000EBD30
1,2022-09-01 00:20:00,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0,1.0,32088000F4580,18000F5340
2,2022-09-01 00:00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0,2.0,32098000F4670,32098000F4670
3,2022-09-01 00:02:00,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0,3.0,128000F5030,128000F5030
4,2022-09-01 00:06:00,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0,4.0,108000F5200,108000F5200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2022-09-01 00:02:00,1674,80087D1B0,1674,80087D1B0,12.19,US Dollar,12.19,US Dollar,Reinvestment,0,4995.0,167480087D1B0,167480087D1B0
4996,2022-09-01 00:19:00,220,80087EA60,220,80087EA60,2075.73,US Dollar,2075.73,US Dollar,Reinvestment,0,4996.0,22080087EA60,22080087EA60
4997,2022-09-01 00:19:00,2843,8008789D0,2843,8008789D0,604.51,US Dollar,604.51,US Dollar,Reinvestment,0,4997.0,28438008789D0,28438008789D0
4998,2022-09-01 00:29:00,1674,80087AC10,1674,80087AC10,13.93,US Dollar,13.93,US Dollar,Reinvestment,0,4998.0,167480087AC10,167480087AC10


This newly enriched set of transactions can now be used to train a ML model. Once trained, the model can be used for prediction (e.g., detect anomalies) on new (unlabeled) transactions. The main steps associated with this use case is shown below:

<div> <img src="img/gfp-use-case2.png" width="1000"> </div>

In [13]:

mon_laun_temp['transactionID'].nunique()

5000

In [14]:

df_xgboost = pd.merge(df_enriched, mon_laun_temp, on='transactionID')
display(df_xgboost)

Unnamed: 0,transactionID,sourceAccountID,targetAccountID,timestamp,fan_in_bins_2-3,fan_in_bins_3-inf,fan_out_bins_2-3,fan_out_bins_3-inf,degree_in_bins_2-3,degree_in_bins_3-inf,degree_out_bins_2-3,degree_out_bins_3-inf,scatter-gather_bins_2-3,scatter-gather_bins_3-inf,temp-cycle_bins_2-3,temp-cycle_bins_3-inf,source_fan_out,source_deg_out,source_ratio_out,source_avg_col3_out,source_sum_col3_out,source_var_col3_out,source_skew_col3_out,source_kurtosis_col3_out,source_fan_in,source_deg_in,source_ratio_in,source_avg_col3_in,source_sum_col3_in,source_var_col3_in,source_skew_col3_in,source_kurtosis_col3_in,dest_fan_out,dest_deg_out,dest_ratio_out,dest_avg_col3_out,dest_sum_col3_out,dest_var_col3_out,dest_skew_col3_out,dest_kurtosis_col3_out,dest_fan_in,dest_deg_in,dest_ratio_in,dest_avg_col3_in,dest_sum_col3_in,dest_var_col3_in,dest_skew_col3_in,dest_kurtosis_col3_in,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,sourceAccount,targetAccount
0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,2022-09-01 00:20:00,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0,108000EBD30,108000EBD30
1,1.0,1.0,2.0,20.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,20.0,40.0,0.0,0.0,0.0,2022-09-01 00:20:00,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0,32088000F4580,18000F5340
2,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,2022-09-01 00:00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0,32098000F4670,32098000F4670
3,3.0,4.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,2022-09-01 00:02:00,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0,128000F5030,128000F5030
4,4.0,5.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0,1.0,2.0,2.0,6.0,12.0,0.0,0.0,0.0,2022-09-01 00:06:00,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0,108000F5200,108000F5200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995.0,4184.0,4184.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,2022-09-01 00:02:00,1674,80087D1B0,1674,80087D1B0,12.19,US Dollar,12.19,US Dollar,Reinvestment,0,167480087D1B0,167480087D1B0
4996,4996.0,4185.0,4185.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,2022-09-01 00:19:00,220,80087EA60,220,80087EA60,2075.73,US Dollar,2075.73,US Dollar,Reinvestment,0,22080087EA60,22080087EA60
4997,4997.0,4186.0,4186.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,1.0,2.0,2.0,19.0,38.0,0.0,0.0,0.0,2022-09-01 00:19:00,2843,8008789D0,2843,8008789D0,604.51,US Dollar,604.51,US Dollar,Reinvestment,0,28438008789D0,28438008789D0
4998,4998.0,4187.0,4187.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0,1.0,2.0,2.0,29.0,58.0,0.0,0.0,0.0,2022-09-01 00:29:00,1674,80087AC10,1674,80087AC10,13.93,US Dollar,13.93,US Dollar,Reinvestment,0,167480087AC10,167480087AC10


In [15]:
df_xgboost.to_csv('../datasets/mon_laun_dataset/merged_dataset.csv')

In [16]:
# Path to the file that contains financial transactions used for testing
test_transactions_path = "../datasets/graph_feature_preprocessor/aml_custom_test.txt"

print("Loading the test transactions ")
X_test = np.loadtxt(test_transactions_path, dtype=np.float64, delimiter=" ", comments="#", usecols=range(4))
print("Input dataset shape: ", X_test.shape)

df = pd.DataFrame(X_test, columns=['transactionID', 'sourceAccountID', 'destinationAccountID', 'timestamp'])
display(df)

Loading the test transactions 
Input dataset shape:  (8, 4)


Unnamed: 0,transactionID,sourceAccountID,destinationAccountID,timestamp
0,8.0,8.0,9.0,8.0
1,9.0,9.0,10.0,9.0
2,10.0,10.0,11.0,10.0
3,11.0,9.0,11.0,11.0
4,12.0,11.0,9.0,12.0
5,13.0,11.0,8.0,13.0
6,14.0,8.0,10.0,14.0
7,15.0,10.0,8.0,15.0


In [17]:
print("Creating a graph feature preprocessor ")
gp = GraphFeaturePreprocessor()

print("Setting the parameters of the graph feature preprocessor ")
gp.set_params(params)

print("Creating the graph using the training transactions ")
gp.fit(X_train)  # this step is optional, however recommended for capturing deeper graph feature

# transform can run on single transactions or on batches of transactions
print("Enriching the test transactions with new graph features ")
X_test_enriched = gp.transform(X_test.astype("float64"))
print_enriched_transaction(X_test_enriched, gp.get_params())

Creating a graph feature preprocessor 
Setting the parameters of the graph feature preprocessor 
Creating the graph using the training transactions 
Enriching the test transactions with new graph features 


Unnamed: 0,transactionID,sourceAccountID,targetAccountID,timestamp,fan_in_bins_2-3,fan_in_bins_3-inf,fan_out_bins_2-3,fan_out_bins_3-inf,degree_in_bins_2-3,degree_in_bins_3-inf,degree_out_bins_2-3,degree_out_bins_3-inf,scatter-gather_bins_2-3,scatter-gather_bins_3-inf,temp-cycle_bins_2-3,temp-cycle_bins_3-inf,source_fan_out,source_deg_out,source_ratio_out,source_avg_col3_out,source_sum_col3_out,source_var_col3_out,source_skew_col3_out,source_kurtosis_col3_out,source_fan_in,source_deg_in,source_ratio_in,source_avg_col3_in,source_sum_col3_in,source_var_col3_in,source_skew_col3_in,source_kurtosis_col3_in,dest_fan_out,dest_deg_out,dest_ratio_out,dest_avg_col3_out,dest_sum_col3_out,dest_var_col3_out,dest_skew_col3_out,dest_kurtosis_col3_out,dest_fan_in,dest_deg_in,dest_ratio_in,dest_avg_col3_in,dest_sum_col3_in,dest_var_col3_in,dest_skew_col3_in,dest_kurtosis_col3_in
0,8.0,8.0,9.0,8.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
1,9.0,9.0,10.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0
2,10.0,10.0,11.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0,3.0,3.0,1.0,21.0,42.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
3,11.0,9.0,11.0,11.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0,21.0,42.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
4,12.0,11.0,9.0,12.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,3.0,1.0,21.0,42.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
5,13.0,11.0,8.0,13.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,3.0,3.0,1.0,21.0,42.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0
6,14.0,8.0,10.0,14.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0
7,15.0,10.0,8.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.5,26.0,52.0,0.0,0.0,0.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0,3.0,3.0,1.0,16.0,16.0,0.0,0.0,0.0


Now the enriched transactions can be used as input to the ML model previously trained. 