In [115]:
#Import all required libraries
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, average_precision_score,recall_score,precision_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import joblib


In [116]:
# Load the feature matrices (with node2vec-derived embeddings) and the label files
path_X_train = './X_train_with_embeddings.csv'
path_X_test = './X_test_with_embeddings.csv'
path_y_train = './sample_y_train.csv'
path_y_test = './sample_y_test.csv'

# Training/validation pair first, then the test pair
X_trainval, y_trainval = (pd.read_csv(csv_path) for csv_path in (path_X_train, path_y_train))
X_test, y_test = (pd.read_csv(csv_path) for csv_path in (path_X_test, path_y_test))


In [117]:
# Preprocess the training/validation features:
#   1. drop rows containing missing values and the "Unnamed: 0" column,
#   2. one-hot encode the three categorical columns
#      ("Receiving Currency", "Payment Currency", "Payment Format").
X_trainval = X_trainval.dropna(axis=0)
X_trainval = X_trainval.drop(["Unnamed: 0"], axis=1)
# NOTE(review): y_trainval is cleaned by its own dropna() in a later cell; if rows are
# actually dropped here, features and labels may stop lining up row-for-row - confirm.


# Each categorical column is replaced by one "<column>_<category>" indicator column per category
to_encode = ["Receiving Currency","Payment Currency","Payment Format"]
lab = preprocessing.OneHotEncoder()
for i in to_encode:
  data = lab.fit_transform(X_trainval[[i]]).toarray()
  # Build the indicator frame on X_trainval's own index. After dropna() the row labels may
  # have gaps, and pd.concat(axis=1) aligns on labels: a fresh 0..n-1 index would shift the
  # indicators onto the wrong rows and create extra all-NaN rows.
  temp = pd.DataFrame(data, columns=lab.categories_[0], index=X_trainval.index).add_prefix(i+"_")
  X_trainval = pd.concat([X_trainval, temp], axis=1)
  X_trainval = X_trainval.drop(i,axis=1)
X_trainval

Unnamed: 0,Amount Received,Amount Paid,1,2,3,4,5,6,7,8,...,Payment Currency_US Dollar,Payment Currency_Yen,Payment Currency_Yuan,Payment Format_ACH,Payment Format_Bitcoin,Payment Format_Cash,Payment Format_Cheque,Payment Format_Credit Card,Payment Format_Reinvestment,Payment Format_Wire
0,1.686000e+01,1.686000e+01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.582900e+02,1.582900e+02,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.141542e+06,1.141542e+06,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2.001000e+01,2.001000e+01,-0.276856,0.106236,0.093537,0.359779,0.043401,0.189273,-0.205203,0.094851,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.636000e+01,1.636000e+01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108016,8.119580e+03,8.119580e+03,-1.069545,0.022971,0.221335,0.318252,-0.161901,0.695500,-0.300227,-0.101233,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
108017,1.407930e+03,1.407930e+03,-0.368204,-0.268510,-0.002042,-0.205000,-0.810714,0.238681,0.097748,-0.569226,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
108018,1.654137e+05,1.654137e+05,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
108019,2.730550e-01,2.730550e-01,-0.634055,-0.037375,0.013376,-0.366108,-1.192737,0.391881,-0.020475,-0.369672,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [118]:
# Preprocess the test features the same way as the training/validation features:
#   1. drop rows containing missing values and the "Unnamed: 0" column,
#   2. one-hot encode "Receiving Currency", "Payment Currency" and "Payment Format",
#   3. align the resulting columns with the already-encoded X_trainval.
X_test = X_test.dropna(axis=0)
X_test = X_test.drop(["Unnamed: 0"], axis=1)
# NOTE(review): y_test is cleaned by its own dropna() in a later cell; if rows are
# actually dropped here, features and labels may stop lining up row-for-row - confirm.


# Each categorical column is replaced by one "<column>_<category>" indicator column per category
to_encode = ["Receiving Currency","Payment Currency","Payment Format"]
lab = preprocessing.OneHotEncoder()
for i in to_encode:
  data = lab.fit_transform(X_test[[i]]).toarray()
  # Build the indicator frame on X_test's own index. After dropna() the row labels may have
  # gaps, and pd.concat(axis=1) aligns on labels: a fresh 0..n-1 index would shift the
  # indicators onto the wrong rows and create extra all-NaN rows.
  temp = pd.DataFrame(data, columns=lab.categories_[0], index=X_test.index).add_prefix(i+"_")
  X_test = pd.concat([X_test, temp], axis=1)
  X_test = X_test.drop(i,axis=1)

# The encoder above is fitted on the test data alone, so a category that is missing from
# (or unique to) the test split would give X_test a different column set than X_trainval.
# Align to the training columns: indicator columns absent from the test split become
# all-zero, indicator columns unknown to training are dropped, and column order matches.
one_hot_prefixes = tuple(col + "_" for col in to_encode)
unexpected_missing = [
    col for col in X_trainval.columns.difference(X_test.columns)
    if not str(col).startswith(one_hot_prefixes)
]
# Only one-hot indicator columns may be zero-filled; anything else missing is a real error
# (for example X_trainval has not been preprocessed yet).
assert not unexpected_missing, (
    "X_test lacks non-indicator columns present in X_trainval: " + str(unexpected_missing)
)
X_test = X_test.reindex(columns=X_trainval.columns, fill_value=0.0)
X_test

Unnamed: 0,Amount Received,Amount Paid,1,2,3,4,5,6,7,8,...,Payment Currency_US Dollar,Payment Currency_Yen,Payment Currency_Yuan,Payment Format_ACH,Payment Format_Bitcoin,Payment Format_Cash,Payment Format_Cheque,Payment Format_Credit Card,Payment Format_Reinvestment,Payment Format_Wire
0,4497.340000,4497.340000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,354.530000,354.530000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,328.610000,328.610000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,26737.270000,26737.270000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,552.900000,552.900000,-0.157684,-0.258623,0.117073,0.207246,0.447201,0.212264,0.160194,0.193124,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11998,445780.530000,445780.530000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11999,149.770000,149.770000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12000,765.810000,765.810000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12001,0.022322,0.022322,0.245446,-0.431915,0.154039,0.280300,-0.308729,-0.091115,0.364544,-0.400463,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [119]:
# Clean the training/validation labels: drop rows with missing values, then the "Unnamed: 0" column
y_trainval = y_trainval.dropna(axis=0).drop(columns=["Unnamed: 0"])
y_trainval


Unnamed: 0,Is Laundering
0,0
1,0
2,0
3,0
4,0
...,...
108016,0
108017,0
108018,0
108019,0


In [120]:

# Clean the test labels: drop rows with missing values, then the "Unnamed: 0" column
y_test = y_test.dropna(axis=0).drop(columns=["Unnamed: 0"])
y_test


Unnamed: 0,Is Laundering
0,0
1,0
2,0
3,0
4,0
...,...
11998,0
11999,0
12000,0
12001,0


In [121]:
# Flatten the training/validation labels into a 1-D array (row-major order)
y_trainval = np.asarray(y_trainval).ravel(order='C')


In [122]:
# Flatten the test labels into a 1-D array (row-major order)
y_test = np.asarray(y_test).ravel(order='C')

In [123]:
## Random under-sampler (fixed seed) used to balance the classes, since label 1 is scarce
rus = RandomUnderSampler(random_state=13)

## Shuffled, seeded stratified 10-fold splitter; passed to GridSearchCV as `cv`
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

## Under-sample the training/validation data only; the test set is not resampled here
## NOTE(review): resampling happens before cross-validation, so the CV folds are drawn
## from the already-balanced data while the test set keeps its original class ratio.
X_trainval, y_trainval = rus.fit_resample(X_trainval, y_trainval)


# Custom metric used for model selection
def weighted_accuracy(y_true, y_pred):
    """Return 0.7 * recall + 0.3 * precision for the given labels and predictions."""
    return 0.7 * recall_score(y_true, y_pred) + 0.3 * precision_score(y_true, y_pred)


# Wrap the custom metric so it can be supplied as a `scoring` argument
scoring = make_scorer(weighted_accuracy)
# Base estimator to be tuned: gradient-boosted trees with a fixed seed
dt = GradientBoostingClassifier(random_state=13)

In [87]:
# Hyperparameter grid (per the original note: same parameters as the "Improved Model")
learning_rates = [0.01, 1]
n_estimators = [10, 50, 250, 1000]
tree_d = range(2, 11, 2)
min_sam_leaves = [5, 10]
param_g = dict(
    max_depth=tree_d,
    learning_rate=learning_rates,
    n_estimators=n_estimators,
    min_samples_leaf=min_sam_leaves,
)

# Grid-search the GradientBoostingClassifier `dt` over the stratified folds `skf`.
# Candidates are ranked by the custom `scoring` object (0.7 * recall + 0.3 * precision),
# not by AUPRC, despite the variable name; the best candidate is refitted on all of X_trainval.
dt_model_auprc_proper = GridSearchCV(
    estimator=dt,
    param_grid=param_g,
    scoring=scoring,
    cv=skf,
    refit=True,
    n_jobs=-1,
    verbose=2,
)
dt_model_auprc_proper = dt_model_auprc_proper.fit(X_trainval, y_trainval)


Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [88]:
# Show the hyperparameter combination selected by the grid search
# (selection used the custom 0.7 * recall + 0.3 * precision scorer)
f"The optimal hyperparameters chosen are {dt_model_auprc_proper.best_params_!s}"

"The optimal hyperparameters chosen are {'learning_rate': 0.01, 'max_depth': 2, 'min_samples_leaf': 10, 'n_estimators': 250}"

In [89]:
# NOTE(review): the label says "AUPRC", but best_score_ is the mean cross-validated value of
# the custom scorer given to GridSearchCV (0.7 * recall + 0.3 * precision).
f"The best AUPRC results over the training/validation dataset using 10-fold CV is {dt_model_auprc_proper.best_score_!s}"

'The best AUPRC results over the training/validation dataset using 10-fold CV is 0.8664722222222221'

In [90]:
# NOTE(review): the label says "AUPRC", but .score() applies the custom scorer given to
# GridSearchCV (0.7 * recall + 0.3 * precision) to the refitted best model.
f"The best AUPRC results over the test dataset is {dt_model_auprc_proper.score(X_test, y_test)!s}"

'The best AUPRC results over the test dataset is 0.7012587412587412'

In [124]:
## Feature selection: rank every feature by its mutual information with the label and keep
## the 10 highest-scoring ones. The aim is to reduce dimensionality, because little training
## data is left at this point, and so to avoid the curse of dimensionality.
from sklearn.feature_selection import mutual_info_classif, SelectKBest

# Mutual-information score of each column of X_trainval, labelled by column name
mutual_info = pd.Series(
    mutual_info_classif(X_trainval, y_trainval, random_state=13),
    index=X_trainval.columns,
)

# Number of features to keep; show the top scorers in descending order
no_of_top_features = 10
mutual_info.sort_values(ascending=False).head(no_of_top_features)



Payment Format_ACH               0.240775
73                               0.093823
Payment Format_Cash              0.092789
95                               0.077613
Payment Format_Cheque            0.077349
Payment Currency_Bitcoin         0.072120
Payment Currency_Mexican Peso    0.065968
Amount Received                  0.065841
72                               0.065620
Amount Paid                      0.063817
dtype: float64

In [125]:
# Column names of the `no_of_top_features` highest mutual-information features, best first
top_n_most_mi = mutual_info.sort_values(ascending=False).head(no_of_top_features).index.tolist()

In [126]:
# Keep only the selected top mutual-information columns in the training/validation features
X_trainval = X_trainval.loc[:, top_n_most_mi]
X_trainval

Unnamed: 0,Payment Format_ACH,73,Payment Format_Cash,95,Payment Format_Cheque,Payment Currency_Bitcoin,Payment Currency_Mexican Peso,Amount Received,72,Amount Paid
0,0.0,-0.187922,0.0,-0.167515,0.0,0.0,0.0,4.282400e+03,-0.219089,4.282400e+03
1,0.0,0.000000,0.0,0.000000,1.0,0.0,0.0,3.163900e+02,0.000000,3.163900e+02
2,0.0,-0.099509,0.0,-0.522903,1.0,0.0,0.0,1.056937e+06,-0.073222,1.056937e+06
3,0.0,0.000000,0.0,0.000000,0.0,1.0,0.0,2.339700e-02,0.000000,2.339700e-02
4,0.0,-0.856196,0.0,-0.169570,0.0,0.0,0.0,2.669410e+03,0.084006,2.669410e+03
...,...,...,...,...,...,...,...,...,...,...
157,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.561200e+02,0.000000,2.561200e+02
158,1.0,-0.691224,0.0,-0.071385,0.0,0.0,0.0,1.316780e+03,-0.383290,1.316780e+03
159,1.0,-0.822063,0.0,-0.127975,0.0,0.0,0.0,1.310814e+04,-0.473795,1.310814e+04
160,0.0,-0.373186,0.0,0.087501,1.0,0.0,0.0,1.825860e+03,-0.167964,1.825860e+03


In [127]:
# Keep the same selected columns (in the same order) in the test features
X_test = X_test.loc[:, top_n_most_mi]
X_test

Unnamed: 0,Payment Format_ACH,73,Payment Format_Cash,95,Payment Format_Cheque,Payment Currency_Bitcoin,Payment Currency_Mexican Peso,Amount Received,72,Amount Paid
0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,4497.340000,0.000000,4497.340000
1,0.0,0.000000,0.0,0.000000,1.0,0.0,0.0,354.530000,0.000000,354.530000
2,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,328.610000,0.000000,328.610000
3,0.0,0.000000,0.0,0.000000,1.0,0.0,0.0,26737.270000,0.000000,26737.270000
4,0.0,-0.329472,0.0,0.198115,1.0,0.0,0.0,552.900000,-0.231712,552.900000
...,...,...,...,...,...,...,...,...,...,...
11998,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,445780.530000,0.000000,445780.530000
11999,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,149.770000,0.000000,149.770000
12000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,765.810000,0.000000,765.810000
12001,0.0,-0.307914,0.0,-0.351875,0.0,1.0,0.0,0.022322,-0.396015,0.022322


In [128]:
# Hyperparameter grid (per the original note: same parameters as the "Improved Model")
learning_rates = [0.01, 1]
n_estimators = [10, 50, 250, 1000]
tree_d = range(2, 11, 2)
min_sam_leaves = [5, 10]
param_g = dict(
    max_depth=tree_d,
    learning_rate=learning_rates,
    n_estimators=n_estimators,
    min_samples_leaf=min_sam_leaves,
)

# Repeat the grid search for the GradientBoostingClassifier `dt`, this time on the
# feature-selected X_trainval. Candidates are ranked by the custom `scoring` object
# (0.7 * recall + 0.3 * precision), not by AUPRC, despite the variable name.
dt_model_auprc_proper = GridSearchCV(
    estimator=dt,
    param_grid=param_g,
    scoring=scoring,
    cv=skf,
    refit=True,
    n_jobs=-1,
    verbose=2,
)
dt_model_auprc_proper = dt_model_auprc_proper.fit(X_trainval, y_trainval)


Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [129]:
# Show the hyperparameter combination selected by the grid search on the reduced feature set
# (selection used the custom 0.7 * recall + 0.3 * precision scorer)
f"The optimal hyperparameters chosen are {dt_model_auprc_proper.best_params_!s}"

"The optimal hyperparameters chosen are {'learning_rate': 0.01, 'max_depth': 6, 'min_samples_leaf': 5, 'n_estimators': 10}"

In [130]:
# NOTE(review): the label says "AUPRC", but best_score_ is the mean cross-validated value of
# the custom scorer given to GridSearchCV (0.7 * recall + 0.3 * precision).
f"The best AUPRC results over the training/validation dataset using 10-fold CV is {dt_model_auprc_proper.best_score_!s}"

'The best AUPRC results over the training/validation dataset using 10-fold CV is 0.8483809523809522'

In [131]:
# NOTE(review): the label says "AUPRC", but .score() applies the custom scorer given to
# GridSearchCV (0.7 * recall + 0.3 * precision) to the refitted best model.
f"The best AUPRC results over the test dataset is {dt_model_auprc_proper.score(X_test, y_test)!s}"

'The best AUPRC results over the test dataset is 0.5843656802018812'

In [99]:
# Gather the selected hyperparameters and both scores into a one-row summary table.
# NOTE(review): the "AUPRC" column labels hold values of the custom scorer given to
# GridSearchCV (0.7 * recall + 0.3 * precision).
data = {
    'Optimal Hyperparemater': str(dt_model_auprc_proper.best_params_),
    'Best AUPRC Results over the Training/Validation dataset': dt_model_auprc_proper.best_score_,
    'Best AUPRC Results over the Test dataset': dt_model_auprc_proper.score(X_test, y_test),
}

# Single row labelled '1'
result = pd.DataFrame(data, index=['1'])
result

Unnamed: 0,Optimal Hyperparemater,Best AUPRC Results over the Training/Validation dataset,Best AUPRC Results over the Test dataset
1,"{'learning_rate': 0.01, 'max_depth': 6, 'min_s...",0.848381,0.584366
