# IMI Big Data Competition - Anti-Money Laundering
# Unsupervised classification approach

This script identifies high-risk money laundering bank accounts using a simple neural network and newly designed features 

In [None]:
#Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
import itertools
from sklearn import metrics
from IPython.display import clear_output
import time
import keras
from sklearn import preprocessing
import multiprocessing as mp

In [None]:
# Load the previously trained neural network from disk and print its layer summary
model_dir = 'NN_final_models/model18'
final_model = keras.models.load_model(model_dir)
final_model.summary()
# Report how many CPU cores this machine exposes
cpu_total = mp.cpu_count()
print("Number of processors available for parallel computing: ", cpu_total)

## Designing new features to be used in identifying high-risk money laundering accounts


In [None]:
def compute_trans_features(cus_group, n_total_months=13.0):
  """Build one row of monthly-aggregated transaction features for a customer.

  Parameters
  ----------
  cus_group : pandas.DataFrame
      Transactions of a single customer. The columns read here are
      'month', 'trsactn_type', 'in_amt', 'in_cnt', 'out_amt' and 'out_cnt'.
  n_total_months : float, default 13.0
      Divisor used for every average and for rescaling every variance.
      NOTE(review): presumably the length of the observation window in
      months - confirm against the data set.

  Returns
  -------
  pandas.DataFrame
      A single-row frame (index [0]) with 48 columns: the average and
      rescaled standard deviation of the amount per transaction and of the
      monthly amount (incoming and outgoing), followed by, for each of the
      transaction types cash, cheque, visa, debit and amex, the average and
      rescaled standard deviation of that type's monthly share of the
      incoming/outgoing amount and count.

  Notes
  -----
  Averages are ``nansum(monthly values) / n_total_months``. Standard
  deviations are ``sqrt(nanstd(monthly values)**2 * n_mon / n_total_months)``
  where ``n_mon`` is the number of distinct months present in ``cus_group``.
  """
  value_cols = ['in_amt', 'in_cnt', 'out_amt', 'out_cnt']
  type_names = ('cash', 'cheque', 'visa', 'debit', 'amex')

  # Only the four numeric columns are ever used, so restrict the aggregation
  # to them instead of summing every column of the frame.
  by_mon = cus_group.groupby('month')[value_cols].sum()
  n_mon = len(cus_group['month'].unique())

  def _avg_std(monthly):
    # Average over the fixed window and variance rescaled by the share of
    # months in which the customer was active.
    avg = np.nansum(monthly) / n_total_months
    std = np.sqrt(np.nanstd(monthly) ** 2 * n_mon / n_total_months)
    return avg, std

  def _clean_share(numerator, denominator):
    # The cleaning must be applied to the *ratio*: 0/0 (NaN) and x/0 (+-inf)
    # both become 0 so that no non-finite value reaches the feature matrix.
    share = numerator / denominator
    return share.fillna(0).replace([np.inf, -np.inf], 0)

  features = {}

  # Amount per transaction: months without transactions (0/0) count as 0.
  for direction in ('in', 'out'):
    per_cnt = (by_mon[direction + '_amt'] / by_mon[direction + '_cnt']).fillna(0)
    avg, std = _avg_std(per_cnt)
    features[direction + '_per_cnt_avg'] = avg
    features[direction + '_per_cnt_std'] = std

  # Raw monthly amounts.
  for direction in ('in', 'out'):
    avg, std = _avg_std(by_mon[direction + '_amt'])
    features[direction + '_amt_avg'] = avg
    features[direction + '_amt_std'] = std

  # Monthly share of each transaction type in the customer's totals. Months
  # in which the type does not occur align to NaN and are counted as 0.
  for type_name in type_names:
    type_rows = cus_group[cus_group['trsactn_type'] == type_name]
    by_mon_type = type_rows.groupby('month')[value_cols].sum()
    for col in value_cols:
      avg, std = _avg_std(_clean_share(by_mon_type[col], by_mon[col]))
      features['per_{}_{}_avg'.format(type_name, col)] = avg
      features['per_{}_{}_std'.format(type_name, col)] = std

  return pd.DataFrame(features, index=[0])


In [None]:
## Loading the transaction file
# The parquet file is expected under bigdata2021data/ relative to the current working directory
trans_path = "bigdata2021data/transaction_train.parquet"
df_trans = pd.read_parquet(trans_path)
# Group the transactions per customer so features can be computed customer by customer
df_trans_by_cus = df_trans.groupby(by='customer_id_mskd')

In [None]:
# Compute the engineered features for every customer in parallel and collect
# them in a new dataframe
from pandarallel import pandarallel
# Use half of the available cores, but never fewer than one worker.
# (np.int was removed in NumPy 1.24, so plain integer division is used.)
pandarallel.initialize(nb_workers=max(1, mp.cpu_count() // 2))
df_trans_features = df_trans_by_cus.parallel_apply(compute_trans_features)

In [None]:
# Standardise every feature column in place (index and column labels are kept),
# then preview the first rows with .head()
scaler = preprocessing.StandardScaler()
df_trans_features.iloc[:, :] = scaler.fit_transform(df_trans_features)
df_trans_features.head()



In [None]:
# Collect the distinct index entries (customer IDs) of the feature dataframe
# and show how many unique customers there are
feature_index = df_trans_features.index
cus_id = feature_index.unique()
len(cus_id)

In [None]:
# Load the separate customer data file, index it by customer ID and keep only
# the customers that also appear in the transaction features
customer_path = 'customer_training.parquet'
df_cus = pd.read_parquet(customer_path).set_index('customer_id_mskd')
df_cus = df_cus.loc[cus_id, :]

In [None]:
# Drop empty columns or columns with incomplete data
unused_columns = ['client_type_aml', 'primary_ownership_flag', 'industry_code_aml', 'occupation_code_aml',
                  'jurisdiction_code', 'customer_status_aml', 'country_of_domicile_aml',
                  'PCD_LLC', 'PCD_MOR', 'PCD_SAV', 'PCD_SDB', 'PCD_TED',
                  'export_ts', 'PCD_CMS', 'PCD_MUF',
                  'SRV_FLG', 'SRV_FSL', 'SRV_ILC', 'SRV_LOC', 'SRV_NLG', 'SRV_NSL', 'SRV_TRF']
df_cus = df_cus.drop(columns=unused_columns)

# Replace NA's with 0's. The result is assigned back to the column: calling an
# inplace method on df_cus[col] acts on a temporary object and does not update
# df_cus when pandas Copy-on-Write is active.
df_cus["PCD_CDA"] = df_cus["PCD_CDA"].fillna(0)
df_cus["PCD_CRC"] = df_cus["PCD_CRC"].fillna(0)

# Turn true/false values into 1/0
df_cus['PRD_INFO_AVAIL'] = df_cus['PRD_INFO_AVAIL'].replace({False: 0, True: 1})

# One-hot encode the two categorical columns. The dtype is pinned to uint8
# (the default before pandas 2.0) so the indicators stay numeric rather than
# boolean when the frame is later converted to an array.
category_columns = ['occupation_status_code_aml', 'relationship_type']
dummies = pd.get_dummies(df_cus[category_columns], dtype=np.uint8)
df_cus = df_cus.merge(dummies, on='customer_id_mskd')
df_cus.drop(columns=category_columns, inplace=True)

# Force both spellings of the power-of-attorney indicator to 0
df_cus[['relationship_type_POWER OF ATTORNEY', 'relationship_type_Power of Attorney']] = 0
df_cus.head()

In [None]:
# Join the transaction features with the customer attributes on the customer ID
# to obtain a single table for further analyses
df_combined = pd.merge(df_trans_features, df_cus, on='customer_id_mskd')
df_combined.head()

## Building and testing the model using NN and Kmeans Clustering

In [None]:
from sklearn import cluster

# function to do kmeans and add data to dataframe

def do_kmeans(df, feats_use, k = 12, col_name = 'cluster'):
    """Run k-means on selected columns and store the labels in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric feature columns. It is modified in place: a new
        column ``col_name`` holding the cluster labels is added.
    feats_use : list or array-like
        Names of the feature columns to cluster on.
    k : int, default 12
        Number of clusters.
    col_name : str, default 'cluster'
        Name of the column that receives the labels.

    Returns
    -------
    None
        The labels are written to ``df[col_name]`` as strings so they are
        treated as categorical values.
    """
    data_use = df[feats_use].to_numpy().astype('float64')
    # setup + run kmeans
    # n_init is pinned to 10 (the long-standing scikit-learn default) so the
    # clustering does not silently change on versions where the default is 'auto'.
    kmeans_obj = cluster.KMeans(n_clusters = k, random_state = 20, n_init = 10)
    clust_labs = kmeans_obj.fit_predict(data_use)
    # convert clust_labs to string (categorical)
    clust_labs = clust_labs.astype('str')
    df[col_name] = clust_labs

In [None]:
# Score every customer with the trained network and take the most likely class
X_big = df_combined.values
y_big = final_model.predict(X_big)
y_lab = y_big.argmax(axis=1)

# Keep the customers assigned to class index 2. An explicit copy is taken
# because do_kmeans adds a column to the frame it receives; writing into a
# selection of df_combined would trigger pandas' SettingWithCopyWarning.
df_lab_3 = df_combined.iloc[y_lab==2,:].copy()

# Split the selected customers into two k-means clusters (labels stored in 'clust')
do_kmeans(df_lab_3, feats_use = df_lab_3.columns, k = 2, col_name = 'clust')

df_lab_3