# IMI Big Data Competition - Anti-money Laundering
# Unsupervised classification approach

This script designs features to be input for further analysis such as PCA or linear regression

In [None]:
# import required libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os

%matplotlib inline

## Load the clean merged data
Note: customers without transaction data were removed when merging the cust_train and trsact_train data sets

Overview of variable information
* The first five columns correspond to information given by the customer
* The variable "rating" means risk of AML: 1 = low risk, 2 = medium risk, 3 = potentially high risk
* Columns beginning with “PCD” or “SRV” correspond to ‘product’ or ‘account’ information (e.g., PCD_MOR: how many accounts of type “MOR” does this customer have)
* "in_amt" and "out_amt": The total volume entering and exiting each product for each customer
* "in_cnt" and "out_cnt": The total count of transactions over which that volume was distributed

For detailed information about the clean merge data, see aml_preprocessing_script.ipynb

In [None]:
# Load the clean merged data and view the dtypes contained
# NOTE: rootDir is a machine-specific placeholder; point it at the folder
# that holds merged_clean_df.parquet before running this cell.
rootDir = '/Users/Me/Local/Directory/'
filePath = os.path.join(rootDir,'merged_clean_df.parquet')
merged_df = pd.read_parquet(filePath)
# Print the column names, non-null counts and dtypes of the loaded frame
merged_df.info()

In [None]:
# Columns treated as categorical variables
cols_cat = [
    'jurisdiction_code',
    'client_type_aml',
    'occupation_code_aml',
    'country_of_domicile_aml',
    'occupation_status_code_aml',
    'customer_status_aml',
    'primary_ownership_flag',
    'relationship_type',
    'trsactn_type',
]

# Columns treated as continuous variables: the PCD_* product columns
# followed by the incoming/outgoing amount and count columns
cols_cont = ['PCD_' + code for code in ('CDA', 'CRC', 'LLC', 'MOR', 'SAV', 'SDB', 'TED')]
cols_cont += ['in_amt', 'in_cnt', 'out_amt', 'out_cnt']

In [None]:
# Label the customer risk rating: 1 -> 'low', 2 -> 'medium', 3 -> 'high'.
# Mapping the column directly keeps every label on the row it belongs to,
# whatever index merged_df carries. A rating outside 1-3 (or a missing one)
# becomes NaN instead of being skipped, so it cannot shift the labels of the
# rows that follow it. Re-running the cell overwrites the column rather than
# adding a duplicate one.
rating_names = {1: 'low', 2: 'medium', 3: 'high'}
merged_df['rating_lbl'] = merged_df['rating'].map(rating_names)

### The following categorical variables have less than (or equal to) 3 levels across customers:
* jurisdiction_code
* client_type_aml 
* customer_status_aml
* primary_ownership_flag


In [None]:
# Count the non-null 'rating' entries for every (customer, month) pair.
# The resulting table shows that a customer can have several transaction rows
# within a single month.
merged_df.groupby(['customer_id_mskd','month'])['rating'].count()


## Create some new variables
### For each customer, each month, each type of transaction (i.e., for each row in merged_df)
* Amount of deposit per deposit 
* Amount of withdrawal per withdrawal
* Deposit-to-withdrawal ratio
* Withdrawal-to-deposit ratio

In [None]:
def _mean_per_transaction(amount, count):
    """Return amount / count element-wise, with NaN wherever count is 0.

    Guarding on the count alone (rather than on amount and count both being 0)
    keeps a row with a non-zero amount but a zero count from producing +/-inf.
    """
    return np.where(count == 0, np.nan, amount / count)


def _ratio_or_nan(numer, denom):
    """Return numer / denom element-wise for two aligned Series.

    The result is NaN wherever denom is 0 (the ratio is undefined) and 0
    wherever numer is 0 while denom is not.
    """
    return np.where(denom == 0, np.nan, np.where(numer == 0, 0, numer / denom))


print('Number of rows in the merged_df dataframe:', str(len(merged_df)), 'rows.')

# Average amount of incoming money per incoming transaction (in_amt / in_cnt)
# NaN means there is no incoming transaction in that row
merged_df['in_frac'] = _mean_per_transaction(merged_df['in_amt'], merged_df['in_cnt'])
cond = (merged_df['in_amt'] == 0) & (merged_df['in_cnt'] == 0)
print('Number of rows without IN transaction:', str(cond.sum()), 'rows')

# Average amount of exiting money per exiting transaction (out_amt / out_cnt)
# NaN means there is no exiting transaction in that row
merged_df['out_frac'] = _mean_per_transaction(merged_df['out_amt'], merged_df['out_cnt'])
cond = (merged_df['out_amt'] == 0) & (merged_df['out_cnt'] == 0)
print('Number of rows without exiting transaction:', str(cond.sum()), 'rows')

# Incoming / exiting money ratio (in_amt divided by out_amt)
merged_df['inout_ratio'] = _ratio_or_nan(merged_df['in_amt'], merged_df['out_amt'])
cond = (merged_df['in_amt'] != 0) & (merged_df['out_amt'] == 0)
print('There is IN but no OUT transaction:', cond.sum(), 'rows.')

# Exiting / incoming money ratio (out_amt divided by in_amt)
merged_df['outin_ratio'] = _ratio_or_nan(merged_df['out_amt'], merged_df['in_amt'])
cond = (merged_df['out_amt'] != 0) & (merged_df['in_amt'] == 0)
print('There is OUT but no IN transaction:', cond.sum(), 'rows.')

# Incoming / exiting count ratio (in_cnt divided by out_cnt)
merged_df['inout_cnt_ratio'] = _ratio_or_nan(merged_df['in_cnt'], merged_df['out_cnt'])

# Exiting / incoming count ratio (out_cnt divided by in_cnt)
merged_df['outin_cnt_ratio'] = _ratio_or_nan(merged_df['out_cnt'], merged_df['in_cnt'])

del cond

In [None]:
merged_df['trsactn_type'].unique()

In [None]:
# Organize the data so that the results for each type of transaction can be
# placed in columns.
# Create a dataframe with customer information along with the month column
# (i.e., for each customer, the customer information is repeated across months)
cols_add = ['month','rating','rating_lbl','customer_id_mskd']
cols_sel = cols_cat + cols_add
# The transaction-type column in cols_cat is named 'trsactn_type'
cols_sel.remove('trsactn_type')

# Keep the first non-null value of every selected column per customer and month
trans_each = merged_df[cols_sel].groupby(['customer_id_mskd','month']).first().reset_index()


In [None]:
# Check the dtypes again before any kind of analysis
# (the per-customer, per-month frame is named trans_each)
trans_each.info()

## Sum the data across months for each type of transaction, for each customer
### (i.e., each customer will have 1 row for each transaction type in this dataframe)

In [None]:
# Sum the transaction columns across months so that each customer has one row
# per transaction type. merged_year has to exist before the customer
# information can be merged onto it.
merged_year = (
    merged_df
    .groupby(['customer_id_mskd','trsactn_type'])[cols_cont]
    .sum()
    .reset_index()
)

# Concatenate the other customer information to this dataframe
cols_add = ['rating','rating_lbl','customer_id_mskd']
cols_sel = cols_cat + cols_add

# First non-null value of every customer-information column per
# (customer, transaction type) pair
temp = merged_df[cols_sel].groupby(['customer_id_mskd','trsactn_type']).first().reset_index()

merged_year = merged_year.merge(temp, on=['customer_id_mskd','trsactn_type'], how='left')

del temp

In [None]:
# Add the same new variables as for merged_df, computed on the yearly totals
def _mean_per_transaction(amount, count):
    """Return amount / count element-wise, with NaN wherever count is 0.

    Guarding on the count alone (rather than on amount and count both being 0)
    keeps a row with a non-zero amount but a zero count from producing +/-inf.
    """
    return np.where(count == 0, np.nan, amount / count)


def _ratio_or_nan(numer, denom):
    """Return numer / denom element-wise for two aligned Series.

    The result is NaN wherever denom is 0 (the ratio is undefined) and 0
    wherever numer is 0 while denom is not.
    """
    return np.where(denom == 0, np.nan, np.where(numer == 0, 0, numer / denom))


# Average amount of incoming money per incoming transaction (in_amt / in_cnt)
# NaN means there is no incoming transaction over a year (across months) in that row
merged_year['in_frac'] = _mean_per_transaction(merged_year['in_amt'], merged_year['in_cnt'])
cond = (merged_year['in_amt'] == 0) & (merged_year['in_cnt'] == 0)
print('Number of rows without incoming transaction:', str(cond.sum()), 'rows')

# Average amount of exiting money per exiting transaction (out_amt / out_cnt)
# NaN means there is no exiting transaction over a year (across months) in that row
merged_year['out_frac'] = _mean_per_transaction(merged_year['out_amt'], merged_year['out_cnt'])
cond = (merged_year['out_amt'] == 0) & (merged_year['out_cnt'] == 0)
print('Number of rows without exiting transaction:', str(cond.sum()), 'rows')

# Incoming / exiting money ratio (in_amt divided by out_amt)
merged_year['inout_ratio'] = _ratio_or_nan(merged_year['in_amt'], merged_year['out_amt'])
cond = (merged_year['in_amt'] != 0) & (merged_year['out_amt'] == 0)
print('There is incoming but no existing transaction:', cond.sum(), 'rows.')

# Exiting / incoming money ratio (out_amt divided by in_amt)
merged_year['outin_ratio'] = _ratio_or_nan(merged_year['out_amt'], merged_year['in_amt'])
cond = (merged_year['out_amt'] != 0) & (merged_year['in_amt'] == 0)
print('There is exiting but no incoming transaction:', cond.sum(), 'rows.')

# Incoming / exiting count ratio (in_cnt divided by out_cnt)
merged_year['inout_cnt_ratio'] = _ratio_or_nan(merged_year['in_cnt'], merged_year['out_cnt'])

# Exiting / incoming count ratio (out_cnt divided by in_cnt)
merged_year['outin_cnt_ratio'] = _ratio_or_nan(merged_year['out_cnt'], merged_year['in_cnt'])

del cond

## Organize the yearly data such that the results for each type of transaction are in separate columns

In [None]:
# Build the customer-information frame: one row per customer, holding the
# first non-null value of every selected column
cols_add = ['rating','rating_lbl','customer_id_mskd']
cols_sel = [*cols_cat, *cols_add]
cols_sel.remove('trsactn_type')

trans_each_year = (
    merged_year[cols_sel]
    .groupby(['customer_id_mskd'])
    .first()
    .reset_index()
)

In [None]:
# Columns that are related to transactions: the continuous columns plus the
# derived per-transaction averages and ratios
cols_add = ['in_frac','out_frac','inout_ratio','outin_ratio','inout_cnt_ratio','outin_cnt_ratio']
cols_trans = cols_cont + cols_add

# The customer id is the merge key, so it keeps its name in every pivoted frame
keep_same = {'customer_id_mskd'}

for col in cols_trans: # go through each column related to transactions
    # convert to wide format for a given column: one row per customer,
    # one column per transaction type
    temp = pd.pivot_table(merged_year, index=['customer_id_mskd'], columns='trsactn_type', values=col).reset_index()
    # suffix every transaction-type column with the variable it holds,
    # giving names of the form '<transaction type>_<col>'
    temp.columns = ['{}{}'.format(c, '' if c in keep_same else ('_'+col)) for c in temp.columns]

    # merge the wide format dataframe to the customer information dataframe
    trans_each_year = trans_each_year.merge(temp, on=['customer_id_mskd'], how='outer')

    del temp

## Calculate the average and standard deviation of deposit and withdrawal transactions across months and types of transactions


In [None]:
# Load the raw data sets again because their missing values were not filled
# with zeros
df_cust = pd.read_parquet("cust_train.parquet")
df_trans = pd.read_parquet("transaction_train.parquet")
# merge() defaults to an inner join, so only customer ids present in both
# files are kept
df_merged = df_cust.merge(df_trans, on='customer_id_mskd')

# Start the yearly summary frame with one row per unique customer id
avg_std_year = pd.DataFrame({'customer_id_mskd': df_merged['customer_id_mskd'].unique()})

In [None]:
# For every product and transaction column, add the per-customer sum, mean and
# standard deviation (across months and transaction types) to avg_std_year.
# The new columns are named '<col>_nansum', '<col>_nanmean' and '<col>_nanstd'.
cols_sel = ['PCD_CDA','PCD_CRC','PCD_LLC','PCD_MOR','PCD_SAV','PCD_SDB','PCD_TED',
            'in_amt','in_cnt','out_amt','out_cnt']
for col in cols_sel:
    # NOTE(review): whether pandas calls these NumPy functions directly or
    # swaps in its own sum/mean/std (whose std uses a different ddof) appears
    # to depend on the pandas version - confirm for the version in use.
    temp = df_merged.groupby('customer_id_mskd').agg({
            col: [np.nansum, np.nanmean, np.nanstd]
        }).droplevel(0, axis=1).add_prefix((col+'_')).reset_index()
    avg_std_year = avg_std_year.merge(temp, on=['customer_id_mskd'], how='outer')

    del temp
avg_std_year.info()

## Calculate the amount of money per transaction for deposits and withdrawals across months and transaction types.

## Calculate the deposit-to-withdrawal ratio, the withdrawal-to-deposit ratio, as well as the corresponding ratios for counts.

In [None]:
def _mean_per_transaction(amount, count):
    """Return amount / count element-wise, with NaN wherever count is 0.

    Guarding on the count alone (rather than on amount and count both being 0)
    keeps a row with a non-zero amount but a zero count from producing +/-inf.
    """
    return np.where(count == 0, np.nan, amount / count)


def _ratio_or_nan(numer, denom):
    """Return numer / denom element-wise for two aligned Series.

    The result is NaN wherever denom is 0 (the ratio is undefined) and 0
    wherever numer is 0 while denom is not.
    """
    return np.where(denom == 0, np.nan, np.where(numer == 0, 0, numer / denom))


# Average amount of IN money per IN transaction (in_amt_nansum / in_cnt_nansum)
# NaN means there is no IN transaction over a year (across months) for that customer
avg_std_year['in_frac'] = _mean_per_transaction(avg_std_year['in_amt_nansum'],
                                                avg_std_year['in_cnt_nansum'])

# Average amount of OUT money per OUT transaction (out_amt_nansum / out_cnt_nansum)
# NaN means there is no OUT transaction over a year (across months) for that customer
avg_std_year['out_frac'] = _mean_per_transaction(avg_std_year['out_amt_nansum'],
                                                 avg_std_year['out_cnt_nansum'])

# IN / OUT money ratio (in_amt_nansum divided by out_amt_nansum)
avg_std_year['inout_ratio'] = _ratio_or_nan(avg_std_year['in_amt_nansum'],
                                            avg_std_year['out_amt_nansum'])

# OUT / IN money ratio (out_amt_nansum divided by in_amt_nansum)
avg_std_year['outin_ratio'] = _ratio_or_nan(avg_std_year['out_amt_nansum'],
                                            avg_std_year['in_amt_nansum'])

# IN / OUT count ratio (in_cnt_nansum divided by out_cnt_nansum)
avg_std_year['inout_cnt_ratio'] = _ratio_or_nan(avg_std_year['in_cnt_nansum'],
                                                avg_std_year['out_cnt_nansum'])

# OUT / IN count ratio (out_cnt_nansum divided by in_cnt_nansum)
avg_std_year['outin_cnt_ratio'] = _ratio_or_nan(avg_std_year['out_cnt_nansum'],
                                                avg_std_year['in_cnt_nansum'])

## Calculate the average and standard deviation of deposit and withdrawal transactions across months for each type of transaction

In [None]:
# Per customer and transaction type, compute the mean and standard deviation
# (across months) of the amount and count columns, then spread the transaction
# types into columns named '<transaction type>_<col>_nanmean' / '..._nanstd'.
cols_sel = ['in_amt','in_cnt','out_amt','out_cnt']
col_exclude = ['customer_id_mskd','trsactn_type']
# The customer id is the merge key, so it keeps its name in every pivoted frame
keep_same = {'customer_id_mskd'}

# Start from one row per unique customer id so the merges below have a frame
# to attach to
avg_std_each_trans_year = pd.DataFrame({'customer_id_mskd': df_merged['customer_id_mskd'].unique()})

for col in cols_sel:
    temp = df_merged.groupby(['customer_id_mskd','trsactn_type']).agg({col: [np.nanmean, np.nanstd]}).droplevel(0, axis=1).add_prefix((col+'_')).reset_index()

    # the statistic columns are everything except the two grouping keys
    temp_col = [c for c in temp.columns if c not in col_exclude]

    for col1 in temp_col:
        # convert to wide format: one row per customer, one column per transaction type
        temp1 = pd.pivot_table(temp, index=['customer_id_mskd'], columns='trsactn_type', values=col1).reset_index()
        temp1.columns = ['{}{}'.format(c, '' if c in keep_same else ('_'+col1)) for c in temp1.columns]

        # merge the wide format dataframe to the per-customer summary dataframe
        avg_std_each_trans_year = avg_std_each_trans_year.merge(temp1, on=['customer_id_mskd'], how='outer')

        del temp1

    del temp

## Combine the yearly data that has separate columns for the different types of transactions with the yearly average and standard deviation results


In [None]:
# Join the three per-customer frames on the customer id. Inner joins keep only
# the customers that are present in every frame.
trans_each_year_all = trans_each_year.merge(avg_std_year, on='customer_id_mskd', how='inner')
trans_each_year_all = trans_each_year_all.merge(avg_std_each_trans_year, on='customer_id_mskd', how='inner')
trans_each_year_all.info()

In [None]:
# Save the combined per-customer feature table as a parquet file
# (the relative path resolves against the current working directory)
trans_each_year_all.to_parquet("trans_each_yr.parquet")