In [None]:
# IMI Big Data Competition - Anti-Money Laundering
# Data Preprocessing

# This script preprocesses the AML data and outputs a merged data set

#import required libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

## Inspect data
### Customer data set

The customer data set contains customer-level data on retail accounts for which we have labels

Columns:
* The first five columns correspond to information given by the customer
* The variable "rating" means risk of AML: 1 = low risk, 2 = medium risk, 3 = potentially high risk
* Columns beginning with “PCD” or “SRV” correspond to ‘product’ or ‘account’ information (e.g., PCD_MOR is the number of accounts of type “MOR” that the customer holds)

# Read the customer training data from parquet and preview the first rows
df_cust = pd.read_parquet("cust_train.parquet")
df_cust.head()

# Summarise the column dtypes and non-null counts
df_cust.info()

# Columns in which every single entry is NaN carry no information:
# collect their labels ...
nanCols = df_cust.columns[df_cust.isna().all()]
# ... and drop them from the frame in place
df_cust.drop(columns=nanCols, inplace=True)

# Descriptive statistics, to spot any obvious patterns up front
df_cust.describe()

### Transaction data set

The transaction data set contains monthly aggregated transaction data on retail accounts in the customer data set

Columns:
* "in_amt" and "out_amt": The total volume entering and exiting each product for each customer
* "in_cnt" and "out_cnt": The total count of transactions over which that volume was distributed

# Read the transaction training data from parquet and preview the first rows
df_trans = pd.read_parquet(path="transaction_train.parquet")
df_trans.head(n=5)

# Summarise the column dtypes and non-null counts
df_trans.info()

# Descriptive statistics for the transaction columns
df_trans.describe()

# Merge the customer and transaction data

# Distinct customer IDs in the customer training data set
custID_cust_uniq = df_cust['customer_id_mskd'].unique()
print('Number of unique customer IDs in the customer data:', len(custID_cust_uniq))

# Distinct customer IDs in the transaction training data set
custID_trsact_uniq = df_trans['customer_id_mskd'].unique()
print('Number of unique customer IDs in the transaction data:', len(custID_trsact_uniq))

# Inner join on the customer ID: only IDs present in BOTH frames are kept,
# so customers without any transaction rows are removed from the result
df_merged = df_cust.merge(df_trans, on='customer_id_mskd', how='inner')

# Confirm the dtypes of the merged frame to make sure the merge changed nothing
df_merged.info()

# Continuous variables whose NaNs will be replaced with zeros:
# the product-count columns PCD_CDA ... PCD_TED and the transaction
# columns in_amt, in_cnt, out_amt, out_cnt
cols_cont = ['PCD_CDA', 'PCD_CRC', 'PCD_LLC', 'PCD_MOR', 'PCD_SAV', 'PCD_SDB', 'PCD_TED',
             'in_amt', 'in_cnt', 'out_amt', 'out_cnt']

# Sanity check before filling: verify that NONE of the continuous columns
# already contains an explicit zero. If every check passes, missing values
# can be replaced with zeros. If an assertion fails, that column already has
# at least one zero, and we need to decide whether zero-filling is still
# appropriate for it.
for col in cols_cont:
    assert not (df_merged[col] == 0).any(), \
        f"column {col!r} already contains zeros; review before zero-filling"

# Make sure an amount and its count are always missing together: the XOR of
# the two NaN masks is True exactly where one is NaN and the other is not.
# `not` is used rather than `~` so the check is correct whether `.any()`
# returns a NumPy bool or a plain Python bool (`~True` is -2, which is truthy).
assert not (df_trans['in_amt'].isna() ^ df_trans['in_cnt'].isna()).any()
assert not (df_trans['out_amt'].isna() ^ df_trans['out_cnt'].isna()).any()

# Fill missing values with zeros in the continuous columns only
df_merged.fillna(dict.fromkeys(cols_cont, 0), inplace=True)

df_merged.info()

# Save the merged data as a parquet file
df_merged.to_parquet("merged_clean_df.parquet")