# IMI Big Data Competition - Anti-money Laundering
# Unsupervised classification approach

This notebook applies unsupervised learning methods: PCA and K-Means clustering.

## PCA

Apply PCA on the continuous variables

In [None]:
#Import required libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
from sklearn import preprocessing, decomposition

%matplotlib inline


In [None]:
#Load the clean merged data stored in trans_each_yr.parquet and print its column dtypes
#NOTE(review): rootDir is a machine-specific placeholder path - point it at the local data directory before running
#NOTE(review): the cells below read merged_df / merge_year / cols_cont, which are not assigned in the
#visible cells - presumably they are derived from this frame; confirm before running top to bottom
rootDir = '/Users/Me/Local/Directory/'
filePath = os.path.join(rootDir,'trans_each_yr.parquet')
trans_each_year_all = pd.read_parquet(filePath)
#info() prints the column names, non-null counts and dtypes
trans_each_year_all.info()

In [None]:
#Extract the continuous variables (features fed into PCA)
#NOTE(review): merged_df and cols_cont must already be defined when this cell runs
merge_cont = merged_df[cols_cont]

#Standardize the features (zero mean, unit variance per column) so that no
#feature dominates the PCA only because of its scale
scaler = preprocessing.StandardScaler()
temp = scaler.fit_transform(merge_cont)
#fit_transform returns a plain array, so wrap it again with the feature names
merge_cont_zscored = pd.DataFrame(data=temp, columns=cols_cont)
del temp


In [None]:
#Fit a PCA on all standardized continuous variables
#PCA() is created without arguments, i.e. with scikit-learn's default settings
pca_cont = decomposition.PCA().fit(merge_cont_zscored)

In [None]:
#Explained variance (left) and cumulative explained variance (right) for each PC
pcs_plot = np.arange(len(cols_cont))
var_ratio = pca_cont.explained_variance_ratio_[pcs_plot]

fig, (ax1,ax2) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(10,5))

#left panel: fraction of the variance explained by each individual PC
ax1.plot(pcs_plot, var_ratio)
ax1.set_xticks(pcs_plot)
ax1.set(xlabel='PC', ylabel='Fraction Variance Explained')

#right panel: running total of the explained variance
ax2.plot(var_ratio.cumsum())
ax2.set(xlabel='PC', ylabel='Cumulative Explained Variance')
ax2.grid()

plt.show()

## Visualize loadings of the first few PCs


In [None]:
nPCs_ttl = pca_cont.components_.shape[0] #number of PCs in total
nPCs = 4 #first n PCs to be selected

#Loadings = association of the input features with each PC.
#components_ has one row per PC and one column per feature; keep the rows of PC0 .. PC3
loadings_mat = pca_cont.components_[:nPCs]

#Column labels PC0, PC1, ... for every fitted component
cols_pc = ['PC%d' % i for i in range(nPCs_ttl)]

#Transpose so that the features become rows and the selected PCs become columns
loadings_df = pd.DataFrame(loadings_mat.T, index=cols_cont, columns=cols_pc[:nPCs])

#Heatmap of the loadings with every row and column tick labelled
sns.heatmap(loadings_df, xticklabels=True, yticklabels=True)

In [None]:
##Ranking features for PC's 0-3 (largest loading first)
#A notebook cell only renders its last bare expression, so print every ranking
#explicitly instead of silently discarding the first three
for pc_name in loadings_df.columns:
    print(loadings_df[pc_name].sort_values(ascending = False))
    print()

In [None]:
##Compute the PC scores and combine them with the features and the customer ratings,
##to visualize both the distribution of data and the association of features with data

#PCA coordinates derived from standardized data.
#transform() reuses the PCA fitted earlier instead of fitting it a second time
pca_cont_coords = pca_cont.transform(merge_cont_zscored)
#Reuse the row index of merge_cont: pd.concat(axis=1) aligns rows on index labels, so a
#fresh 0..n-1 index would misalign the scores whenever merged_df is not indexed 0..n-1
pca_cont_coords = pd.DataFrame(data = pca_cont_coords, columns = cols_pc, index = merge_cont.index)

#Combine PC coordinates with the continuous variables as well as customer ratings (target)
merge_cont_pcs = pd.concat([merge_cont, merged_df['rating'], merged_df['rating_lbl'], pca_cont_coords], axis = 1)
merge_cont_pcs

In [None]:
def plot2D_pcs(df, pc_x, pc_y, col_target, targets, colors):
    '''
    Draw a 2D scatter of two PC score columns, one color per target value.

    Inputs:
        df - dataframe holding the PC scores and the target column
        pc_x - column name in df of the PC shown on the x-axis
        pc_y - column name in df of the PC shown on the y-axis
        col_target - column name in df of the target (e.g., a categorical variable)
        targets - list of the target values to draw; also used as the legend entries
        colors - list of colors, paired with targets by position
    Returns nothing; the figure is displayed with plt.show().
    '''
    fig, ax = plt.subplots()
    ax.set_xlabel(pc_x)
    ax.set_ylabel(pc_y)

    #one scatter layer per target value, in the order given by targets
    for group, shade in zip(targets, colors):
        in_group = df[col_target] == group
        rows = df.loc[in_group]
        ax.scatter(rows[pc_x], rows[pc_y], c=shade, alpha=0.3)

    #legend entries are matched to the scatter layers by position
    ax.legend(targets)

    plt.show()

In [None]:
##Plot PC0 and PC1
#Scatter of the PC0 vs. PC1 scores, one color per customer risk rating label
plot2D_pcs(df=merge_cont_pcs, pc_x='PC0', pc_y='PC1', 
           col_target='rating_lbl', 
           targets=['low', 'medium', 'high'],
           colors=['green','blue','red'])

In [None]:
##Plot PC2 and PC3
#Scatter of the PC2 vs. PC3 scores, one color per customer risk rating label
plot2D_pcs(df=merge_cont_pcs, pc_x='PC2', pc_y='PC3', 
           col_target='rating_lbl', 
           targets=['low', 'medium', 'high'],
           colors=['green','blue','red'])

In [None]:
##Plot PC4 and PC5
#Scatter of the PC4 vs. PC5 scores, one color per customer risk rating label
#NOTE(review): requires merge_cont_pcs to hold at least six PC columns - confirm cols_cont has >= 6 features
plot2D_pcs(df=merge_cont_pcs, pc_x='PC4', pc_y='PC5', 
           col_target='rating_lbl', 
           targets=['low', 'medium', 'high'],
           colors=['green','blue','red'])

## Clustering Analysis


In [None]:
#Importing packages
from sklearn import cluster, metrics, preprocessing

In [None]:
def do_kmeans(df, feats_use, k = 2, scale = 'yes'): #, col_name = 'cluster'
    '''
    Run K-Means (k-means++ initialization, random_state=42) on selected columns.

    Inputs:
        df - pandas DataFrame with numeric features (it is not modified)
        feats_use - list or array of feature column names to cluster on
        k - number of clusters
        scale - 'yes' to standardize the features (z-score) before clustering,
                i.e. when the features in df are not standardized yet;
                any other value clusters the columns as they are
    Returns:
        numpy array with one cluster label per row of df, as strings ('0', '1', ...)
    '''
    feats = df[feats_use]

    #standardize only on request; the result is a plain array either way
    if scale=='yes':
        data_use = preprocessing.StandardScaler().fit_transform(feats)
    else:
        data_use = feats.to_numpy()
    data_use = np.asarray(data_use, dtype='float64')

    kmeans_obj = cluster.KMeans(init='k-means++', n_clusters = k, random_state = 42)
    clust_labs = kmeans_obj.fit_predict(data_use)

    #labels are returned as strings so callers can compare them with '0', '1', '2', ...
    return clust_labs.astype('str')


### K-Means (k-means++) on the following variables (standardized) in merged_df
* in_frac, in_cnt
* out_frac, out_cnt

In [None]:
#Features to be used
cols_feat = ['in_frac','in_cnt','out_frac','out_cnt']

#Keep only the rows where neither in_frac nor out_frac is NaN
merged_df_frac = merged_df.dropna(subset=['in_frac','out_frac'])

#Do clustering with 3 clusters
nClusters = 3
clust3_lbl_frac = do_kmeans(merged_df_frac, feats_use=cols_feat, k=nClusters)

#Store the labels in a one-column frame that shares the row index of merged_df_frac
clust3_lbl_frac_df = pd.DataFrame(clust3_lbl_frac,
                                  index=merged_df_frac.index,
                                  columns=['clust3_lbl'])

In [None]:
#calculate the silhouette score (higher means better clustering performance)
#do_kmeans clustered the standardized features (scale='yes' is its default), so the score is
#computed in that same standardized space rather than on the raw columns
feats_frac_z = preprocessing.StandardScaler().fit_transform(merged_df_frac[cols_feat])
s_score = metrics.silhouette_score(feats_frac_z, clust3_lbl_frac, metric="euclidean")
print('Number of clusters =', str(nClusters), ': Silhouette score =', str(s_score))

In [None]:
##Plot Count of IN vs. IN money per transaction
##Left: colored by K-Means cluster label. Right: colored by risk rating label.
fig, (ax1, ax2) = plt.subplots(1,2, sharex=True, sharey=True, figsize=(14,5))

#both panels show the same two features
for ax in (ax1, ax2):
    ax.set_xlabel('Count of Deposit')
    ax.set_ylabel('Deposit Money per Transaction')

#clustering label (the mask shares the row index of merged_df_frac)
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets,colors):
    indicesToKeep = clust3_lbl_frac_df['clust3_lbl']==target
    ax1.scatter(merged_df_frac.loc[indicesToKeep, 'in_cnt'],
                merged_df_frac.loc[indicesToKeep, 'in_frac'],
                c = color,
                alpha=0.3)
ax1.set_title('Clustering labels')
ax1.legend(targets)

#risk rating
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets,colors):
    indicesToKeep = merged_df_frac['rating_lbl'] == target
    ax2.scatter(merged_df_frac.loc[indicesToKeep, 'in_cnt'],
                merged_df_frac.loc[indicesToKeep, 'in_frac'],
                c = color,
                alpha=0.3)
ax2.set_title('Risk ratings')
ax2.legend(targets)

plt.show()

In [None]:
#Plot Count of OUT vs. OUT money per transaction
#Left: colored by K-Means cluster label. Right: colored by risk rating label.
fig, (ax1, ax2) = plt.subplots(1,2, sharex=True, sharey=True, figsize=(14,5))

#both panels show the same two features
for ax in (ax1, ax2):
    ax.set_xlabel('Count of Withdrawl')
    ax.set_ylabel('Withdrawl Money per Transaction')

#clustering label (the mask shares the row index of merged_df_frac)
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets,colors):
    indicesToKeep = clust3_lbl_frac_df['clust3_lbl']==target
    ax1.scatter(merged_df_frac.loc[indicesToKeep, 'out_cnt'],
                merged_df_frac.loc[indicesToKeep, 'out_frac'],
                c = color,
                alpha=0.3)
ax1.set_title('Clustering labels')
ax1.legend(targets)

#risk rating
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets,colors):
    indicesToKeep = merged_df_frac['rating_lbl'] == target
    ax2.scatter(merged_df_frac.loc[indicesToKeep, 'out_cnt'],
                merged_df_frac.loc[indicesToKeep, 'out_frac'],
                c = color,
                alpha=0.3)
ax2.set_title('Risk ratings')
ax2.legend(targets)

plt.show()

### K-Means (k-means++) on all continuous variables (standardized) in merged_df

In [None]:
#Clustering on all continuous variables (do_kmeans standardizes them by default)
#Uses merged_df, the same frame merge_cont_zscored and merge_cont_pcs were derived from,
#so the labels line up with the silhouette and PC plots that follow
nClusters = 3
clust3_lbl = do_kmeans(merged_df, feats_use=cols_cont, k=nClusters)

In [None]:
#Silhouette score of the clustering, evaluated on the standardized features
#(higher means better clustering performance)
s_score = metrics.silhouette_score(merge_cont_zscored[cols_cont], clust3_lbl, metric="euclidean")
print('Number of clusters =', nClusters, ': Silhouette score =', s_score)

In [None]:
#Compare the clustering labels and the customer risk rating labels
#Give the labels the row index of merged_df: pd.concat(axis=1) aligns rows on index labels,
#so a default 0..n-1 index would pair labels with the wrong ratings for any other index
clust3_lbl_df = pd.DataFrame(data=clust3_lbl, index=merged_df.index).rename(columns={0:'clust3_lbl'})
clust3_rating = pd.concat([clust3_lbl_df, merged_df['rating_lbl']], axis=1)
#Number of customers for every (cluster label, rating label) combination
clust3_rating.groupby(['clust3_lbl','rating_lbl']).size().reset_index()

In [None]:
#K-Means (3 clusters) results in the PC0-PC1 plane (left),
#next to the rating labels in the same plane for comparison (right)
sel_pcs = [0,1]
pc_x = cols_pc[sel_pcs[0]]
pc_y = cols_pc[sel_pcs[1]]

fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10,5))
for ax in (ax1, ax2):
    ax.set_xlabel(pc_x)
    ax.set_ylabel(pc_y)

#left panel: one scatter layer per cluster label
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = clust3_lbl_df['clust3_lbl'] == target
    pts = merge_cont_pcs.loc[indicesToKeep]
    ax1.scatter(pts[pc_x], pts[pc_y], c=color, alpha=0.3)
ax1.legend(targets)

#right panel: one scatter layer per rating label
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = merge_cont_pcs['rating_lbl'] == target
    pts = merge_cont_pcs.loc[indicesToKeep]
    ax2.scatter(pts[pc_x], pts[pc_y], c=color, alpha=0.3)
ax2.legend(targets)

plt.show()

In [None]:
#K-Means (3 clusters) results in the PC2-PC3 plane (left),
#next to the rating labels in the same plane for comparison (right)
sel_pcs = [2,3]
pc_x = cols_pc[sel_pcs[0]]
pc_y = cols_pc[sel_pcs[1]]

fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10,5))
for ax in (ax1, ax2):
    ax.set_xlabel(pc_x)
    ax.set_ylabel(pc_y)

#left panel: one scatter layer per cluster label
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = clust3_lbl_df['clust3_lbl'] == target
    pts = merge_cont_pcs.loc[indicesToKeep]
    ax1.scatter(pts[pc_x], pts[pc_y], c=color, alpha=0.3)
ax1.legend(targets)

#right panel: one scatter layer per rating label
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = merge_cont_pcs['rating_lbl'] == target
    pts = merge_cont_pcs.loc[indicesToKeep]
    ax2.scatter(pts[pc_x], pts[pc_y], c=color, alpha=0.3)
ax2.legend(targets)

plt.show()

## K-Means on the data summed across months (regardless of transaction type)
### K-Means (k-means++) on the following variables (standardized) in merge_year
* in_frac, in_cnt
* out_frac, out_cnt

In [None]:
#Features to be used
cols_feat = ['in_frac','in_cnt','out_frac','out_cnt']

#Keep only the rows where neither in_frac nor out_frac is NaN
merge_year_frac = merge_year.dropna(subset=['in_frac','out_frac'])

#Do clustering with 3 clusters
nClusters = 3
clust3_lbl_year_frac = do_kmeans(merge_year_frac, feats_use=cols_feat, k=nClusters)

#Store the labels in a one-column frame that shares the row index of merge_year_frac
clust3_lbl_year_frac_df = pd.DataFrame(clust3_lbl_year_frac,
                                       index=merge_year_frac.index,
                                       columns=['clust3_lbl'])

In [None]:
#calculate the silhouette score (higher means better clustering performance)
#do_kmeans clustered the standardized features (scale='yes' is its default), so the score is
#computed in that same standardized space rather than on the raw columns
feats_year_frac_z = preprocessing.StandardScaler().fit_transform(merge_year_frac[cols_feat])
s_score = metrics.silhouette_score(feats_year_frac_z, clust3_lbl_year_frac, metric="euclidean")
print('Number of clusters =', str(nClusters), ': Silhouette score =', str(s_score))

In [None]:
#in_frac vs. out_frac: colored by cluster label (left) and by risk rating (right)
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(14,5))

#both panels show the same two features
for ax in (ax1, ax2):
    ax.set_xlabel('Deposit Money per Transaction') #in_frac
    ax.set_ylabel('Withdrawl Money per Transaction') #out_frac

#clustering label
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = clust3_lbl_year_frac_df['clust3_lbl'] == target
    pts = merge_year_frac.loc[indicesToKeep]
    ax1.scatter(pts['in_frac'], pts['out_frac'], c=color, alpha=0.3)
ax1.set_title('Clustering labels')
ax1.legend(targets)

#risk rating
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = merge_year_frac['rating_lbl'] == target
    pts = merge_year_frac.loc[indicesToKeep]
    ax2.scatter(pts['in_frac'], pts['out_frac'], c=color, alpha=0.3)
ax2.set_title('Risk ratings')
ax2.legend(targets)

plt.show()

### K-Means (k-means++) on all continuous variables (standardized) in merge_year

In [None]:
#do clustering: 3 clusters on all continuous variables of merge_year
#do_kmeans is called with its default scale='yes', so the features are standardized inside it
nClusters = 3
clust3_lbl_year = do_kmeans(merge_year, feats_use=cols_cont, k=nClusters)

In [None]:
#calculate the silhouette score (higher means better clustering performance)
#do_kmeans clustered the standardized features (scale='yes' is its default), so the score is
#computed in that same standardized space rather than on the raw columns
feats_year_z = preprocessing.StandardScaler().fit_transform(merge_year[cols_cont])
s_score = metrics.silhouette_score(feats_year_z, clust3_lbl_year, metric="euclidean")
print('Number of clusters =', str(nClusters), ': Silhouette score =', str(s_score))

In [None]:
#Compare the clustering labels and the customer risk rating labels
#Give the labels the row index of merge_year: pd.concat(axis=1) aligns rows on index labels,
#so a default 0..n-1 index would pair labels with the wrong ratings for any other index
clust3_lbl_year_df = pd.DataFrame(data=clust3_lbl_year, index=merge_year.index).rename(columns={0:'clust3_lbl_year'})
clust3_rating_year = pd.concat([clust3_lbl_year_df, merge_year['rating_lbl']], axis=1)
#Number of rows for every (cluster label, rating label) combination
clust3_rating_year.groupby(['clust3_lbl_year','rating_lbl']).size().reset_index()

In [None]:
#in_amt vs. out_amt: colored by cluster label (left) and by risk rating (right)
fig, (ax1,ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(14,5))

#both panels show the same two features
for ax in (ax1, ax2):
    ax.set_xlabel('Deposit money') #in_amt
    ax.set_ylabel('Withdrawl money') #out_amt

#clustering label
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = clust3_lbl_year_df['clust3_lbl_year'] == target
    pts = merge_year.loc[indicesToKeep]
    ax1.scatter(pts['in_amt'], pts['out_amt'], c=color, alpha=0.3)
ax1.set_title('Clustering labels')
ax1.legend(targets)

#risk rating
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = merge_year['rating_lbl'] == target
    pts = merge_year.loc[indicesToKeep]
    ax2.scatter(pts['in_amt'], pts['out_amt'], c=color, alpha=0.3)
ax2.set_title('Risk ratings')
ax2.legend(targets)

plt.show()

## K-Means on the data summed across months (cash transactions only)
### K-Means (k-means++) on the following variables (standardized) in merge_year
* in_frac, in_cnt
* out_frac, out_cnt

In [None]:
#Features to be used
cols_feat = ['in_frac','in_cnt','out_frac','out_cnt']

#Keep the cash rows only, then drop rows with NaN in in_frac or out_frac
is_cash = merge_year['trsactn_type'] == 'cash'
merge_year_frac_cash = merge_year.loc[is_cash].dropna(subset=['in_frac','out_frac'])

#Clustering with 3 clusters
nClusters = 3
clust3_lbl_year_frac_cash = do_kmeans(merge_year_frac_cash, feats_use=cols_feat, k=nClusters)

#Store the labels in a one-column frame that shares the row index of merge_year_frac_cash
clust3_lbl_year_frac_cash_df = pd.DataFrame(clust3_lbl_year_frac_cash,
                                            index=merge_year_frac_cash.index,
                                            columns=['clust3_lbl'])

In [None]:
#Calculate the silhouette score (higher means better clustering performance)
#do_kmeans clustered the standardized features (scale='yes' is its default), so the score is
#computed in that same standardized space rather than on the raw columns
feats_year_frac_cash_z = preprocessing.StandardScaler().fit_transform(merge_year_frac_cash[cols_feat])
s_score = metrics.silhouette_score(feats_year_frac_cash_z, clust3_lbl_year_frac_cash, metric="euclidean")
print('Number of clusters =', str(nClusters), ': Silhouette score =', str(s_score))

In [None]:
#Compare the clustering labels and the customer risk rating labels
#Both inputs carry the row index of merge_year_frac_cash, so the column-wise concat pairs them row by row
clust3_rating_year_frac_cash = pd.concat([clust3_lbl_year_frac_cash_df, merge_year_frac_cash['rating_lbl']], axis=1)
#Number of rows for every (cluster label, rating label) combination
clust3_rating_year_frac_cash.groupby(['clust3_lbl','rating_lbl']).size().reset_index()

In [None]:
#Cash in_frac vs. out_frac: colored by cluster label (left) and by risk rating (right)
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(14,5))

#both panels show the same two features
for ax in (ax1, ax2):
    ax.set_xlabel('Cash Deposit per Transaction') #in_frac
    ax.set_ylabel('Cash Withdrawl per Transaction') #out_frac

#clustering label
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = clust3_lbl_year_frac_cash_df['clust3_lbl'] == target
    pts = merge_year_frac_cash.loc[indicesToKeep]
    ax1.scatter(pts['in_frac'], pts['out_frac'], c=color, alpha=0.3)
ax1.set_title('Clustering labels')
ax1.legend(targets)

#risk rating
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = merge_year_frac_cash['rating_lbl'] == target
    pts = merge_year_frac_cash.loc[indicesToKeep]
    ax2.scatter(pts['in_frac'], pts['out_frac'], c=color, alpha=0.3)
ax2.set_title('Risk ratings')
ax2.legend(targets)

plt.show()

In [None]:
#Cash in_cnt vs. in_frac: colored by cluster label (left) and by risk rating (right)
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(14,5))

#both panels show the same two features
for ax in (ax1, ax2):
    ax.set_xlabel('Count of Cash Deposit')
    ax.set_ylabel('Cash Deposit per Transaction')

#clustering label
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = clust3_lbl_year_frac_cash_df['clust3_lbl'] == target
    pts = merge_year_frac_cash.loc[indicesToKeep]
    ax1.scatter(pts['in_cnt'], pts['in_frac'], c=color, alpha=0.3)
ax1.set_title('Clustering labels')
ax1.legend(targets)

#risk rating
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = merge_year_frac_cash['rating_lbl'] == target
    pts = merge_year_frac_cash.loc[indicesToKeep]
    ax2.scatter(pts['in_cnt'], pts['in_frac'], c=color, alpha=0.3)
ax2.set_title('Risk ratings')
ax2.legend(targets)

plt.show()

In [None]:
#Cash out_cnt vs. out_frac: colored by cluster label (left) and by risk rating (right)
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(14,5))

#both panels show the same two features
for ax in (ax1, ax2):
    ax.set_xlabel('Count of Cash Withdrawl')
    ax.set_ylabel('Cash Withdrawl per Transaction')

#clustering label
targets = ['0', '1', '2']
colors = ['c', 'm', 'y']
for target, color in zip(targets, colors):
    indicesToKeep = clust3_lbl_year_frac_cash_df['clust3_lbl'] == target
    pts = merge_year_frac_cash.loc[indicesToKeep]
    ax1.scatter(pts['out_cnt'], pts['out_frac'], c=color, alpha=0.3)
ax1.set_title('Clustering labels')
ax1.legend(targets)

#risk rating
targets = ['low', 'medium', 'high']
colors = ['g', 'b', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = merge_year_frac_cash['rating_lbl'] == target
    pts = merge_year_frac_cash.loc[indicesToKeep]
    ax2.scatter(pts['out_cnt'], pts['out_frac'], c=color, alpha=0.3)
ax2.set_title('Risk ratings')
ax2.legend(targets)

plt.show()