# Outlier detection

- Local Outlier Factor (LOF)

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn import metrics

from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

In [3]:
from sklearn.neighbors import LocalOutlierFactor

## Import dataset

In [4]:
X = pd.read_csv('large_cleaned.csv', index_col = 0)

In [5]:
y = pd.read_csv('jeopardy.csv', index_col = 0)

In [6]:
data = pd.merge(X, y, left_index = True, right_index = True)

In [7]:
data.head()

Unnamed: 0_level_0,category,turnover,transaction_count,io_ratio,age,nationality,is_pep,inactive_days_average,inactive_days_max,n_of_accounts,...,transaction_count_log,distinct_counterparties_log,atm_withdrawal_norm_log,atm_deposit_norm_log,inactive_days_average_log,new_category,category_name,no_atm_deposit,no_atm_withdrawal,suspicious
cif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90000000,0,217673.05,125,0.304,40,123,0,1.66,88,1,...,2.100371,0.60206,0.003848,0.004292,0.424882,0,normal_ind,0,0,0
90000001,0,57.42,20,0.25,17,90,0,7.32,88,4,...,1.322219,0.845098,0.035982,0.006709,0.920123,0,normal_ind,0,0,0
90000002,0,1465076.9,1,0.0,66,123,0,24.08,25,2,...,0.30103,0.30103,0.0,6.8e-05,1.399328,0,normal_ind,0,1,0
90000003,0,429840.54,46,0.23913,43,90,0,3.88,86,2,...,1.672098,1.255273,0.041521,0.003143,0.68842,0,normal_ind,0,0,0
90000004,2,93356190.65,63,0.365079,0,186,0,2.77,100,1,...,1.80618,0.30103,0.008359,0.002796,0.576341,2,institution,0,0,0


## Dropping unused columns and fillna 0

In [8]:
def drop_columns(data):
    data_y = data['suspicious']
    
    data.drop(['category', 'turnover', 'transaction_count', 'nationality', 'is_pep', 
        'n_of_accounts', 'category_name', 'inactive_days_average', 'inactive_days_max',
       'distinct_counterparties', 'atm_withdrawal', 'atm_deposit', 'transaction_avg', 
        'cpi_risk_ranking', 'aml_risk_ranking', 'country_name', 'suspicious', 'new_category'], axis = 1, inplace = True)
 
    return data, data_y

In [9]:
data.fillna(0, inplace = True)

In [10]:
data, data_y = drop_columns(data)

In [11]:
data

Unnamed: 0_level_0,io_ratio,age,channel_risk,aml_risk_score,cpi_risk_score,atm_withdrawal_norm,atm_deposit_norm,turnover_log,atm_withdrawal_log,atm_deposit_log,transaction_count_log,distinct_counterparties_log,atm_withdrawal_norm_log,atm_deposit_norm_log,inactive_days_average_log,no_atm_deposit,no_atm_withdrawal
cif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
90000000,0.304000,40,0.254742,0.00,53.0,0.008899,0.009931,5.337807,3.287387,3.335014,2.100371,0.602060,0.003848,0.004292,0.424882,0,0
90000001,0.250000,17,0.393174,5.11,73.0,0.086381,0.015567,1.766562,0.775246,0.277349,1.322219,0.845098,0.035982,0.006709,0.920123,0,0
90000002,0.000000,66,0.422991,0.00,53.0,0.000000,0.000157,6.165861,0.000000,2.362357,0.301030,0.301030,0.000000,0.000068,1.399328,0,1
90000003,0.239130,43,0.296019,5.11,73.0,0.100324,0.007263,5.633308,4.634722,3.494564,1.672098,1.255273,0.041521,0.003143,0.688420,0,0
90000004,0.365079,0,0.067511,0.00,0.0,0.019435,0.006459,7.970143,6.258719,5.780337,1.806180,0.301030,0.008359,0.002796,0.576341,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90999995,0.500000,24,0.041824,0.00,57.0,0.018441,0.000724,5.690634,3.956471,2.551357,0.477121,0.301030,0.007936,0.000314,1.798236,0,0
90999996,0.500000,0,0.098501,5.80,39.0,0.128712,0.000592,7.837913,6.947532,4.610013,0.477121,0.301030,0.052583,0.000257,1.691170,0,0
90999997,0.333333,0,0.383976,0.00,0.0,0.000033,0.005905,7.318365,2.842085,5.089602,0.602060,0.301030,0.000014,0.002557,1.948462,0,0
90999998,0.000000,0,0.063274,0.00,57.0,0.000000,0.000136,6.292454,0.000000,2.428954,0.301030,0.301030,0.000000,0.000059,1.167022,0,1


In [12]:
data_y

cif
90000000    0
90000001    0
90000002    0
90000003    0
90000004    0
           ..
90999995    0
90999996    0
90999997    0
90999998    0
90999999    0
Name: suspicious, Length: 999232, dtype: int64

## StandardScaler (for PCA)

In [13]:
sc = StandardScaler()
data_sc = sc.fit_transform(data)
data_sc = pd.DataFrame(data_sc, index=data.index)

## Add column names back
data_sc.columns = data.columns

In [14]:
data_sc

Unnamed: 0_level_0,io_ratio,age,channel_risk,aml_risk_score,cpi_risk_score,atm_withdrawal_norm,atm_deposit_norm,turnover_log,atm_withdrawal_log,atm_deposit_log,transaction_count_log,distinct_counterparties_log,atm_withdrawal_norm_log,atm_deposit_norm_log,inactive_days_average_log,no_atm_deposit,no_atm_withdrawal
cif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
90000000,-0.089874,0.321571,-0.001868,-1.074205,-0.075879,-0.544307,1.000493,-0.155619,-0.021471,0.549570,1.809393,0.133767,-0.575961,1.009252,-1.573449,-0.361035,-0.435488
90000001,-0.264798,-0.571597,0.791529,0.863002,0.902534,-0.323223,1.949879,-2.967102,-1.305560,-1.425025,0.620600,0.727532,-0.271072,1.958695,-0.727201,-0.361035,-0.435488
90000002,-1.074630,1.331239,0.962422,-1.074205,-0.075879,-0.569700,-0.646141,0.496272,-1.701829,-0.078558,-0.939487,-0.601678,-0.612470,-0.650115,0.091644,-0.361035,2.296275
90000003,-0.300008,0.438071,0.234705,0.863002,0.902534,-0.283439,0.550993,0.077017,0.667223,0.652604,1.155115,1.729627,-0.218524,0.557875,-1.123126,-0.361035,-0.435488
90000004,0.107982,-1.231765,-1.074949,-1.074205,-2.668672,-0.514246,0.415621,1.916703,1.497334,2.128723,1.359954,-0.601678,-0.533156,0.421704,-1.314641,-0.361035,-0.435488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90999995,0.545035,-0.299764,-1.222168,-1.074205,0.119804,-0.517080,-0.550614,0.122146,0.320533,0.043495,-0.670470,-0.601678,-0.537173,-0.553407,0.773282,-0.361035,-0.435488
90999996,0.545035,-1.231765,-0.897336,1.124582,-0.760768,-0.202438,-0.572837,1.812605,1.849423,1.372945,-0.670470,-0.601678,-0.113564,-0.575900,0.590332,-0.361035,-0.435488
90999997,0.005147,-1.231765,0.738816,-1.074205,-2.668672,-0.569605,0.322257,1.403587,-0.249089,1.682656,-0.479599,-0.601678,-0.612332,0.327725,1.029982,-0.361035,-0.435488
90999998,-1.074630,-1.231765,-1.099229,-1.074205,0.119804,-0.569700,-0.649529,0.595933,-1.701829,-0.035550,-0.939487,-0.601678,-0.612470,-0.653546,-0.305311,-0.361035,2.296275


## Outlier (predicted/real)

In [15]:
pca = PCA(n_components=3) #PC1, PC2, PC3

In [16]:
lof = LocalOutlierFactor(n_neighbors=20, contamination = 'auto') #contamination = %of outliers => searched by computer

In [17]:
principalComponents = pca.fit_transform(data_sc.iloc[:,:-1])

In [18]:
print('explained variance in 3 PC: ', np.sum(pca.explained_variance_ratio_))

explained variance in 3 PC:  0.5230580371099536


In [19]:
pred = lof.fit_predict(principalComponents)

In [20]:
data_sc['anomaly_LOF'] = pred

In [21]:
outliers = data_sc.loc[data_sc['anomaly_LOF']== -1]

In [22]:
outlier_index = list(outliers.index)

In [23]:
suspicious = data_y[outlier_index][data_y.loc[outlier_index]== 1]

In [24]:
suspicious_index = list(suspicious.index)

In [25]:
len(suspicious_index)

54

In [26]:
print('number of predicted outliers by LOF \n', data_sc['anomaly_LOF'].value_counts())

number of predicted outliers by LOF 
  1    997081
-1      2151
Name: anomaly_LOF, dtype: int64


In [27]:
print('number of suspicious clients', len(suspicious))

number of suspicious clients 54


# Conclusion

Suspicious clients can not be easily identified by a common outlier detection