# Anomaly Detection using Isolation Forests

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Read the transactions file from the notebook's working directory.
df = pd.read_csv("creditcard.csv")

In [3]:
# df['Amount'] = MinMaxScaler().fit_transform(df['Amount'].values.reshape(-1,1))

In [4]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Calculate the fraction of fraudulent transactions in the labeled dataset to use as the contamination rate for the Isolation Forest:

In [5]:
# Row count per `Class` label (the notebook treats 1 as fraud).
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [6]:
# Contamination = share of fraudulent rows among ALL transactions, rounded to
# four decimals. `Class` is 0/1, so its mean is exactly frauds / total.
# The earlier hard-coded 492/284315 divided by the non-fraud count rather than
# the total and would silently go stale if the data file changed.
contamination = round(float(df['Class'].mean()), 4)
contamination

0.0017

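Prepare the train and test frames. The time-ordered 70/30 split is commented out below, so both frames currently contain every transaction: the training frame drops `Class` and `Time`, the test frame drops only `Time`.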

In [7]:
# Both frames are built from every row of `df`; the positional 70/30 split
# at the bottom of this cell is disabled.
# Training data: neither the label nor the timestamp.
df_train = df.drop(['Class', 'Time'], axis=1)
# Evaluation data: keeps `Class` so predictions can be compared with it later.
df_test = df.drop('Time', axis=1)
# df_train = df.iloc[0:round(len(df)*0.7)].copy()
# df_test = df.iloc[round(len(df)*0.7):].copy()

In [8]:
# df_train = df_train.drop(columns=['Class', 'Time', 'Amount'])
# df_test = df_test.drop(columns=['Time', 'Amount'])

Train unsupervised model with unlabeled dataset.

In [9]:
# Isolation Forest whose expected outlier share is the fraud rate computed above.
# random_state is fixed so the randomly built trees, and therefore the scores
# and the set of flagged rows, are reproducible on Restart & Run All.
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=contamination,
    max_features=1.0,
    random_state=42,
)

In [10]:
# Fit on the feature columns only.
# NOTE: run this before the cell that appends `scores`/`anomaly` to df_train;
# after that cell, df_train is no longer a features-only frame.
model.fit(df_train)

IsolationForest(contamination=0.0017)

In [11]:
# Hard label per row: -1 marks an anomaly, 1 a normal point.
anomaly = model.predict(df_train)
# Continuous score per row: the lower the value, the more abnormal the row
# (flagged rows have negative scores).
scores = model.decision_function(df_train)

In [12]:
# Attach the model outputs as two extra columns (score first, then the raw -1/1 label).
df_train = df_train.assign(scores=scores, anomaly=anomaly)

In [13]:
# Inspect ten randomly chosen rows (unseeded, so the selection differs on every run).
df_train.sample(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,scores,anomaly
270182,2.074446,-0.051311,-1.125447,0.381088,-0.071318,-1.129921,0.180242,-0.325239,0.552013,0.073988,...,-0.725924,0.322798,-0.110465,-0.278422,0.205186,-0.070767,-0.061594,0.89,0.283189,1
167012,1.961458,0.298406,-2.109769,0.83341,0.439954,-1.096451,0.16929,-0.19662,0.378535,-0.914761,...,0.475696,-0.049442,-0.106097,0.164311,0.704779,-0.034488,-0.020146,12.31,0.257676,1
10835,1.078624,-0.301565,1.429603,1.130334,-1.194051,-0.014737,-0.903896,0.224024,2.503583,-0.603231,...,-0.263649,0.120773,0.331576,0.037134,0.292187,-0.010337,0.01978,28.75,0.244576,1
79730,-0.926324,0.431985,1.051679,-1.433141,-1.405051,1.004504,-2.554209,-5.08509,-1.418752,-1.050937,...,0.116544,-0.16192,-0.028926,1.161491,-0.433349,0.009812,0.174898,97.99,0.186616,1
20021,1.521927,-1.117359,0.105876,-1.519436,-0.946673,0.262277,-1.061673,0.009803,-1.757256,1.441678,...,-0.657415,-0.105422,-0.827495,0.557209,-0.217415,0.038962,0.000522,20.36,0.248948,1
80697,1.246103,0.37499,0.308599,0.691633,-0.326985,-1.055426,0.130505,-0.235387,-0.048864,-0.301027,...,-0.790059,0.117263,0.354762,0.232842,0.092348,-0.020652,0.032906,7.46,0.289581,1
219051,1.976323,-0.130516,-1.478505,0.395703,-0.042205,-1.502,0.456258,-0.400838,0.575771,0.001721,...,-0.406974,0.19439,-0.003309,-0.055735,-0.496787,-0.045039,-0.047929,53.99,0.276536,1
133103,1.105083,0.095438,0.503572,1.492774,-0.399885,-0.379883,0.042413,0.007338,0.394743,-0.108986,...,-0.075736,-0.028014,0.39874,0.584208,-0.332157,0.031227,0.02078,27.1,0.287075,1
69003,-0.77451,0.351818,2.592611,-1.977984,-0.770797,-0.769754,0.067784,0.093489,1.113209,-1.794762,...,0.970546,-0.411986,0.413941,0.49881,-0.636314,0.117561,0.083312,9.99,0.237053,1
53976,1.165798,-0.317388,1.295899,0.182243,-1.162435,-0.180699,-0.744596,0.138758,1.007516,-0.441157,...,0.064854,0.132054,0.486502,0.011561,0.999437,-0.013104,0.015742,1.0,0.276625,1


In [14]:
# Re-encode the model's labels to match `Class`: normal 1 -> 0, then anomaly -1 -> 1.
# The order matters: mapping -1 -> 1 first would have the next step turn it into 0.
df_train['anomaly'] = df_train['anomaly'].replace(1, 0).replace(-1, 1)

In [15]:
# Count how many rows were flagged (1) versus not flagged (0).
df_train['anomaly'].value_counts()

0    284322
1       485
Name: anomaly, dtype: int64

Check whether the model makes good predictions on the test dataset (as built above, it contains the same transactions the model was fitted on):

In [16]:
# Preview the test frame: the same feature columns as training plus the `Class` label.
df_test.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [17]:
# Set the ground-truth labels aside, then strip them from the frame so the
# model only ever sees the feature columns it was fitted on.
is_fraud = df_test.loc[:, 'Class']
df_test = df_test.drop('Class', axis=1)

In [18]:
# Hard label per row: -1 marks an anomaly, 1 a normal point.
anomaly = model.predict(df_test)
# Continuous score per row: the lower the value, the more abnormal the row.
scores = model.decision_function(df_test)

In [19]:
# Attach the score, the raw -1/1 label and the true label (aligned on the row index).
df_test = df_test.assign(scores=scores, anomaly=anomaly, is_fraud=is_fraud)

In [20]:
# Re-encode the model's labels to match `is_fraud`: normal 1 -> 0, then anomaly -1 -> 1.
# The order matters: mapping -1 -> 1 first would have the next step turn it into 0.
df_test['anomaly'] = df_test['anomaly'].replace(1, 0).replace(-1, 1)

In [21]:
# Preview the test frame with the score, the 0/1 flag and the true label appended.
df_test.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,Amount,scores,anomaly,is_fraud
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.279812,0,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.297002,0,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.19643,0,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.244303,0,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.255477,0,0


In [22]:
# Overall agreement between the 0/1 flag and the true label.
# NOTE: with 492 frauds in 284,807 rows, labelling every row 0 would already
# score about 0.9983, so a high value here says little about fraud detection.
accuracy_score(df_test['is_fraud'], df_test['anomaly'])

0.9974965502954632

In [23]:
# Despite the name, this holds the rows the model FLAGGED (anomaly == 1),
# whether or not they are real frauds.
only_frauds = df_test.loc[df_test['anomaly'].eq(1)]

In [24]:
# First few rows the model flagged as anomalous, with their true label in `is_fraud`.
only_frauds.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,Amount,scores,anomaly,is_fraud
1632,-11.140706,-9.612726,-12.389545,6.013346,-32.092129,21.393069,34.303177,-7.520784,-1.925732,-2.636622,...,-2.925888,0.843551,0.746267,0.801387,3.852046,4.157934,7712.43,-0.040277,1,0
2963,-6.200114,5.025406,-2.742492,-0.940903,-6.656259,5.432294,-9.198175,-22.588547,-3.244295,-5.453339,...,1.943002,0.846005,-1.343379,-0.707634,-0.210219,0.066529,544.62,-0.003555,1,0
5425,-8.733429,-5.681953,2.253879,3.930311,-0.916149,4.355297,5.400286,-4.994383,6.819787,6.76486,...,-1.428302,-0.030181,-0.029485,-0.264455,-7.9761,4.71256,553.6,-0.030867,1,0
6783,-6.571336,-11.943892,-4.246322,6.825816,-2.896624,2.706661,5.373923,-0.872187,-0.626203,-0.955356,...,-3.731391,-1.015761,-0.740824,-0.297883,-0.79371,0.680622,4002.88,-0.015486,1,0
6812,-23.066842,-25.640527,-3.080313,4.866932,6.901397,-4.074335,2.027345,-2.688652,3.402966,2.881873,...,13.876221,-0.572177,5.525093,0.90698,3.358822,3.553906,845.73,-0.058301,1,0


In [25]:
# Every row in `only_frauds` has anomaly == 1, so this "accuracy" is simply the
# share of flagged rows that are real frauds, i.e. the precision of the flag.
accuracy_score(only_frauds['is_fraud'], only_frauds['anomaly'])

0.2721649484536082

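Unfortunately, only about 27% of the transactions the model flagged are real frauds (0.272 × 485 flagged rows = 132 correct flags), so it also finds only 132 of the 492 frauds. The 99.7% overall accuracy mostly reflects how rare fraud is.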