# Anomaly/Outlier Detection

Python tool used: Pyod

### Step -1: Import Packages

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

### Step-2: Pyod Packages

In [2]:
from pyod.models.pca import PCA 
# Principal Component Analysis- Selecting only siginificant feautures. 
# Based on weighted projected distances to eigenvector hyperplane as outlier scores

In [3]:
from pyod.models.mcd import MCD
# Minimum Covariance Determinant

In [4]:
from pyod.models.ocsvm import OCSVM # One-class Support Vector machine
from pyod.models.lof import LOF # Local Outlier Factor
from pyod.models.cblof import CBLOF # Clustering based LOF
from pyod.models.knn import KNN # K-Nearest neighbour
from pyod.models.hbos import HBOS # Histrogram based Outlier Score
from pyod.models.abod import ABOD # Angle Based Outlier Detection
from pyod.models.iforest import IForest # Isolation forest
from pyod.models.feature_bagging import FeatureBagging 




### Step 3: Metric Packages

In [5]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [6]:
ls data

 Volume in drive C is Windows
 Volume Serial Number is 36A3-5FA6

 Directory of C:\Users\VAISHNAVI PATIL\data

19-08-2020  12:25    <DIR>          .
19-08-2020  12:25    <DIR>          ..
19-08-2020  12:07           105,540 arrhythmia.mat
19-08-2020  12:08            68,318 cardio.mat
19-08-2020  12:08             4,967 glass.mat
19-08-2020  12:08            58,517 ionosphere.mat
19-08-2020  12:08            23,146 letter.mat
19-08-2020  12:09             1,269 lympho.mat
19-08-2020  12:09           541,898 mnist.mat
19-08-2020  12:09           736,083 musk.mat
19-08-2020  12:09           197,453 optdigits.mat
19-08-2020  12:09           658,866 pendigits.mat
19-08-2020  12:09            11,320 pima.mat
19-08-2020  12:09               337 README.md
19-08-2020  12:10           144,833 satellite.mat
19-08-2020  12:10           181,220 satimage-2.mat
19-08-2020  12:10           345,184 shuttle.mat
19-08-2020  12:10             4,891 vertebral.mat
19-08-2020  12:10           134,107 vowels

### Step-4: Matlab Files:

In [7]:
mat_files_list = ['arrhythmia.mat','cardio.mat','glass.mat','ionosphere.mat','letter.mat','lympho.mat','mnist.mat','musk.mat','optdigits.mat','pendigits.mat','pima.mat','satellite.mat','satimage-2.mat','shuttle.mat','vertebral.mat','vowels.mat','wbc.mat']

Exploring .mat files

In [8]:
file = loadmat('data/glass.mat')

In [9]:
file

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 06:10:37 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 1.52101, 13.64   ,  4.49   , ...,  8.75   ,  0.     ,  0.     ],
        [ 1.51761, 13.89   ,  3.6    , ...,  7.83   ,  0.     ,  0.     ],
        [ 1.51618, 13.53   ,  3.55   , ...,  7.78   ,  0.     ,  0.     ],
        ...,
        [ 1.52065, 14.36   ,  0.     , ...,  8.44   ,  1.64   ,  0.     ],
        [ 1.51651, 14.38   ,  0.     , ...,  8.48   ,  1.57   ,  0.     ],
        [ 1.51711, 14.23   ,  0.     , ...,  8.62   ,  1.67   ,  0.     ]]),
 'y': array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
 

In [10]:
type(file)

dict

In [11]:
len(file)  # As file is dictionary, len gives  

5

In [12]:
file.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [13]:
file.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 06:10:37 UTC', '1.0', [], array([[ 1.52101, 13.64   ,  4.49   , ...,  8.75   ,  0.     ,  0.     ],
       [ 1.51761, 13.89   ,  3.6    , ...,  7.83   ,  0.     ,  0.     ],
       [ 1.51618, 13.53   ,  3.55   , ...,  7.78   ,  0.     ,  0.     ],
       ...,
       [ 1.52065, 14.36   ,  0.     , ...,  8.44   ,  1.64   ,  0.     ],
       [ 1.51651, 14.38   ,  0.     , ...,  8.48   ,  1.57   ,  0.     ],
       [ 1.51711, 14.23   ,  0.     , ...,  8.62   ,  1.67   ,  0.     ]]), array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
  

In [14]:
type(file['X']),type(file['y'])

(numpy.ndarray, numpy.ndarray)

In [15]:
file['X'].shape, file['y'].shape

((214, 9), (214, 1))

### Step- 5 Finding best method for anomaly detection and detecting outliers in files

Defining outlier detection tools to be compared

In [16]:
df_columns = ['Data','#Samples','#Dimensions','Outliers Perc','ABOD','CBLOF','FB','HBOS','IForest','KNN','LOF','MCD','OCSVM','PCA']

ROC: Region of Characteristics - Performance Evaluation Matrix
    
prn_df : Precision_n_scores
    
Time_df: Time for exectution
    
All are empty dataframes.

In [17]:
roc_df = pd.DataFrame(columns=df_columns)
prn_df = pd.DataFrame(columns=df_columns)
time_df = pd.DataFrame(columns=df_columns)

Exploring Mat files

In [24]:
random_state = np.random.RandomState(42)
from time import time

for mat_file in mat_files_list:
    print('\n...Processing',mat_file,'....')
    mat=loadmat(os.path.join('data',mat_file))
    
    X=mat['X']
    y=mat['y'].ravel()  # converts multidimensional arrays into flattened 1-d array
    
    outliers_fraction=np.count_nonzero(y)/len(y)  # count(y)- gives number of entries with value y, len(y) - total values in y array
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    
    #construct containers to save results
    roc_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    prn_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    time_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    
    # Split Train and Test Data
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=random_state)
    
    #Normalizing Data
    X_train_norm,X_test_norm=standardizer(X_train,X_test)
    
    classifiers={'PCA':PCA(contamination=outliers_fraction,random_state=random_state),
                 'MCD':MCD(contamination=outliers_fraction,random_state=random_state),
                 'OCSVM':OCSVM(contamination=outliers_fraction),'LOF':LOF(contamination=outliers_fraction),
                 'CBLOF':CBLOF(contamination=outliers_fraction,check_estimator=False,random_state=random_state)
                 ,'KNN':KNN(contamination=outliers_fraction),'HBOS':HBOS(contamination=outliers_fraction),
                 'ABOD':ABOD(contamination=outliers_fraction),
                 'Isolation Forest':IForest(contamination=outliers_fraction,random_state=random_state),
                 'Feaute bagging':FeatureBagging(contamination=outliers_fraction,random_state=random_state)}
    
    for clf_name,clf in classifiers.items():
        t0=time()  #training time starts
        clf.fit(X_train_norm)  #Training the model
        test_scores=clf.decision_function(X_test_norm) #test the model
        t1=time()  #training ends
        duration=round(t1-t0,ndigits=4)  # tot training model
        time_list.append(duration)
        
        roc = round(roc_auc_score(y_test,test_scores),ndigits=4)
        prn = round(precision_n_scores(y_test,test_scores),ndigits=4)
        
        print('{clf_name} ROC: {roc}, precision @ rank n: {prn},'
             'execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))
        roc_list.append(roc)
        prn_list.append(prn)
        
    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df,temp_df],axis=0)
    
    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df,temp_df],axis=0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df,temp_df],axis=0)
    
    
        


...Processing arrhythmia.mat ....
PCA ROC: 0.7997, precision @ rank n: 0.5,execution time: 0.4927s




MCD ROC: 0.8228, precision @ rank n: 0.4286,execution time: 1.2714s
OCSVM ROC: 0.7986, precision @ rank n: 0.5,execution time: 0.1997s
LOF ROC: 0.7787, precision @ rank n: 0.4643,execution time: 0.2721s
CBLOF ROC: 0.788, precision @ rank n: 0.4643,execution time: 6.7782s
KNN ROC: 0.782, precision @ rank n: 0.5,execution time: 0.2211s
HBOS ROC: 0.8511, precision @ rank n: 0.5714,execution time: 3.9253s
ABOD ROC: 0.7687, precision @ rank n: 0.3571,execution time: 2.5104s
Isolation Forest ROC: 0.8343, precision @ rank n: 0.5357,execution time: 1.4543s
Feaute bagging ROC: 0.7768, precision @ rank n: 0.4286,execution time: 1.1798s

...Processing cardio.mat ....
PCA ROC: 0.9444, precision @ rank n: 0.6267,execution time: 0.0159s




MCD ROC: 0.8221, precision @ rank n: 0.44,execution time: 1.1839s
OCSVM ROC: 0.9292, precision @ rank n: 0.5467,execution time: 0.1664s
LOF ROC: 0.5969, precision @ rank n: 0.1867,execution time: 0.2076s
CBLOF ROC: 0.8566, precision @ rank n: 0.52,execution time: 0.34s
KNN ROC: 0.7601, precision @ rank n: 0.3467,execution time: 0.3428s
HBOS ROC: 0.8684, precision @ rank n: 0.4533,execution time: 0.0151s
ABOD ROC: 0.6114, precision @ rank n: 0.2533,execution time: 0.8394s
Isolation Forest ROC: 0.9309, precision @ rank n: 0.6,execution time: 0.9112s
Feaute bagging ROC: 0.6202, precision @ rank n: 0.1867,execution time: 1.8599s

...Processing glass.mat ....
PCA ROC: 0.8512, precision @ rank n: 0.0,execution time: 0.0074s
MCD ROC: 0.7738, precision @ rank n: 0.0,execution time: 0.1024s
OCSVM ROC: 0.9405, precision @ rank n: 0.0,execution time: 0.007s
LOF ROC: 0.9762, precision @ rank n: 0.0,execution time: 0.0071s




CBLOF ROC: 0.9405, precision @ rank n: 0.0,execution time: 0.1167s
KNN ROC: 0.9405, precision @ rank n: 0.0,execution time: 0.0273s
HBOS ROC: 0.8452, precision @ rank n: 0.0,execution time: 0.011s
ABOD ROC: 0.9524, precision @ rank n: 0.0,execution time: 0.0887s
Isolation Forest ROC: 0.9048, precision @ rank n: 0.0,execution time: 0.7634s
Feaute bagging ROC: 0.9762, precision @ rank n: 0.0,execution time: 0.0845s

...Processing ionosphere.mat ....
PCA ROC: 0.7879, precision @ rank n: 0.54,execution time: 0.0133s
MCD ROC: 0.9556, precision @ rank n: 0.86,execution time: 0.1844s
OCSVM ROC: 0.8543, precision @ rank n: 0.74,execution time: 0.009s
LOF ROC: 0.8943, precision @ rank n: 0.76,execution time: 0.019s
CBLOF ROC: 0.8756, precision @ rank n: 0.76,execution time: 0.159s
KNN ROC: 0.9295, precision @ rank n: 0.86,execution time: 0.0643s
HBOS ROC: 0.533, precision @ rank n: 0.36,execution time: 0.0349s
ABOD ROC: 0.9211, precision @ rank n: 0.82,execution time: 0.1632s
Isolation Forest R



MCD ROC: 1.0, precision @ rank n: 1.0,execution time: 0.1014s
OCSVM ROC: 1.0, precision @ rank n: 1.0,execution time: 0.004s
LOF ROC: 1.0, precision @ rank n: 1.0,execution time: 0.007s
CBLOF ROC: 1.0, precision @ rank n: 1.0,execution time: 0.1104s
KNN ROC: 1.0, precision @ rank n: 1.0,execution time: 0.0281s
HBOS ROC: 1.0, precision @ rank n: 1.0,execution time: 0.013s
ABOD ROC: 1.0, precision @ rank n: 1.0,execution time: 0.0659s
Isolation Forest ROC: 1.0, precision @ rank n: 1.0,execution time: 0.763s
Feaute bagging ROC: 1.0, precision @ rank n: 1.0,execution time: 0.0807s

...Processing mnist.mat ....
PCA ROC: 0.8477, precision @ rank n: 0.3675,execution time: 0.3324s




MCD ROC: 0.8677, precision @ rank n: 0.371,execution time: 6.3446s
OCSVM ROC: 0.8457, precision @ rank n: 0.3781,execution time: 11.4294s
LOF ROC: 0.707, precision @ rank n: 0.3357,execution time: 15.4705s
CBLOF ROC: 0.8432, precision @ rank n: 0.3958,execution time: 2.4868s
KNN ROC: 0.8417, precision @ rank n: 0.4205,execution time: 16.1865s
HBOS ROC: 0.5715, precision @ rank n: 0.1095,execution time: 0.1391s
ABOD ROC: 0.7947, precision @ rank n: 0.371,execution time: 17.684s
Isolation Forest ROC: 0.7933, precision @ rank n: 0.3216,execution time: 4.9076s
Feaute bagging ROC: 0.7049, precision @ rank n: 0.3498,execution time: 117.131s

...Processing musk.mat ....
PCA ROC: 0.9998, precision @ rank n: 0.9677,execution time: 0.2204s
MCD ROC: 1.0, precision @ rank n: 1.0,execution time: 15.4826s
OCSVM ROC: 1.0, precision @ rank n: 1.0,execution time: 1.5689s
LOF ROC: 0.6148, precision @ rank n: 0.1935,execution time: 1.9029s
CBLOF ROC: 1.0, precision @ rank n: 1.0,execution time: 0.4238s
K



MCD ROC: 0.3704, precision @ rank n: 0.0,execution time: 1.2177s
OCSVM ROC: 0.4634, precision @ rank n: 0.0,execution time: 1.5429s
LOF ROC: 0.4359, precision @ rank n: 0.0526,execution time: 1.7044s
CBLOF ROC: 0.7586, precision @ rank n: 0.0,execution time: 0.5415s
KNN ROC: 0.3506, precision @ rank n: 0.0,execution time: 2.0904s
HBOS ROC: 0.8247, precision @ rank n: 0.1579,execution time: 0.0359s
ABOD ROC: 0.484, precision @ rank n: 0.0175,execution time: 2.616s
Isolation Forest ROC: 0.6848, precision @ rank n: 0.0351,execution time: 1.0013s
Feaute bagging ROC: 0.4713, precision @ rank n: 0.0526,execution time: 15.8643s

...Processing pendigits.mat ....
PCA ROC: 0.9421, precision @ rank n: 0.3448,execution time: 0.0149s
MCD ROC: 0.8383, precision @ rank n: 0.0517,execution time: 2.7556s
OCSVM ROC: 0.9372, precision @ rank n: 0.3103,execution time: 1.0492s
LOF ROC: 0.418, precision @ rank n: 0.0345,execution time: 0.7599s
CBLOF ROC: 0.7858, precision @ rank n: 0.1207,execution time: 0.



MCD ROC: 0.9903, precision @ rank n: 0.7477,execution time: 12.8686s
OCSVM ROC: 0.9927, precision @ rank n: 0.957,execution time: 47.9314s
LOF ROC: 0.5325, precision @ rank n: 0.1283,execution time: 15.6181s
CBLOF ROC: 0.5863, precision @ rank n: 0.2657,execution time: 1.0562s
KNN ROC: 0.6463, precision @ rank n: 0.2271,execution time: 11.9251s
HBOS ROC: 0.9857, precision @ rank n: 0.9345,execution time: 0.021s
ABOD ROC: 0.6315, precision @ rank n: 0.1889,execution time: 18.4207s
Isolation Forest ROC: 0.9972, precision @ rank n: 0.9542,execution time: 4.1319s
Feaute bagging ROC: 0.449, precision @ rank n: 0.0437,execution time: 92.6652s

...Processing vertebral.mat ....
PCA ROC: 0.4428, precision @ rank n: 0.0667,execution time: 0.0025s
MCD ROC: 0.3885, precision @ rank n: 0.0,execution time: 0.0778s
OCSVM ROC: 0.4535, precision @ rank n: 0.0,execution time: 0.002s
LOF ROC: 0.3399, precision @ rank n: 0.0667,execution time: 0.004s
CBLOF ROC: 0.4041, precision @ rank n: 0.0,execution ti

In [25]:
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outliers Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.4927,1.2714,0.1997,0.2721,6.7782,0.2211,3.9253,2.5104,1.4543,1.1798
0,cardio,1831,21,9.6122,0.0159,1.1839,0.1664,0.2076,0.34,0.3428,0.0151,0.8394,0.9112,1.8599
0,glass,214,9,4.2056,0.0074,0.1024,0.007,0.0071,0.1167,0.0273,0.011,0.0887,0.7634,0.0845
0,ionosphere,351,33,35.8974,0.0133,0.1844,0.009,0.019,0.159,0.0643,0.0349,0.1632,0.8064,0.1999
0,letter,1600,32,6.25,0.0169,2.7097,0.1945,0.229,0.3074,0.3816,0.028,0.9367,1.0058,1.8387
0,lympho,148,18,4.0541,0.0059,0.1014,0.004,0.007,0.1104,0.0281,0.013,0.0659,0.763,0.0807
0,mnist,7603,100,9.2069,0.3324,6.3446,11.4294,15.4705,2.4868,16.1865,0.1391,17.684,4.9076,117.131
0,musk,3062,166,3.1679,0.2204,15.4826,1.5689,1.9029,0.4238,2.0007,0.0638,2.4654,1.4391,15.2532
0,optdigits,5216,64,2.8758,0.0539,1.2177,1.5429,1.7044,0.5415,2.0904,0.0359,2.616,1.0013,15.8643
0,pendigits,6870,16,2.2707,0.0149,2.7556,1.0492,0.7599,0.3311,0.7251,0.014,1.6895,0.9564,6.338


Arrhthmia File has 14.6% outliers and Forest Bagging Least time duration for training.

In [26]:
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outliers Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7997,0.8228,0.7986,0.7787,0.788,0.782,0.8511,0.7687,0.8343,0.7768
0,cardio,1831,21,9.6122,0.9444,0.8221,0.9292,0.5969,0.8566,0.7601,0.8684,0.6114,0.9309,0.6202
0,glass,214,9,4.2056,0.8512,0.7738,0.9405,0.9762,0.9405,0.9405,0.8452,0.9524,0.9048,0.9762
0,ionosphere,351,33,35.8974,0.7879,0.9556,0.8543,0.8943,0.8756,0.9295,0.533,0.9211,0.8503,0.8947
0,letter,1600,32,6.25,0.4951,0.7683,0.57,0.8452,0.721,0.8382,0.5515,0.8471,0.5665,0.8508
0,lympho,148,18,4.0541,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.8477,0.8677,0.8457,0.707,0.8432,0.8417,0.5715,0.7947,0.7933,0.7049
0,musk,3062,166,3.1679,0.9998,1.0,1.0,0.6148,1.0,0.7503,1.0,0.0853,0.9994,0.584
0,optdigits,5216,64,2.8758,0.481,0.3704,0.4634,0.4359,0.7586,0.3506,0.8247,0.484,0.6848,0.4713
0,pendigits,6870,16,2.2707,0.9421,0.8383,0.9372,0.418,0.7858,0.7372,0.9228,0.6691,0.9601,0.4224


ABOD has 79.97% accuracy in predicting outliers for arrhythmia  file.

In [27]:
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outliers Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.5,0.4286,0.5,0.4643,0.4643,0.5,0.5714,0.3571,0.5357,0.4286
0,cardio,1831,21,9.6122,0.6267,0.44,0.5467,0.1867,0.52,0.3467,0.4533,0.2533,0.6,0.1867
0,glass,214,9,4.2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,ionosphere,351,33,35.8974,0.54,0.86,0.74,0.76,0.76,0.86,0.36,0.82,0.68,0.76
0,letter,1600,32,6.25,0.0833,0.1111,0.0833,0.3611,0.1111,0.2222,0.0556,0.2222,0.0278,0.3611
0,lympho,148,18,4.0541,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.3675,0.371,0.3781,0.3357,0.3958,0.4205,0.1095,0.371,0.3216,0.3498
0,musk,3062,166,3.1679,0.9677,1.0,1.0,0.1935,1.0,0.2903,0.9677,0.0323,0.9355,0.1613
0,optdigits,5216,64,2.8758,0.0,0.0,0.0,0.0526,0.0,0.0,0.1579,0.0175,0.0351,0.0526
0,pendigits,6870,16,2.2707,0.3448,0.0517,0.3103,0.0345,0.1207,0.0345,0.2759,0.0345,0.3621,0.0345


FB has 54.67% precision in detecting outliers in cardio file.

Thhus, ionosphere file has max ouliers(35.89%)