In [1]:
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
from sympy import *

In [113]:
%%javascript
// Replace the output area's `_should_scroll` hook with one that always
// returns false, whatever `lines` is (presumably this keeps long cell
// outputs fully expanded instead of scroll-boxed -- confirm against the
// notebook front-end in use).
IPython.OutputArea.prototype._should_scroll = (lines) => false;

<IPython.core.display.Javascript object>

In [137]:
class OutlierDetector:
    """Step-by-step outlier detection on a small, labelled set of points.

    The detector works on a square proximity (distance) matrix held as a
    ``pandas.DataFrame`` whose rows and columns are point labels.  The matrix
    is either supplied directly or computed from ``points``.

    The ``detect*`` methods print and ``display`` every intermediate step of
    the calculation; they return nothing.  They rely on ``display`` being
    available in the global namespace (as it is inside an IPython/Jupyter
    session) and on ``Eq``/``Symbol`` from sympy.

    Attributes set by the constructor:
        points: the coordinates passed in, or ``None``.
        pt_list: list of point labels (generated as ``P1..Pn`` if not given).
        points_df: ``pandas.Series`` indexed by label, holding the coordinates
            when they are available and the labels themselves otherwise.
        proximity_matrix: label-indexed ``pandas.DataFrame`` of distances.
        distance: ``'Euclidean'`` or ``'Manhattan'``.
        rounding_digit: number of decimals used when rounding results.
    """

    def __init__(self, points = None, proximity_matrix = None, pt_list = None, distance='Euclidean', rounding_digit=2):
        """Build the detector from coordinates and/or a ready-made matrix.

        Args:
            points: optional sequence of coordinate tuples.
            proximity_matrix: optional square matrix of pairwise distances.
                When given, it is used as-is and ``points`` is only used for
                labelling the printed output.
            pt_list: optional labels, one per point.
            distance: metric used when the matrix has to be computed.
            rounding_digit: decimals kept when rounding distances/densities.

        Raises:
            ValueError: if neither ``points`` nor ``proximity_matrix`` is
                given, or if ``pt_list`` and ``points`` differ in length when
                the matrix must be computed from ``points``.
        """
        self.points = points
        self.distance = distance
        self.rounding_digit = rounding_digit
        # Explicit None/len checks: truthiness is ambiguous for array-likes.
        if pt_list is not None and len(pt_list) > 0:
            self.pt_list = list(pt_list)
        else:
            self.pt_list = None
        self.proximity_matrix = None

        if proximity_matrix is None:
            if not self._has_points():
                raise ValueError('Either points or proximity_matrix must be provided')
            if self.pt_list is not None and len(self.pt_list) != len(self.points):
                raise ValueError('pt_list must contain exactly one label per point')
            self.prepare_proximity_matrix()
            return

        if self.pt_list is None:
            self.pt_list = [f'P{i+1}' for i in range(len(proximity_matrix))]
        # Strip any existing labels so pandas relabels instead of reindexing
        # (reindexing a DataFrame with new labels would yield all-NaN cells).
        if isinstance(proximity_matrix, pd.DataFrame):
            values = proximity_matrix.to_numpy()
        else:
            values = proximity_matrix
        self.proximity_matrix = pd.DataFrame(values, index=self.pt_list, columns=self.pt_list)
        # Show coordinates next to each label when they line up with the
        # labels; otherwise fall back to a label -> label series.
        if self._has_points() and len(self.points) == len(self.pt_list):
            self.points_df = pd.Series(self.points, index=self.pt_list)
        else:
            self.points_df = pd.Series(self.pt_list, index=self.pt_list)

    def _has_points(self):
        """Return True when a non-empty ``points`` collection is stored."""
        pts = getattr(self, 'points', None)
        return pts is not None and len(pts) > 0

    def cal_distance(self, x1, x2):
        """Return the distance between ``x1`` and ``x2``, rounded to ``rounding_digit``.

        Raises:
            ValueError: if ``self.distance`` is not a supported metric.
        """
        if self.distance == 'Euclidean':
            value = dist.euclidean(x1, x2)
        elif self.distance == 'Manhattan':
            value = sum(abs(i - j) for i, j in zip(x1, x2))
        else:
            raise ValueError(
                f"Unsupported distance {self.distance!r}; expected 'Euclidean' or 'Manhattan'"
            )
        # Both metrics are rounded the same way so float noise never leaks
        # into the printed working.
        return round(value, self.rounding_digit)

    def prepare_proximity_matrix(self):
        """Compute ``proximity_matrix`` (and ``points_df``) from ``self.points``.

        Existing labels in ``pt_list`` are kept when there is one per point;
        otherwise labels ``P1..Pn`` are generated.
        """
        n = len(self.points)
        labels = getattr(self, 'pt_list', None)
        if labels is None or len(labels) != n:
            labels = [f'P{i+1}' for i in range(n)]
        self.pt_list = list(labels)
        self.points_df = pd.Series(self.points, index=self.pt_list)
        # Build the whole table first so the frame gets a numeric dtype.
        rows = [
            [self.cal_distance(self.points[i], self.points[j]) for j in range(n)]
            for i in range(n)
        ]
        self.proximity_matrix = pd.DataFrame(rows, index=self.pt_list, columns=self.pt_list)

    def detect(self, method, k):
        """Run the chosen detection method using ``k`` nearest neighbours.

        Args:
            method: ``'proximity-based'`` or ``'ard-based'``.
            k: number of nearest neighbours; must satisfy ``1 <= k <= n - 1``
                where ``n`` is the number of points.

        Raises:
            ValueError: on an unknown ``method`` or an out-of-range ``k``.
        """
        handlers = {
            'proximity-based': self.detect_proximity_based,
            'ard-based': self.detect_ard_based,
        }
        if method not in handlers:
            raise ValueError(
                f'Unknown method {method!r}; expected one of {sorted(handlers)}'
            )
        max_k = len(self.pt_list) - 1
        if not 1 <= k <= max_k:
            raise ValueError(f'k must be between 1 and {max_k}, got {k!r}')
        self.k = k
        handlers[method]()

    def _nearest_neighbours(self, pt):
        """Return the ``self.k`` smallest distances from ``pt`` to the other points.

        The point's own entry is removed by label rather than by position, so
        zero-distance duplicates are still reported as neighbours.  A stable
        sort keeps tied neighbours in ``pt_list`` order.
        """
        others = self.proximity_matrix.loc[pt].drop(labels=pt)
        return others.sort_values(ascending=True, kind='stable').head(self.k)

    def _print_point_header(self, pt):
        """Print the per-point heading, with coordinates when they are known."""
        if self._has_points():
            print(f'\nPoint {pt} = {self.points_df[pt]}:')
        else:
            print(f'\nPoint {pt}:')

    @staticmethod
    def _join_terms(terms):
        """Join ``terms`` as ``'a +b +c '`` (the layout used in the displayed sums)."""
        terms = list(terms)
        if not terms:
            return ''
        return ' +'.join(terms) + ' '

    def detect_proximity_based(self):
        """Print the proximity-based outlier score of every point.

        The score of a point is the mean distance to its ``self.k`` nearest
        neighbours; the point with the highest score is reported as the
        outlier (the first one in ``pt_list`` order on ties).
        """
        print('\nProximity based outlier detection')
        display(self.proximity_matrix)
        print(f'\nk = {self.k}')
        print('\nOutlier score, OS for a point p is given by:')
        display(Eq(Symbol('OS(p)'), Symbol('d(p, x_{1}) + d(p, x_{2})+...+d(p, x_{k})')/Symbol('k')))

        scores = {}
        for pt in self.pt_list:
            self._print_point_header(pt)
            nn = self._nearest_neighbours(pt)
            print(f'\nNearest {self.k} neighbours are {nn.index.values}')
            terms = self._join_terms(f'{d}' for d in nn)
            total = 0
            for d in nn:
                total += d
            score = total/self.k
            scores[pt] = score
            display(Eq(Symbol(f'OS({pt})'), Symbol(terms)/Symbol(f'{self.k}')))
            display(Eq(Symbol(f'OS({pt})'), Symbol(f'{total}')/Symbol(f'{self.k}')))
            display(Eq(Symbol(f'OS({pt})'), score))
            print('\n----------------------------------------------------------')

        max_pt = max(scores, key=scores.get, default=None)
        print(f'\nOutlier Score for point {max_pt} is the highest, so it is termed as outlier')

    def detect_ard_based(self):
        """Print the average-relative-density (ARD) working for every point.

        Pass 1 computes, per point, ``k`` divided by the summed distance to
        its ``k`` nearest neighbours (rounded).  Pass 2 divides each point's
        density by the mean density of its neighbours to get ARD, which this
        method reports as the point's LOF.  The point with the lowest value
        is reported as the outlier.

        NOTE(review): the LOF as usually defined is the inverse ratio, where
        large values indicate outliers; here LOF is taken to be ARD itself,
        so small values indicate outliers.
        """
        print('\nAverage Relative Density based outlier detection')
        display(self.proximity_matrix)
        print(f'\nk = {self.k}')
        print('\nFor all points x, determine local reachability density, lrd(x,k) for k-nearest neighbours using formula\n')
        display(Eq(Symbol('lrd(x,k)'), Symbol('|N(x,k)|')/Symbol('\u03A3_{y\u2208N(x,k)}dist(x,y)')))

        # Pass 1: density of every point.
        density_dict = {}
        for pt in self.pt_list:
            self._print_point_header(pt)
            nn = self._nearest_neighbours(pt)
            print(f'\nNearest {self.k} neighbours are {nn.index.values}')
            symbolic = self._join_terms(f'dist({pt},{label})' for label in nn.index.values)
            numeric = self._join_terms(f'{d}' for d in nn)
            total = 0
            for d in nn:
                total += d
            density = round(self.k/total, self.rounding_digit)
            density_dict[pt] = density
            display(Eq(Symbol(f'lrd({pt},{self.k})'), Symbol(f'{self.k}')/Symbol(symbolic)))
            display(Eq(Symbol(f'lrd({pt},{self.k})'), Symbol(f'{self.k}')/Symbol(numeric)))
            display(Eq(Symbol(f'lrd({pt},{self.k})'), Symbol(f'{self.k}')/Symbol(f'{total}')))
            display(Eq(Symbol(f'lrd({pt},{self.k})'), density))
            print('\n----------------------------------------------------------')

        print('\n==================================================================================')
        print('\nFor all points x, determine Local Outlier Factor LOF(x,k) using Average relative density, ARD(x,k), given by:')
        display(Eq(Symbol('ARD(x,k)'), Symbol('lrd(x,k)')/(Symbol('\u03A3_{y\u2208N(x,k)}lrd(y,k) / |N(x,k)|'))))

        # Pass 2: density of each point relative to its neighbours' mean density.
        lof_by_point = {}
        for pt in self.pt_list:
            self._print_point_header(pt)
            nn = self._nearest_neighbours(pt)
            print(f'\nNearest {self.k} neighbours are {nn.index.values}')
            neighbours = list(nn.index.values)
            symbolic = self._join_terms(f'lrd({label}, {self.k})/{self.k}' for label in neighbours)
            numeric = self._join_terms(f'{density_dict[label]}/{self.k}' for label in neighbours)
            mean_density = 0
            for label in neighbours:
                mean_density += density_dict[label]/self.k
            ard = round(density_dict[pt]/mean_density, self.rounding_digit)
            lof = ard
            lof_by_point[pt] = lof
            display(Eq(Symbol(f'ARD({pt},{self.k})'), Symbol(f'lrd({pt}, {self.k})')/Symbol(symbolic)))
            display(Eq(Symbol(f'ARD({pt},{self.k})'), Symbol(f'{density_dict[pt]}')/Symbol(numeric)))
            display(Eq(Symbol(f'ARD({pt},{self.k})'), ard))
            display(Symbol(f'LOF({pt}) = ARD({pt},{self.k}) = {lof}'))
            print('\n----------------------------------------------------------')

        min_pt = min(lof_by_point, key=lof_by_point.get, default=None)
        print(f'\nLOF for point {min_pt} is the lowest, so it is termed as outlier')
        
    

## Proximity-based outlier detection

In [138]:
# Sample 2-D coordinates; only the coordinate-driven variant noted below uses them.
points = [(1, 1), (2, 2), (4, 5), (3, 7), (2, 6)]

# One label per point, in the same order as the rows/columns of `pm`.
pt_list = list('ABCDE')

# Hand-specified symmetric matrix of pairwise distances.
pm = [
    [0, 1, 4, 5, 7],
    [1, 0, 2, 6, 8],
    [4, 2, 0, 3, 4],
    [5, 6, 3, 0, 4],
    [7, 8, 4, 4, 0],
]

# Alternative: OutlierDetector(points=points) derives the distances from the
# coordinates instead of taking them from `pm`.
OutlierDetector(
    proximity_matrix=pm,
    pt_list=pt_list,
).detect(method='proximity-based', k=2)



Proximity based outlier detection


Unnamed: 0,A,B,C,D,E
A,0,1,4,5,7
B,1,0,2,6,8
C,4,2,0,3,4
D,5,6,3,0,4
E,7,8,4,4,0



k = 2

Outlier score, OS for a point p is given by:


Eq(OS(p), d(p, x_{1}) + d(p, x_{2})+...+d(p, x_{k})/k)


Point A:

Nearest 2 neighbours are ['B' 'C']


Eq(OS(A), 1 +4 /2)

Eq(OS(A), 5/2)

Eq(OS(A), 2.5)


----------------------------------------------------------

Point B:

Nearest 2 neighbours are ['A' 'C']


Eq(OS(B), 1 +2 /2)

Eq(OS(B), 3/2)

Eq(OS(B), 1.5)


----------------------------------------------------------

Point C:

Nearest 2 neighbours are ['B' 'D']


Eq(OS(C), 2 +3 /2)

Eq(OS(C), 5/2)

Eq(OS(C), 2.5)


----------------------------------------------------------

Point D:

Nearest 2 neighbours are ['C' 'E']


Eq(OS(D), 3 +4 /2)

Eq(OS(D), 7/2)

Eq(OS(D), 3.5)


----------------------------------------------------------

Point E:

Nearest 2 neighbours are ['C' 'D']


Eq(OS(E), 4 +4 /2)

Eq(OS(E), 8/2)

Eq(OS(E), 4.0)


----------------------------------------------------------

Outlier Score for point E is the highest, so it is termed as outlier


## Average Relative Density-based outlier detection

In [139]:
# Same matrix and labels as the proximity-based run, scored with the
# ARD method using 3 neighbours and 3-decimal rounding.
OutlierDetector(
    proximity_matrix=pm,
    pt_list=pt_list,
    rounding_digit=3,
).detect(method='ard-based', k=3)


Average Relative Density based outlier detection


Unnamed: 0,A,B,C,D,E
A,0,1,4,5,7
B,1,0,2,6,8
C,4,2,0,3,4
D,5,6,3,0,4
E,7,8,4,4,0



k = 3

For all points x, determine local reachability density, lrd(x,k) for k-nearest neighbours using formula



Eq(lrd(x,k), |N(x,k)|/Σ_{y∈N(x,k)}dist(x,y))


Point A:

Nearest 3 neighbours are ['B' 'C' 'D']


Eq(lrd(A,3), 3/dist(A,B) +dist(A,C) +dist(A,D) )

Eq(lrd(A,3), 3/1 +4 +5 )

Eq(lrd(A,3), 3/10)

Eq(lrd(A,3), 0.3)


----------------------------------------------------------

Point B:

Nearest 3 neighbours are ['A' 'C' 'D']


Eq(lrd(B,3), 3/dist(B,A) +dist(B,C) +dist(B,D) )

Eq(lrd(B,3), 3/1 +2 +6 )

Eq(lrd(B,3), 3/9)

Eq(lrd(B,3), 0.333)


----------------------------------------------------------

Point C:

Nearest 3 neighbours are ['B' 'D' 'A']


Eq(lrd(C,3), 3/dist(C,B) +dist(C,D) +dist(C,A) )

Eq(lrd(C,3), 3/2 +3 +4 )

Eq(lrd(C,3), 3/9)

Eq(lrd(C,3), 0.333)


----------------------------------------------------------

Point D:

Nearest 3 neighbours are ['C' 'E' 'A']


Eq(lrd(D,3), 3/dist(D,C) +dist(D,E) +dist(D,A) )

Eq(lrd(D,3), 3/3 +4 +5 )

Eq(lrd(D,3), 3/12)

Eq(lrd(D,3), 0.25)


----------------------------------------------------------

Point E:

Nearest 3 neighbours are ['C' 'D' 'A']


Eq(lrd(E,3), 3/dist(E,C) +dist(E,D) +dist(E,A) )

Eq(lrd(E,3), 3/4 +4 +7 )

Eq(lrd(E,3), 3/15)

Eq(lrd(E,3), 0.2)


----------------------------------------------------------


For all points x, determine Local Outlier Factor LOF(x,k) using Average relative density, ARD(x,k), given by:


Eq(ARD(x,k), lrd(x,k)/Σ_{y∈N(x,k)}lrd(y,k) / |N(x,k)|)


Point A:

Nearest 3 neighbours are ['B' 'C' 'D']


Eq(ARD(A,3), lrd(A, 3)/lrd(B, 3)/3 +lrd(C, 3)/3 +lrd(D, 3)/3 )

Eq(ARD(A,3), 0.3/0.333/3 +0.333/3 +0.25/3 )

Eq(ARD(A,3), 0.983)

LOF(A) = ARD(A,3) = 0.983


----------------------------------------------------------

Point B:

Nearest 3 neighbours are ['A' 'C' 'D']


Eq(ARD(B,3), lrd(B, 3)/lrd(A, 3)/3 +lrd(C, 3)/3 +lrd(D, 3)/3 )

Eq(ARD(B,3), 0.333/0.3/3 +0.333/3 +0.25/3 )

Eq(ARD(B,3), 1.131)

LOF(B) = ARD(B,3) = 1.131


----------------------------------------------------------

Point C:

Nearest 3 neighbours are ['B' 'D' 'A']


Eq(ARD(C,3), lrd(C, 3)/lrd(B, 3)/3 +lrd(D, 3)/3 +lrd(A, 3)/3 )

Eq(ARD(C,3), 0.333/0.333/3 +0.25/3 +0.3/3 )

Eq(ARD(C,3), 1.131)

LOF(C) = ARD(C,3) = 1.131


----------------------------------------------------------

Point D:

Nearest 3 neighbours are ['C' 'E' 'A']


Eq(ARD(D,3), lrd(D, 3)/lrd(C, 3)/3 +lrd(E, 3)/3 +lrd(A, 3)/3 )

Eq(ARD(D,3), 0.25/0.333/3 +0.2/3 +0.3/3 )

Eq(ARD(D,3), 0.9)

LOF(D) = ARD(D,3) = 0.9


----------------------------------------------------------

Point E:

Nearest 3 neighbours are ['C' 'D' 'A']


Eq(ARD(E,3), lrd(E, 3)/lrd(C, 3)/3 +lrd(D, 3)/3 +lrd(A, 3)/3 )

Eq(ARD(E,3), 0.2/0.333/3 +0.25/3 +0.3/3 )

Eq(ARD(E,3), 0.68)

LOF(E) = ARD(E,3) = 0.68


----------------------------------------------------------

LOF for point E is the lowest, so it is termed as outlier
