In [354]:
from evolutionary_outliers_search import EvolutionaryOutliersSearch
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
import math
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [618]:
# Load the Boston housing Bunch and unpack the pieces used below.
data = load_boston()
X = data['data']
y = data['target']
columns = data['feature_names']

# Feature frame handed to the outlier search. CHAS (a 0/1 dummy variable per
# the dataset description) is left out.
df = pd.DataFrame(data=X, columns=columns).drop(columns=['CHAS'])

# Full frame: every original feature plus the target as the last column.
# It is used later to inspect the rows flagged by the search.
df2 = pd.DataFrame(data=np.column_stack([X, y]), columns=[*columns, "target"])

records_number, features_number = df.shape
df.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [575]:
# Print the description text bundled with the dataset.
print(data['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [627]:
s = -3  # significance level for the sparsity test (p-value 99.9%)
f = 3   # number of intervals each dimension is split into

# Dimensionality of the projections: floor(log_f(1 + records_number / s^2)).
# The result is a NumPy float (displayed as 3.0), not an int.
ratio = 1 + records_number / s ** 2
k = np.floor(np.log(ratio) / np.log(f))
k

3.0

In [644]:
# Settings for the search, collected in one place so they are easy to tweak.
# NOTE(review): p, p1 and p2 are not explained in this notebook; presumably
# they are the population size and the genetic-operator probabilities.
# Confirm against EvolutionaryOutliersSearch.
search_params = dict(
    results_number=10,
    dimensionality=k,
    p=20,
    p1=0.1,
    p2=0.1,
    f=f,
)
ev_search = EvolutionaryOutliersSearch(**search_params)

# The searcher is called directly on the feature matrix and returns the indexes
# of the detected rows together with sparsity coefficients.
# NOTE(review): no random seed is set anywhere in the notebook; if the search is
# stochastic, results will differ between runs. Confirm.
outliers_indexes, sparsity_coefs = ev_search(df.values)

In [637]:
# Show the detected rows, selected by position, with all columns of the full frame.
df2.iloc[outliers_indexes, :]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
61,0.17171,25.0,5.13,0.0,0.453,5.966,93.4,6.8185,8.0,284.0,19.7,378.08,14.44,16.0
221,0.40771,0.0,6.2,1.0,0.507,6.164,91.3,3.048,8.0,307.0,17.4,395.24,21.46,21.7
130,0.34006,0.0,21.89,0.0,0.624,6.458,98.9,2.1185,4.0,437.0,21.2,395.04,12.6,19.2
382,9.18702,0.0,18.1,0.0,0.7,5.536,100.0,1.5804,24.0,666.0,20.2,396.9,23.6,11.3
141,1.62864,0.0,21.89,0.0,0.624,5.019,100.0,1.4394,4.0,437.0,21.2,396.9,34.41,14.4
137,0.35233,0.0,21.89,0.0,0.624,6.454,98.4,1.8498,4.0,437.0,21.2,394.08,14.59,17.1
228,0.29819,0.0,6.2,0.0,0.504,7.686,17.0,3.3751,8.0,307.0,17.4,377.51,3.92,46.7
229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34,3.76,31.5
303,0.1,34.0,6.09,0.0,0.433,6.982,17.7,5.4917,7.0,329.0,16.1,390.43,4.86,33.1
302,0.09266,34.0,6.09,0.0,0.433,6.495,18.4,5.4917,7.0,329.0,16.1,383.61,8.67,26.4


In [605]:
# Summary statistics for every column of the full frame (features + target).
df2.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [647]:
# Reference value: the sparsity coefficient of a cube holding exactly
# `max_points_number` points, to compare with the coefficients from the search.
max_points_number = 3
prob = 1 / f ** k  # one cube out of the f**k cubes of a k-dimensional grid

total_records = df.shape[0]
expected_count = total_records * prob
count_spread = np.sqrt(expected_count * (1 - prob))
reference_coef = (max_points_number - expected_count) / count_spread

print("Sparcity coeficient for {} point - {}".format(max_points_number, reference_coef))
print("Sparcity coeficients for hypercubes where outliers were found", sparsity_coefs)

Sparcity coeficient for 3 point - -3.7053308954389528
Sparcity coeficients for hypercubes where outliers were found [-3.94072839 -4.17612588 -3.94072839 -3.94072839 -3.94072839 -3.94072839
 -3.94072839 -3.94072839 -3.7053309  -3.7053309 ]


## Conclusion
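Each point detected as an outlier comes from a hypercube that contains at most 3 points: every sparsity coefficient reported above is at or below the value computed for a cube with 3 points (about -3.71). 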
Some of the outliers we found, for example those with ids 61 and 141, were also mentioned in the article; the reasons for considering them outliers can be found in the paper. The other points need further investigation to find out why the algorithm considers them outliers. 
Samples with ids 302 and 303 are outliers for the same reason: both have a low nitric oxide concentration combined with a high index of accessibility to radial highways.