In [None]:
# reference: https://towardsdatascience.com/multivariate-outlier-detection-in-python-e946cfc843b3

In [42]:
import pandas as pd
import scipy as sp
from scipy import linalg
import numpy as np

# Source files to combine; each is read without header parsing, so the
# file's first line arrives as an ordinary data row.
csv_paths = (
    '/home/ukjung18/project/rel.csv',
    '/home/ukjung18/project/wf.csv',
    '/home/ukjung18/project/gh.csv',
)
df1, df2, df3 = (pd.read_csv(csv_path, header=None) for csv_path in csv_paths)

# Skip the first row of every frame before stacking them, then give the
# three columns their names.
df = pd.concat([frame.iloc[1:] for frame in (df1, df2, df3)])
df.columns = ['기관명', 'WGS84위도', 'WGS84경도']

In [43]:
# The frames were concatenated with their own row labels, so labels repeat
# across the combined frame; renumber the rows 0..n-1 and discard the old labels.
df = df.reset_index(drop=True)

In [44]:
def mahalanobis(x=None, data=None, cov=None):
    """Compute the squared Mahalanobis distance of each row of x from data.

    Parameters
    ----------
    x : array-like, shape (n, p) or (p,)
        Observations to score. A single 1-D vector is treated as one row.
    data : array-like, shape (m, p)
        Sample of the reference distribution (ndarray or DataFrame). Its
        column means are used as the centre.
    cov : array-like, shape (p, p), optional
        Covariance matrix of the distribution. If None, it is estimated
        from ``data`` with ``np.cov``.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``(x_i - mu) @ inv(cov) @ (x_i - mu)`` for every row ``x_i``, i.e. the
        squared distance (no square root is taken).

    Raises
    ------
    ValueError
        If ``x`` or ``data`` is None.
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    if x is None or data is None:
        raise ValueError("both x and data must be provided")

    # Work on plain float arrays so DataFrame and ndarray inputs behave alike.
    x_arr = np.atleast_2d(np.asarray(x, dtype=float))
    data_arr = np.asarray(data, dtype=float)

    # Explicit axis=0: the centre is the per-column mean, never a single
    # scalar averaged over every cell.
    x_minus_mu = x_arr - data_arr.mean(axis=0)

    # `cov is None` instead of `not cov`: the truth value of a matrix is
    # ambiguous and raises as soon as a covariance is actually supplied.
    if cov is None:
        cov = np.cov(data_arr, rowvar=False)
    inv_covmat = linalg.inv(np.atleast_2d(np.asarray(cov, dtype=float)))

    # Row-wise quadratic form; avoids materialising the full n x n product
    # only to read its diagonal.
    left_term = np.dot(x_minus_mu, inv_covmat)
    return (left_term * x_minus_mu).sum(axis=1)

In [45]:
# Take only the two coordinate columns, cast them to float, and attach each
# row's distance from the sample itself as a new 'mahala' column.
df_sub = df.loc[:, ['WGS84위도', 'WGS84경도']].astype(float)
df_sub = df_sub.assign(mahala=mahalanobis(x=df_sub, data=df_sub))

In [46]:
# Show the coordinates together with their 'mahala' values for a quick visual check.
df_sub

Unnamed: 0,WGS84위도,WGS84경도,mahala
0,37.660655,126.830347,1.135313
1,37.660655,126.830347,1.135313
2,37.459406,126.876809,0.333484
3,37.447650,126.884979,0.306224
4,37.591796,127.141059,1.010550
...,...,...,...
427,37.210062,127.008030,1.353923
428,37.207012,127.036257,1.370697
429,37.216545,127.078499,1.327415
430,37.216496,127.079942,1.331294


In [47]:
from scipy.stats import chi2

# Chi-square quantile at probability 1 - 0.01 with 2 degrees of freedom,
# evaluated through a frozen distribution object.
chi2(df=2).ppf(1 - 0.01)

9.21034037197618

In [48]:
# Right-tail chi-square probability (2 degrees of freedom) of each 'mahala'
# value. chi2.sf evaluates the tail directly; forming it as 1 - cdf loses
# precision when the tail probability is very small.
df_sub['p_value'] = chi2.sf(df_sub['mahala'], 2)

# Rows whose p-value is below the 0.01 significance level.
df_sub.loc[df_sub['p_value'] < 0.01]

Unnamed: 0,WGS84위도,WGS84경도,mahala,p_value
52,37.209534,127.642669,10.568128,0.005072
55,38.054524,127.386184,12.717036,0.001732
305,37.544094,127.620141,10.310528,0.005769
306,37.55726,127.780777,16.186967,0.000306
310,37.301482,127.625317,9.711347,0.007784
311,37.325586,127.668621,11.038884,0.004008


In [53]:
# Pick the flagged rows through the p-value mask rather than row labels copied
# by hand from a printed table, so the selection follows the data on re-runs.
# df and df_sub share the same row labels, so the boolean mask aligns.
anomaly = df.loc[df_sub['p_value'] < 0.01]

In [51]:
# Drop the rows flagged at the 0.01 level, deriving their labels from the
# p-value column instead of hard-coding them, so the result stays correct
# when the input files change.
new_df = df.drop(df_sub.index[df_sub['p_value'] < 0.01])

In [52]:
# Write the frame without the dropped rows, overwriting any existing file
# and omitting the row index.
new_df.to_csv(path_or_buf="disabledFacility.csv", index=False, mode='w')

In [55]:
# Write the selected anomaly rows, overwriting any existing file and
# omitting the row index.
anomaly.to_csv(path_or_buf="anomaly.csv", index=False, mode='w')