In [1]:
import sys
sys.path.insert(0,'../')
import pymce as mce

import time
import numpy as np
import glob
import h5py
import scipy.io as sio

from sklearn import neighbors 
from sklearn.ensemble import IsolationForest
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA,TruncatedSVD,NMF,FastICA,KernelPCA,IncrementalPCA
from sklearn.metrics import roc_auc_score

import matplotlib.pylab as plt
from matplotlib import gridspec
%matplotlib inline

In [5]:
# Two-cluster Ward agglomerative model shared by `clustering` below.
# The `affinity` keyword is intentionally not passed: Ward linkage only works
# with Euclidean distances, which is already the estimator's default, and the
# keyword itself was deprecated and later removed from scikit-learn.
agg = AgglomerativeClustering(n_clusters=2,
            connectivity=None,
            compute_full_tree='auto', linkage='ward')

def clustering(z_mu, chunk_size=4999):
    """Assign a cluster label to every row of `z_mu`, working chunk by chunk.

    The rows are split into `num // chunk_size + 1` nearly equal, contiguous
    chunks (so no chunk holds more than `chunk_size` rows) and the module-level
    `agg` estimator is fitted on each chunk separately.

    Parameters
    ----------
    z_mu : 2-D array, one row per sample.
    chunk_size : int, optional
        Upper bound on the number of rows handed to `agg` at once. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    y : float array of length `z_mu.shape[0]` holding the label of each row.

    NOTE(review): every chunk is labelled by an independent `fit_predict`
    call, so label ids are presumably not comparable across chunks - confirm
    that callers only ever pass inputs that fit in a single chunk or do not
    rely on cross-chunk consistency.
    """
    num = z_mu.shape[0]
    y = np.zeros(num)
    if num == 0:
        # Nothing to cluster; avoid fitting the estimator on an empty chunk.
        return y
    n_divide = num // chunk_size + 1
    for inds in np.array_split(np.arange(num), n_divide):
        y[inds] = agg.fit_predict(z_mu[inds, :])
    return y

# Short alias for the metric collection shipped with pymce; later cells pass it
# to mce.outliers and iterate over it to index the per-metric score dictionary.
metrics = mce.metrics

In [None]:
import os

# Benchmark sweep over the ten smallest datasets in ../../data/real.
# NOTE: this cell calls `corrector` and `ensemble_outliers`, which are defined
# in cells further down the notebook - run those cells first.
fils = sorted(glob.glob('../../data/real/*.mat'), key=os.path.getsize)
fils = fils[:10]
# To debug on a single dataset, override the list, e.g.:
# fils = ['../../data/real/glass.mat']
# fils = ['../../data/real/arrhythmia.mat']

n_try = 3  # NOTE(review): not used anywhere in this cell

for path in fils:
    # Dataset name = file name without directory and extension.
    name = os.path.splitext(os.path.basename(path))[0]

    try:
        data = sio.loadmat(path)
    except (NotImplementedError, ValueError):
        # scipy cannot read MATLAB v7.3 (HDF5) files; fall back to h5py and
        # close the file handle as soon as the arrays are copied out.
        with h5py.File(path, 'r') as data:
            X = np.array(data['X']).T.astype(float)
            y = np.array(data['y']).T
    else:
        X = data['X'].astype(float)
        y = data['y']

    n_ftrs = X.shape[1]

    # Encoder/decoder layer sizes: n_ftrs -> n_ftrs//2 -> 2 and back again.
    nn_en = [n_ftrs, n_ftrs//2, 2]
    nn_de = [2, n_ftrs//2, n_ftrs]
    network_architecture = [nn_en, nn_de]

    splitter = mce.Splitter(X, 'VAE', network_architecture, clustering)

    splitter.split(1, verbose=0, training_epochs=20)
    outliers = mce.outliers(X, splitter, metrics)

    # Scores "1" and "2": library ensemble vs. the notebook's ensemble,
    # both computed on the raw per-metric scores.
    ens_outliers = mce.ensemble_outliers(outliers)
    ens_outliers2 = ensemble_outliers(outliers)
    s2 = roc_auc_score(y==1, ens_outliers)
    s3 = roc_auc_score(y==1, ens_outliers2)

    # Scores "3" and "4": the same two ensembles after `corrector`.
    outliers = corrector(outliers)
    ens_outliers = mce.ensemble_outliers(outliers)
    ens_outliers2 = ensemble_outliers(outliers)
    s4 = roc_auc_score(y==1, ens_outliers)
    s5 = roc_auc_score(y==1, ens_outliers2)

    # Baseline: Local Outlier Factor on the raw features (negated so that
    # larger values mean "more outlying", matching the other scores).
    lof = neighbors.LocalOutlierFactor(n_neighbors=10)
    lof.fit(X)
    s1 = roc_auc_score(y==1, -lof.negative_outlier_factor_)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('name: {}, LOF: {:4.2f}, 1: {:4.2f}, 2: {:4.2f}, 3: {:4.2f}, 4: {:4.2f}'.format(name,s1,s2,s3,s4,s5))

name: lympho, LOF: 0.98, 1: 0.52, 2: 0.52, 3: 0.60, 4: 0.58
name: breastw, LOF: 0.41, 1: 1.00, 2: 1.00, 3: 1.00, 4: 1.00
name: wine, LOF: 0.94, 1: 0.99, 2: 0.99, 3: 0.99, 4: 0.92
name: vertebral, LOF: 0.49, 1: 0.39, 2: 0.41, 3: 0.44, 4: 0.43
name: glass, LOF: 0.78, 1: 0.92, 2: 0.92, 3: 0.58, 4: 0.45


In [232]:
# ROC AUC of each individual metric's scores. In the visible cells `y` and
# `outliers` are only assigned by the benchmark loop, so this inspects whatever
# dataset that loop processed last.
for metr in metrics:
    # One pre-formatted string keeps the "name value" layout on Python 2 and 3.
    print('{} {}'.format(metr, roc_auc_score(y==1, outliers[metr])))

cityblock 0.95479950563
L2 0.957281653392
L4 0.952258994782
braycurtis 0.445859653941
canberra 0.313708459215
chebyshev 0.93598770942
correlation 0.410028151607
mahalanobis 0.663475006866
wL2 0.955070722329
wL4 0.946951386982


In [34]:
# Compare the library ensemble with the notebook's own `ensemble_outliers`
# on the current `outliers` dictionary.
ens_outliers = mce.ensemble_outliers(outliers)
print(roc_auc_score(y==1, ens_outliers))

# `ens_outliers` is deliberately rebound here; only the second result survives.
ens_outliers = ensemble_outliers(outliers)
print(roc_auc_score(y==1, ens_outliers))

0.430158730159
0.438571428571


In [7]:
# Local Outlier Factor baseline on the current `X`; the factor is negated so
# that larger values mean "more outlying" before computing the ROC AUC.
from sklearn import neighbors 
lof = neighbors.LocalOutlierFactor(n_neighbors=10)
lof.fit(X)
print(roc_auc_score(y==1, -lof.negative_outlier_factor_))

0.731865284974


In [None]:
# Scatter the per-sample score of the selected metric(s) against sample index.
# Swap in the commented-out loop header to plot every metric instead of one.
# for i,metr in enumerate(metrics):
for i,metr in enumerate(['canberra']):
    # Number of scored samples for this metric; used to build the x axis.
    nn = outliers[metr].shape[0]
    plt.scatter(np.arange(nn),outliers[metr],c=mce.COLORS[i],edgecolors='none',label=metr)
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.5, 1.05))

In [110]:
def corrector(outliers):
    """Rescale each metric's scores to [0, 1] and flip the top-heavy ones.

    For every entry of `outliers` the scores are shifted and scaled so that
    they span [0, 1]. If fewer than half of the samples (floor division, as
    in the original Python 2 run) end up below 0.5, the scores are replaced
    by `1 - scores`.
    NOTE(review): presumably the flip encodes "most samples should look
    normal, i.e. score low" - confirm the intent.

    Parameters
    ----------
    outliers : dict
        Maps a metric name to a 1-D array of per-sample scores.

    Returns
    -------
    dict
        The same dictionary object, with every non-empty entry replaced by a
        new float array. The arrays passed in are not modified.

    Notes
    -----
    * A constant score vector is mapped to all zeros instead of NaN (0/0).
    * Integer score arrays are accepted; the result is always float.
    * Empty score arrays are left untouched.
    """
    assert isinstance(outliers, dict),'Input should be a dictionary contains outliers using a several metrics.'

    # Snapshot the items: entries are reassigned while looping, and
    # `.items()` exists on both Python 2 and Python 3 dictionaries.
    for metr, dist in list(outliers.items()):
        scores = np.array(dist, dtype=float)  # float copy; caller's array stays intact
        nd = scores.shape[0]
        if nd == 0:
            continue

        scores -= scores.min()
        peak = scores.max()
        if peak > 0:
            # Skip the division for constant vectors to avoid 0/0.
            scores /= peak

        # Explicit floor division keeps the Python 2 threshold on Python 3.
        if (scores < 0.5).sum() < nd // 2:
            scores = 1 - scores

        outliers[metr] = scores
    return outliers
        

In [237]:
from scipy.spatial.distance import braycurtis,canberra,chebyshev,cityblock,correlation,minkowski,wminkowski

def ensemble_outliers(outliers):
    """Combine per-metric outlier scores into one score per sample.

    Each metric's score vector is raised to the power 0.25 and the results
    are summed across metrics.

    Parameters
    ----------
    outliers : dict
        Maps a metric name to a 1-D array of per-sample scores. All arrays
        must have the same length.

    Returns
    -------
    numpy.ndarray
        1-D float array with one combined score per sample.

    Raises
    ------
    ValueError
        If `outliers` contains no metrics.

    Notes
    -----
    Negative scores yield NaN because of the fractional power.
    An earlier skew-weighted variant that lived here as commented-out code
    has been removed.
    """
    assert isinstance(outliers, dict),'Input should be a dictionary contains outliers using a several metrics.'

    if not outliers:
        raise ValueError('outliers is empty; need at least one metric to combine.')

    # Stack the score vectors as rows (metrics x samples). This replaces the
    # former round trip through a structured array and the helper it needed.
    stacked = np.array([np.asarray(dist, dtype=float) for dist in outliers.values()])
    return np.sum(np.power(stacked, 0.25), axis=0)


In [67]:
# ROC AUC of the notebook's own ensemble on the current `outliers` and `y`.
ens_outliers2 = ensemble_outliers(outliers)
print(roc_auc_score(y==1, ens_outliers2))

0.921329991761


In [213]:
# Sanity check: shape of the score array stored under the 'L2' metric.
outliers['L2'].shape

(214,)

In [217]:
def dic2array(x):
    """Pack a dict of equally long 1-D score vectors into a structured array.

    Parameters
    ----------
    x : dict
        Maps a field name to a 1-D sequence of numbers. All sequences must
        have the same length.

    Returns
    -------
    numpy.ndarray
        1-D structured array with one record per sample and one float field
        per key of `x`, in the dictionary's iteration order.
    """
    # `.items()` works on both Python 2 and Python 3 dictionaries.
    items = list(x.items())
    flds = [(key, float) for key, col in items]

    length = np.asarray(items[0][1]).shape[0] if items else 0
    table = np.zeros(length, dtype=flds)
    # Fill column by column instead of building one Python tuple per row.
    for key, col in items:
        table[key] = col
    return table

In [181]:
# Scratch experiment: hand the dict values straight to np.array together with
# a structured dtype. The recorded output shows each field coming back with
# shape (10, 214) rather than one record per sample, so this shortcut does not
# produce the layout that dic2array builds.
# NOTE(review): presumably relies on Python 2, where x.values() is a list.
x = outliers
np.array(x.values(),dtype=[(i,float) for i in x.keys()])['L2'].shape

(10, 214)

In [158]:
# Structured-dtype field specification: one (name, float) pair per key of x.
[(i,float) for i in x.keys()]

[('chebyshev', float),
 ('canberra', float),
 ('wL4', float),
 ('wL2', float),
 ('mahalanobis', float),
 ('braycurtis', float),
 ('L4', float),
 ('L2', float),
 ('correlation', float),
 ('cityblock', float)]

In [179]:
# Minimal structured-array example: a list of tuples becomes one record per
# tuple, and indexing by field name returns that field as a column.
# Note that this rebinds `x`, which earlier cells pointed at `outliers`.
x = np.array([(1,2),(3,4)],dtype=[('a','<i4'),('b','<i4')])
x['a']

array([1, 3], dtype=int32)