In [57]:
import numpy as np
import matplotlib.pylab as plt
from matplotlib import gridspec
import glob
import scipy.io as sio
import h5py
import sys
from sklearn.decomposition import PCA,TruncatedSVD,NMF,FastICA,KernelPCA,IncrementalPCA
from sklearn import neighbors 
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import IsolationForest
import pickle
import time
from joblib import Parallel, delayed
sys.path.insert(0,'../')
import outlier_finder as of
import pandas as pd

%matplotlib inline

In [2]:
# Build a synthetic training set: for every signal type, `num` regular samples
# followed by `n_out` samples that are additionally passed through of.event_sig.
n_sig = 1
num = 3000
n_out = 50

# Arguments forwarded unchanged to of.signal below.
sigma = 0.03
n1 = 0.02
n2 = 0.01
n3 = 0.02
n4 = 0.01
n_ftrs = 100
x = np.linspace(0,1,n_ftrs)

X_train = []
Y_train = []
for i in range(n_sig):
    # Regular samples of signal type i+1, labelled i.
    for j in range(num):
        X_train.append(of.signal(i+1,x,sigma,n1,n2,n3,n4))
        Y_train.append(i)
        
    # Samples of the same signal type passed through of.event_sig, labelled i+n_sig.
    for j in range(n_out):
        sig = of.signal(i+1,x,sigma,n1,n2,n3,n4)
        sig = of.event_sig(sig)
        X_train.append(sig)
        Y_train.append(i+n_sig)
        
# NOTE(review): float16 keeps only about three significant decimal digits -- confirm
# that this loss of precision is intended before the data are used downstream.
X_train = np.array(X_train).astype(np.float16)
Y_train = np.array(Y_train).astype(int)

In [3]:
# Module-level two-cluster clusterer (Ward linkage, euclidean affinity);
# splitter() calls fit_predict on this shared instance.
agg_source = of.AgglomerativeClustering(n_clusters=2,
                affinity='euclidean', connectivity=None,
                compute_full_tree='auto', linkage='ward')

def splitter(z_mu):
    """Label every row of ``z_mu`` with a cluster id, working chunk by chunk.

    The row indices are cut into ``int(num/4999)+1`` consecutive chunks with
    ``np.array_split`` and ``agg_source.fit_predict`` is run on each chunk on
    its own, so the labels of different chunks come from independent fits.

    Returns a float array with one label per row of ``z_mu``.
    """
    n_rows = z_mu.shape[0]
    n_chunks = int(n_rows/4999)+1
    labels = np.zeros(n_rows)
    row_ids = np.arange(n_rows)
    for chunk in np.array_split(row_ids, n_chunks):
        labels[chunk] = agg_source.fit_predict(z_mu[chunk,:])
    return labels

# Metric names handed to outliers() by the analysis functions in this notebook.
metrics = ['cityblock','L2','L4','expL4','braycurtis',
           'canberra','chebyshev','correlation','mahalanobis',
           'wL2','wL4','wexpL4']

def outliers(X_test,dc,metrics):
    """Rank the rows of ``X_test`` from largest to smallest distance for each metric.

    Parameters
    ----------
    X_test : 2-D array, one sample per row.
    dc : decomposer object; the last entries of ``dc.mean_vector``,
        ``dc.covariance`` and ``dc.dense_components`` are read.
    metrics : iterable of metric names. Names that are not handled below are
        silently skipped.

    Returns
    -------
    dict mapping each handled metric name to an index array into ``X_test``,
    sorted by decreasing distance (first index = largest distance).
    """
    nmetrics = ['cityblock','L2','L4','expL4','braycurtis',
           'canberra','chebyshev','correlation']
    wmetrics = ['wL2','wL4','wexpL4']

    n_test = X_test.shape[0]
    n_ftrs = X_test.shape[1]
    out_ind = {}

    mean_vec = dc.mean_vector[-1].reshape(-1,n_ftrs)
    cov = dc.covariance[-1].reshape(-1,n_ftrs,n_ftrs)
    # Per-component weights: inverse square root of the covariance diagonal,
    # with the diagonal clipped from below at 1e-4.
    w = np.array([1./np.sqrt(of.clip(np.diag(cv),a_min=1e-4)) for cv in cov])
    components = dc.dense_components[-1].reshape(-1,n_ftrs)

    if 'mahalanobis' in metrics:
        nc = cov.shape[0]
        cov_inv = np.zeros(cov.shape)
        for i in range(nc):
            # Pseudo-inverse so that singular covariance matrices are tolerated.
            cov_inv[i] = np.linalg.pinv(cov[i])

        distance_test = np.zeros(n_test)
        for i in range(n_test):
            dst_arr = np.zeros(nc)
            for j in range(nc):
                vec = (X_test[i]-mean_vec[j]).reshape(n_ftrs,1)
                # The product is a 1x1 array; take the scalar explicitly instead of
                # relying on implicit array-to-scalar conversion.
                dst_arr[j] = np.matmul(np.matmul(vec.T,cov_inv[j]),vec)[0,0]

            # Distance to the closest component. (The original repeated this
            # assignment once more after the loop, which failed for empty input.)
            distance_test[i] = np.min(dst_arr)

        out_ind['mahalanobis'] = np.argsort(distance_test)[::-1]

    for metric in metrics:
        if metric in nmetrics:
            distance_test = of.dist(metric,np.array(components),X_test)
            out_ind[metric] = np.argsort(distance_test)[::-1]
        elif metric in wmetrics:
            distance_test = of.dist(metric,np.array(components),X_test,w)
            out_ind[metric] = np.argsort(distance_test)[::-1]

    return out_ind

def quick_outlier_analysis(X,y,ns):
    """Benchmark several outlier rankings on one labelled data set.

    NOTE: a later cell of this notebook defines another function with the same
    name and a five-argument signature; once that cell runs it replaces this one.

    Parameters
    ----------
    X : 2-D array, one sample per row.
    y : label array; entries equal to 1 are treated as the true outliers.
    ns : number of rounds; ``dc.split(1, ...)`` is called once per round on the
        same decomposer and the rankings are scored after every round.

    Returns
    -------
    dict with ``'n_data'``, ``'n_out'`` and, per method, ``'recall_*'`` and
    ``'auc_*'`` entries.
    """
    out = {}
    n_data,n_ftrs = X.shape
    # Ground-truth outlier mask. Computed once up front: the LOF and isolation
    # forest sections need it too, and previously it only existed if the
    # decomposer loop had executed at least one round.
    t_out = y==1
    n_out = t_out.sum()
    nn_en = [n_ftrs, n_ftrs//2, 2]
    nn_de = [2, n_ftrs//2, n_ftrs]
    network_architecture = [nn_en,nn_de]
    out['n_data'] = n_data
    out['n_out'] = n_out

    dim_rs ={'AE':'AE','VAE':'VAE', 
             'PCA':PCA(n_components=2),
             'NMF':NMF(n_components=2), 
             'FastICA':FastICA(n_components=2)}

    # .items() and print(...) with a single argument behave the same on Python 2 and 3.
    for dim_r, value in dim_rs.items():
        is_network = dim_r=='VAE' or dim_r=='AE'
        if is_network:
            dc = of.decomposer(X, value, network_architecture, splitter)
        else:
            dc = of.decomposer_gen(X, of.sk_convert(value), splitter)

        for i in range(ns):
            print('ROUND '+str(i))
            if is_network:
                dc.split(1,verbose=True,training_epochs=20)
            else:
                dc.split(1,verbose=True)

            out_ind = outliers(X,dc,metrics)
            for metric in metrics:
                src = of.score(out_ind[metric][:n_out],t_out)
                out['recall_'+dim_r+'_'+metric+'_'+str(i)] = src[n_out-1]
                out['auc_'+dim_r+'_'+metric+'_'+str(i)] = roc_auc_score(t_out, of.ind2score(out_ind[metric]))

    # Baseline 1: local outlier factor for several neighbourhood sizes.
    for nn in [5,10,35]:
        lof = neighbors.LocalOutlierFactor(n_neighbors=nn, contamination=(1.*n_out)/n_data)
        lof.fit(X)
        scores_pred = lof.negative_outlier_factor_
        winds = np.argsort(scores_pred)   # ascending: lowest score first
        src = of.score(winds[:n_out],t_out)
        out['recall_LOF'+str(nn)] = src[n_out-1]
        out['auc_LOF'+str(nn)] = roc_auc_score(t_out, of.ind2score(winds))

    # Baseline 2: isolation forest.
    isof = IsolationForest(max_samples='auto', contamination=(1.*n_out)/n_data)
    isof.fit(X)
    scores_pred = isof.decision_function(X)
    winds = np.argsort(scores_pred)
    src = of.score(winds[:n_out],t_out)
    out['recall_isof'] = src[n_out-1]
    out['auc_isof'] = roc_auc_score(t_out, of.ind2score(winds))

    return out

def pklread(name):
    """Load and return the object pickled in the file at path ``name``.

    The file is opened in binary mode: pickle streams are bytes, and text mode
    breaks on Python 3 (and on Windows under Python 2).

    NOTE: unpickling can execute arbitrary code -- only use this on trusted files.
    """
    with open(name,'rb') as f:
        data = pickle.load(f)
    return data

In [37]:
# Layer-width lists n_ftrs -> n_ftrs//2 -> 2 and the reverse
# (presumably encoder / decoder sizes -- confirm against of.decomposer).
nn_en = [n_ftrs, n_ftrs//2, 2]
nn_de = [2, n_ftrs//2, n_ftrs]
network_architecture = [nn_en,nn_de]

# Two-component scikit-learn reducers keyed by a short name.
# NOTE(review): nothing else in the visible notebook reads this dict, and the
# name `dim_r` is also used as a loop variable inside quick_outlier_analysis.
dim_r ={'TSVD':TruncatedSVD(n_components=2), 'PCA':PCA(n_components=2)
       ,'NMF':NMF(n_components=2), 'FastICA':FastICA(n_components=2), 
       'KPCA':KernelPCA(n_components=2),'IPCA':IncrementalPCA(n_components=2)}

In [172]:
# Every .mat file under ../data/real ...
fils = glob.glob('../data/real/*.mat')
# ... immediately replaced by a single data set, so the glob result above is discarded.
fils = ['../data/real/arrhythmia.mat']
def job(X,y,n_t):
    """Run ``quick_outlier_analysis(X, y, 3)`` and hand back its result.

    ``n_t`` is accepted but never read in the body.
    """
    result = quick_outlier_analysis(X,y,3)
    return result

n_try = 3
# i=0
for i in range(len(fils)):
# for i in range(1):
    print(fils[i])
    try:
        data = sio.loadmat(fils[i])
        X = data['X'].astype(float)
        y = data['y']
    except Exception:
        # Fall back to reading the file as HDF5 when scipy cannot load it.
        # `except Exception` (instead of a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate. The file is opened read-only and closed as
        # soon as the arrays have been copied out of it.
        with h5py.File(fils[i],'r') as data:
            X = np.array(data['X']).T
            y = np.array(data['y']).T
        
    print(X.shape)
        
#     xxx= job(X,y,1)
        
#     b_time = time.time()
    
#     out = Parallel(n_jobs=int(3)) (delayed(job)(X,y,n_t) for n_t in range(n_try))  
            
#     print "Finished in", time.time()-b_time , "sec"
        
#     add = fils[i][:-4].split('/')[-1]
#     with open(add+'.pkl', 'wb') as f:
#         pickle.dump(out, f)
        
#     print '-------------'


../data/real/arrhythmia.mat
(452, 274)


In [4]:
def synt_event(i_sig):
    """Generate a synthetic data set of regular signals plus "event" signals.

    1000 samples of ``of.signal(i_sig, ...)`` are labelled ``i_sig``; 50 further
    samples of the same signal are passed through ``of.event_sig`` and labelled
    ``i_sig + 10``.

    Returns ``(X, y)`` as numpy arrays, regular samples first.
    """
    main_data = {i_sig:1000}
    event_data = {i_sig:50}
    sigma = 0.05
    n1 = 0.02
    n2 = 0.01
    n3 = 0.02
    n4 = 0.01

    n_ftrs = 200
    x = np.linspace(0,1,n_ftrs)

    X = []
    y = []
    # dict.items() works on both Python 2 and 3 (iteritems() is Python-2-only).
    for key,value in main_data.items():
        for _ in range(value):
            X.append(of.signal(key,x,sigma,n1,n2,n3,n4))
            y.append(key)

    for key,value in event_data.items():
        for _ in range(value):
            Xp = of.signal(key,x,sigma,n1,n2,n3,n4)
            X.append(of.event_sig(Xp))
            y.append(key+10)

    return np.array(X),np.array(y)

def synt_mix(i_sig, other_sigs=None):
    """Generate a synthetic data set of one main signal contaminated by other signals.

    3000 samples of ``of.signal(i_sig, ...)`` are labelled ``i_sig``. For every
    signal id in ``other_sigs`` that differs from ``i_sig``, 30 samples of that
    signal are appended and labelled ``i_sig + 10``.

    Parameters
    ----------
    i_sig : id of the main signal.
    other_sigs : iterable of candidate contaminating signal ids. Defaults to
        ``range(10)``, the ids the original loop iterated over.
        NOTE(review): other cells of this notebook use signal ids 1..10 --
        confirm whether id 0 is valid for of.signal or whether
        ``range(1, 11)`` was intended.

    Returns ``(X, y)`` as numpy arrays, main samples first.
    """
    if other_sigs is None:
        other_sigs = range(10)

    n_main = 3000
    n_per_other = 30
    sigma = 0.05
    n1 = 0.02
    n2 = 0.01
    n3 = 0.02
    n4 = 0.01

    n_ftrs = 200
    x = np.linspace(0,1,n_ftrs)

    X = []
    y = []
    for _ in range(n_main):
        X.append(of.signal(i_sig,x,sigma,n1,n2,n3,n4))
        y.append(i_sig)

    for i in other_sigs:
        if i!=i_sig:
            for _ in range(n_per_other):
                # Draw from signal `i`. The original passed the leaked loop variable
                # `key` (== i_sig) here, so the "outliers" were in fact further
                # samples of the main signal.
                X.append(of.signal(i,x,sigma,n1,n2,n3,n4))
                y.append(i_sig+10)

    return np.array(X),np.array(y)

In [177]:
# n_try = 3
# n_core = 3
# for i in range(1,2):
#     X, y = synt_event(i)
#     b_time = time.time()   
#     out = Parallel(n_jobs=int(n_core)) (delayed(job)(X,y,n_t) for n_t in range(n_try))          
#     print "Finished in", time.time()-b_time , "sec"
#     with open(str(i)+'_event.pkl', 'wb') as f:
#         pickle.dump(out, f)
#     np.save('X_'+str(i)+'_event',X)
#     np.save('y_'+str(i)+'_event',y)
        
# for i in range(1,2):
#     X, y = synt_mix(i)
#     b_time = time.time()   
#     out = Parallel(n_jobs=int(n_core)) (delayed(job)(X,y,n_t) for n_t in range(n_try))          
#     print "Finished in", time.time()-b_time , "sec"
#     with open(str(i)+'_mix.pkl', 'wb') as f:
#         pickle.dump(out, f)
#     np.save('X_'+str(i)+'_mix',X)
#     np.save('y_'+str(i)+'_mix',y)

In [6]:
def _synt_signals(counts,x,sigma,n1,n2,n3,n4):
    """Draw ``counts[key]`` samples of ``of.signal(key, ...)`` per key; return ``(X, y)`` arrays."""
    X = []
    y = []
    # dict.items() works on both Python 2 and 3 (iteritems() is Python-2-only).
    for key,value in counts.items():
        for _ in range(value):
            X.append(of.signal(key,x,sigma,n1,n2,n3,n4))
            y.append(key)
    return np.array(X),np.array(y)

def synt_unbalanced():
    """Generate an unbalanced synthetic train/test pair.

    The training set holds signals 1-4 (2000 samples each) and 5-6 (50 each);
    the test set holds the same classes plus signals 7-10 (50 each), which do
    not occur in training. Every sample is labelled with its signal id.

    The two sets are generated independently. The original kept appending the
    test samples to the training lists, so the returned test set also
    contained every training sample.

    Returns ``X_train, y_train, X_test, y_test`` as numpy arrays.
    """
    train_data = {1:2000,2:2000,3:2000,4:2000,5:50,6:50}
    test_data = {1:2000,2:2000,3:2000,4:2000,5:50,6:50,7:50,8:50,9:50,10:50}
    sigma = 0.05
    n1 = 0.02
    n2 = 0.01
    n3 = 0.02
    n4 = 0.01

    n_ftrs = 200
    x = np.linspace(0,1,n_ftrs)

    X_train,y_train = _synt_signals(train_data,x,sigma,n1,n2,n3,n4)
    X_test,y_test = _synt_signals(test_data,x,sigma,n1,n2,n3,n4)

    return X_train,y_train,X_test,y_test 

def quick_outlier_analysis(X_train,y_train,X_test,y_test,ns):
    """Benchmark several outlier rankings with separate training and test sets.

    NOTE: this redefines (and therefore replaces) the three-argument
    ``quick_outlier_analysis`` defined in an earlier cell; ``job()`` still calls
    the three-argument form.

    Parameters
    ----------
    X_train, y_train : training samples and labels (``y_train`` is not read).
    X_test, y_test : test samples and labels; labels 7, 8, 9 and 10 are treated
        as the true outliers.
    ns : number of rounds; ``dc.split(1, ...)`` is called once per round on the
        same decomposer and the test rankings are scored after every round.

    Returns
    -------
    dict with ``'n_data'``, ``'n_out'`` and, per method, ``'recall_*'`` and
    ``'auc_*'`` entries.
    """
    out = {}
    n_data,n_ftrs = X_train.shape
    # Ground-truth outlier mask. Computed once up front: the LOF and isolation
    # forest sections need it too, and previously it only existed if the
    # decomposer loop had executed at least one round.
    t_out = ((y_test==7) | (y_test==8) |(y_test==9) |(y_test==10))
    n_out = t_out.sum()
    nn_en = [n_ftrs, n_ftrs//2, 2]
    nn_de = [2, n_ftrs//2, n_ftrs]
    network_architecture = [nn_en,nn_de]
    out['n_data'] = n_data
    out['n_out'] = n_out

    dim_rs ={'AE':'AE','VAE':'VAE', 
             'PCA':PCA(n_components=2),
             'NMF':NMF(n_components=2), 
             'FastICA':FastICA(n_components=2)}

    # .items() and print(...) with a single argument behave the same on Python 2 and 3.
    for dim_r, value in dim_rs.items():
        is_network = dim_r=='VAE' or dim_r=='AE'
        if is_network:
            dc = of.decomposer(X_train, value, network_architecture, splitter)
        else:
            dc = of.decomposer_gen(X_train, of.sk_convert(value), splitter)

        for i in range(ns):
            print('ROUND '+str(i))
            if is_network:
                dc.split(1,verbose=True,training_epochs=20)
            else:
                dc.split(1,verbose=True)

            out_ind = outliers(X_test,dc,metrics)
            for metric in metrics:
                src = of.score(out_ind[metric][:n_out],t_out)
                out['recall_'+dim_r+'_'+metric+'_'+str(i)] = src[n_out-1]
                out['auc_'+dim_r+'_'+metric+'_'+str(i)] = roc_auc_score(t_out, of.ind2score(out_ind[metric]))

    # Baseline 1: local outlier factor, fitted on the test set itself.
    # NOTE(review): `contamination` divides by the number of *training* rows
    # although the estimator is fitted on X_test -- confirm this is intended.
    for nn in [5,10,35]:
        lof = neighbors.LocalOutlierFactor(n_neighbors=nn, contamination=(1.*n_out)/n_data)
        lof.fit(X_test)
        scores_pred = lof.negative_outlier_factor_
        winds = np.argsort(scores_pred)   # ascending: lowest score first
        src = of.score(winds[:n_out],t_out)
        out['recall_LOF'+str(nn)] = src[n_out-1]
        out['auc_LOF'+str(nn)] = roc_auc_score(t_out, of.ind2score(winds))

    # Baseline 2: isolation forest fitted on the training set, scored on the test set.
    isof = IsolationForest(max_samples='auto', contamination=(1.*n_out)/n_data)
    isof.fit(X_train)
    scores_pred = isof.decision_function(X_test)
    winds = np.argsort(scores_pred)
    src = of.score(winds[:n_out],t_out)
    out['recall_isof'] = src[n_out-1]
    out['auc_isof'] = roc_auc_score(t_out, of.ind2score(winds))

    return out

# def job(X,y,n_t):
#     return quick_outlier_analysis(X,y,3)

# Generate the unbalanced synthetic train/test pair and run a single round (ns=1)
# of the five-argument quick_outlier_analysis on it.
X_train,y_train,X_test,y_test = synt_unbalanced()
out = quick_outlier_analysis(X_train,y_train,X_test,y_test,1)

ROUND 0
Split level: 1
ROUND 0
Split level: 1
ROUND 0
Split level: 1
ROUND 0
Split level: 1
Epoch: 20, cost= 0.155176
ROUND 0
Split level: 1
Epoch: 20, cost= 117.607


In [12]:
# Load the pickled synthetic sets; the following cell reads their 'X' entries.
train = pklread('../data/syn/train.pkl')
event = pklread('../data/syn/event.pkl')

In [16]:
# Show the sizes of the two loaded sets. print(...) with a single argument
# produces the same output on Python 2 and Python 3.
print(train['X'].shape)
print(event['X'].shape)

(20000, 200)
(500, 200)


In [23]:
# Reduction-method names used to build the result keys, and one bar colour per
# method (same order, indexed together in analyze_plot).
dim_reds = ['AE','VAE','PCA','NMF','FastICA']
dim_clrs = ['r','b','g','y','orange']
# Metric names used to build the result keys.
metrics = ['cityblock','L2','L4','expL4','braycurtis',
           'canberra','chebyshev','correlation','mahalanobis',
           'wL2','wL4','wexpL4']

def mean_dic(res,case,num):
    """Mean and standard deviation of ``res[k][case]`` over the first ``num`` runs.

    ``res`` is indexed with ``0 .. num-1`` and each entry with ``case``.
    Returns the pair ``(np.mean, np.std)`` of the collected values.
    """
    samples = [res[run][case] for run in range(num)]
    return np.mean(samples),np.std(samples)

def analyze_plot(res,sl,out,n_runs=50):
    """Save a bar chart of mean AUC (with std error bars) for every method.

    Parameters
    ----------
    res : sequence of per-run result dicts, as consumed by ``mean_dic``.
    sl : round index; selects the ``'auc_<method>_<metric>_<sl>'`` keys.
    out : output path without extension; the figure is written to ``out+'.jpg'``.
    n_runs : number of runs averaged by ``mean_dic`` (default 50, the value that
        was previously hard-coded).

    One bar is drawn per (metric, reduction method) pair, followed by the three
    LOF variants and the isolation forest.
    """
    lbls = []
    values = []
    stds = []
    clrs = []

    for metr in metrics:
        for i,drm in enumerate(dim_reds):

            lbl = 'auc_'+drm+'_'+metr+'_'+str(sl)
            mm,ss = mean_dic(res,lbl,n_runs)
            values.append(mm)
            stds.append(ss)
            clrs.append(dim_clrs[i])
            # Second-to-last field of the key, i.e. the metric part of the label.
            lbls.append(lbl.split('_')[-2])

    for nn in ['5','10','35']:
        lbls.append('LOF'+nn)
        mm,ss = mean_dic(res,'auc_LOF'+nn,n_runs)
        values.append(mm)
        stds.append(ss)
        clrs.append('c')

    lbls.append('i-forest')
    mm,ss = mean_dic(res,'auc_isof',n_runs)
    values.append(mm)
    stds.append(ss)
    clrs.append('c')

    fig = plt.figure(figsize=(12, 5))
    gs = gridspec.GridSpec(1,1)
    ax = fig.add_subplot(gs[0])

    w_l = 0.5     # the width of the bars

    # One slot of width w_l per label. The original wrapped an equivalent
    # np.arange in reduce(np.append, ...), which changes nothing and is not
    # available as a builtin on Python 3.
    ind = np.arange(len(lbls))*w_l
    ax.set_xticks(ind + w_l / 2)
    ax.set_xticklabels(lbls,rotation='vertical',fontsize=7)
    ax.set_ylabel('AUC',fontsize=10)

    p = ax.bar(ind+w_l/4., values, w_l/2, yerr=stds, color=clrs)

    ax.set_ylim(0,1)
    ax.grid(True, which='both')
    # The first five bars carry the five method colours; the last bar stands for
    # the baselines.
    ax.legend((p[0], p[1], p[2], p[3], p[4], p[-1]), ('AE', 'VAE', 'PCA', 'NMF', 'ICA', 'Others'),
              bbox_to_anchor=(1.085, 1.022), fontsize=8)

    plt.subplots_adjust(bottom=0.15, top=0.98, left=0.05, right=0.92)
    plt.savefig(out+'.jpg',dpi=150)
    plt.close()
    
# Collect the benchmark result pickles.
fils = glob.glob('../results/benchmark/*.pkl')

# for fil in fils:
#     for sl in range(3):
#         res = pklread(fil)
#         out = fil.split('/')[-1]
#         analyze_plot(res,sl,fil[:-4]+'_'+str(sl))
        
 

In [131]:
def labeler(lbls):
    """Turn result keys such as ``'auc_FastICA_wexpL4_0'`` into axis labels.

    Parameters
    ----------
    lbls : iterable of strings.

    Returns
    -------
    list of strings of the same length, e.g. ``'ICA-exp(WL4)-0'``.
    """
    # Applied in order to every label; the order matters because later patterns
    # match the output of earlier ones.
    substitutions = [
        ('auc_', ''),
        ('Fast', ''),
        ('_', '-'),
        ('cityblock', 'Cityblock'),
        ('expL4', 'exp(L4)'),
        # After the previous step 'wexpL4' reads 'wexp(L4)'. The original looked
        # for 'wexp(L_4)', which never occurs, so the weighted variant was never
        # relabelled.
        ('wexp(L4)', 'exp(WL4)'),
        ('braycurtis', 'Bray-Curtis'),
        ('canberra', 'Canberra'),
        ('chebyshev', 'Chebyshev'),
        ('correlation', 'Correlation'),
        ('mahalanobis', 'Mahalanobis'),
    ]
    lbls2 = []
    for s in lbls:
        for old, new in substitutions:
            s = s.replace(old, new)
        lbls2.append(s)
    return lbls2

# Metric names used to build the 'auc_<method>_<metric>_<round>' result keys.
metrics = ['cityblock','L2','L4','expL4','braycurtis',
           'canberra','chebyshev','correlation','mahalanobis',
           'wL2','wL4','wexpL4']

In [132]:
import os
# Benchmark result files, smallest file first.
fils = sorted(glob.glob('../results/benchmark/*.pkl'), key=os.path.getsize)

n_mth = 4     # number of best decomposer results shown per data set
w_l = 0.5     # the width of the bars
w_b = 1.5     # extra gap between the bar groups of two data sets

# One figure per slice of up to five result files.
# NOTE: ind_pos and list_flat are defined in a later cell of this notebook.
for nf in range(0,21,5):
    print(nf)

    lbl_p = []
    v_p = []
    s_p = []
    name_p = []
    clrs_p = []

    for fil in fils[nf:nf+5]:

        res = pklread(fil)
        lbls = []
        values = []
        stds = []

        name = fil.split('/')[-1]
        name_p.append(name[:-4])

        # Mean/std AUC of every (metric, method, round) combination.
        for metr in metrics:
            for i,drm in enumerate(dim_reds):
                for sl in range(3):
                    lbl = 'auc_'+drm+'_'+metr+'_'+str(sl)
                    lbls.append(lbl)
                    mm,ss = mean_dic(res,lbl,50)
                    values.append(mm)
                    stds.append(ss)

        # Keep the n_mth combinations with the highest mean AUC.
        sort_ind = np.argsort(np.array(values))[::-1]
        sort_ind = sort_ind[:n_mth]

        lbls = np.array(lbls)[sort_ind]
        values = np.array(values)[sort_ind]
        stds = np.array(stds)[sort_ind]

        # Best LOF variant. Starting from None (rather than 0) guarantees that
        # the mean, std and name always belong to the current file; previously a
        # file whose means were all <= 0 reused values from the preceding file.
        mmm = sss = nnn = None
        for nn in ['5','10','35']:
            mm,ss = mean_dic(res,'auc_LOF'+nn,50)
            if mmm is None or mm>mmm:
                mmm = mm
                sss = ss
                nnn = nn

        lbls = np.append(lbls,'LOF'+nnn)
        values = np.append(values,mmm)
        stds = np.append(stds,sss)

        lbls = np.append(lbls,'i-forest')
        mm,ss = mean_dic(res,'auc_isof',50)
        values = np.append(values,mm)
        stds = np.append(stds,ss)

        clrs_p.append(['b']*n_mth+['r','g'])

        lbl_p.append(lbls)
        v_p.append(values)
        s_p.append(stds)

    n_grp = len(name_p)
    if n_grp==0:
        # No result files in this slice: nothing to draw.
        continue

    lbl_p = list_flat(lbl_p)
    v_p = list_flat(v_p)
    s_p = list_flat(s_p)
    clrs_p = list_flat(clrs_p)

    fig = plt.figure(figsize=(12, 5))
    gs = gridspec.GridSpec(1,1)
    ax = fig.add_subplot(gs[0])

    # Use the number of files actually loaded; a hard-coded 5 produced more bar
    # positions than values whenever the last slice was shorter.
    ind = ind_pos(n_mth+2,n_grp,w_l,w_b)
    ax.set_xticks(ind + w_l / 2)
    ax.set_xticklabels(labeler(lbl_p),rotation='vertical',fontsize=12)
    ax.set_ylabel('AUC',fontsize=15)
    plt.yticks(fontsize=12)

    # Data-set name above each bar group.
    for i,name in enumerate(name_p):
        ax.text((i+0.5)*(n_mth+1)*w_l+i*w_b-len(name)/12., 1.04, name, fontsize=18)

    p = ax.bar(ind+w_l/4., v_p, 2.*w_l/3, yerr=s_p, color=clrs_p
          , error_kw=dict(lw=3, capsize=5, capthick=3, ecolor='k'))

    ax.grid(True, which='both')
    ax.legend((p[0], p[n_mth], p[n_mth+1]), ('MCE', 'LOF', 'i-forest'),
              bbox_to_anchor=(1.135, 1.04), fontsize=12)

    ax.set_ylim(0,1)

    plt.subplots_adjust(bottom=0.35, top=0.92, left=0.06, right=0.89)
    plt.savefig('real_'+str(nf)+'.jpg',dpi=300)
    plt.close()

0
5
10
15
20


In [140]:
# Summary table for the real data sets: best mean AUC per method and data set.
# NOTE: tdigit and table_gen are defined in a later cell of this notebook.
fils = sorted(glob.glob('../results/benchmark/*.pkl'), key=os.path.getsize)

columns = ['Dataset']+['PCA','ICA','NMF','AE','VAE','LOF','i-forest']
n_row = len(fils)
index = np.arange(n_row) # array of numbers for the number of samples
df = pd.DataFrame(columns=columns, index = index)

# Table header for each entry of dim_reds (identical unless listed here). Results
# are stored by column name: storing them by position put the dim_reds order
# (AE, VAE, PCA, ...) under the PCA, ICA, NMF, ... headers.
col_of = {'FastICA':'ICA'}

for i,fil in enumerate(fils):

    res = pklread(fil)

    name = fil.split('/')[-1]
    # .loc instead of chained indexing (df[col][i] = ...), which may write to a copy.
    df.loc[i,'Dataset'] = name[:-4]

    for drm in dim_reds:
        # Best (metric, round) combination for this method.
        mmm = sss = None
        for metr in metrics:
            for sl in range(3):
                lbl = 'auc_'+drm+'_'+metr+'_'+str(sl)
                mm,ss = mean_dic(res,lbl,50)
                if mmm is None or mm>mmm:
                    mmm = mm
                    sss = ss
        df.loc[i,col_of.get(drm,drm)] = tdigit(mmm,sss)

    # Best LOF variant.
    mmm = sss = nnn = None
    for nn in ['5','10','35']:
        mm,ss = mean_dic(res,'auc_LOF'+nn,50)
        if mmm is None or mm>mmm:
            mmm = mm
            sss = ss
            nnn = nn
    df.loc[i,'LOF'] = tdigit(mmm,sss)

    mm,ss = mean_dic(res,'auc_isof',50)
    df.loc[i,'i-forest'] = tdigit(mm,ss)

table_gen(df,'real_data')

In [133]:
# Summary table for the synthetic runs of type `case`: one row per method, one
# column per signal id.
# NOTE: tdigit and table_gen are defined in a later cell of this notebook.
case = 'mix'
n_sig = 10

columns = ['DRO']+[str(i+1) for i in range(n_sig)]
dmrs = ['PCA','ICA','NMF','AE','VAE','LOF','i-forest']
n_row = len(dmrs)
index = np.arange(n_row) # array of numbers for the number of samples
df = pd.DataFrame(columns=columns, index = index)
df['DRO'] = dmrs

# Row of every method label, plus the label used for each dim_reds entry
# (identical unless listed). Results are stored by label: storing them by position
# put the dim_reds order (AE, VAE, PCA, ...) into the PCA, ICA, NMF, ... rows.
row_of = dict((label,k) for k,label in enumerate(dmrs))
label_of = {'FastICA':'ICA'}

for i_sig in range(n_sig):
    sig = i_sig+1
    fils = glob.glob('../results/synthetic/'+str(sig)+'_'+'*'+case+'_3.pkl')

    if not fils:
        # Without this guard the results of the previous signal would be reused.
        print('no result file for signal '+str(sig))
        continue

    # If several files match, the last one read is the one that is tabulated.
    for fil in fils:
        print(fil)
        res = pklread(fil)

    for drm in dim_reds:
        # Best (metric, round) combination for this method.
        m_max = s_max = None
        for metr in metrics:
            for sl in range(3):
                lbl = 'auc_'+drm+'_'+metr+'_'+str(sl)
                mm,ss = mean_dic(res,lbl,50)
                if m_max is None or mm>m_max:
                    m_max = mm
                    s_max = ss
                    lb_max = lbl

        # .loc instead of chained indexing (df[col][i] = ...), which may write to a copy.
        df.loc[row_of[label_of.get(drm,drm)],str(sig)] = tdigit(m_max,s_max)

    # Best LOF variant.
    mmm = sss = nnn = None
    for nn in ['5','10','35']:
        mm,ss = mean_dic(res,'auc_LOF'+nn,50)
        if mmm is None or mm>mmm:
            mmm = mm
            nnn = nn
            sss = ss

    df.loc[row_of['LOF'],str(sig)] = tdigit(mmm,sss)

    mm,ss = mean_dic(res,'auc_isof',50)
    df.loc[row_of['i-forest'],str(sig)] = tdigit(mm,ss)

table_gen(df,case)

../results/synthetic/1_mix_3.pkl
../results/synthetic/2_mix_3.pkl
../results/synthetic/3_mix_3.pkl
../results/synthetic/4_mix_3.pkl
../results/synthetic/5_mix_3.pkl
../results/synthetic/6_mix_3.pkl
../results/synthetic/7_mix_3.pkl
../results/synthetic/8_mix_3.pkl
../results/synthetic/9_mix_3.pkl
../results/synthetic/10_mix_3.pkl


In [105]:
def table_gen(df,name):
    """Write ``df`` as a standalone LaTeX table and compile it with xelatex.

    Parameters
    ----------
    df : pandas DataFrame; rendered with ``to_latex(escape=False, index=False)``,
        so cell contents are inserted as raw LaTeX.
    name : output path without extension. ``name+'.tex'`` is written, xelatex is
        run on it, and ``name+'.aux'`` / ``name+'.log'`` are removed afterwards
        if they exist. The ``.tex`` file is kept.
    """
    import os
    import subprocess

    header = r'''\documentclass[border=0.5in]{standalone}
\usepackage{booktabs}
\usepackage{pdflscape}
\usepackage[a4paper,bindingoffset=0.2in,%
        left=0.25in,right=0.25in,top=1in,bottom=1in,%
        footskip=.25in]{geometry}
\begin{document}
\begin{centering}
\pagenumbering{gobble}
\oddsidemargin = 0pt
\hoffset = -0.25in
\topmargin = 1pt
\headheight = 0pt
\headsep = 0pt
    '''
    # Raw string: '\e' is not a valid escape sequence in a normal string literal.
    # The resulting text is unchanged.
    footer = r'''
\end{centering}
\end{document}
    '''
    tab = header+df.to_latex(escape=False, index=False)+footer

    # Context manager so the file is flushed and closed before xelatex reads it.
    with open(name+'.tex','w') as f:
        f.write(tab)

    subprocess.call(['xelatex', name+'.tex'])

    # Remove the auxiliary files without shelling out to `rm`; skip files that
    # were not produced (e.g. when the compilation failed early).
    for ext in ('.aux','.log'):
        if os.path.exists(name+ext):
            os.remove(name+ext)

def tdigit(x,e):
    """Format a value and its error as a LaTeX math string for a table cell.

    ``x`` is written with two significant digits. When ``e`` is below 1e-2 only
    the value is shown; otherwise ``\\pm`` and ``e`` with one significant digit
    are appended.
    """
    value_txt = '{:.2g}'.format(x)
    if e<1e-2:
        return '$'+value_txt+'$'
    error_txt = '{:.1g}'.format(e)
    return '$'+value_txt+'\\pm'+error_txt+'$'

# supervised
# Summary table for the supervised synthetic run: one row per method, one column
# per metric, best round per cell.
metrics_sup = ['cityblock','L2','L4','expL4','braycurtis',
           'chebyshev','correlation','mahalanobis']

res = pklread('../results/synthetic/supervised_2.pkl')

columns = ['DRO']+metrics_sup
dmrs = ['PCA','ICA','NMF','AE','VAE','LOF','i-forest']
n_row = len(dmrs)
index = np.arange(n_row) # array of numbers for the number of samples
df = pd.DataFrame(columns=columns, index = index)
df['DRO'] = dmrs

# Row of every method label, plus the label used for each dim_reds entry
# (identical unless listed). Results are stored by label: storing them by position
# put the dim_reds order (AE, VAE, PCA, ...) into the PCA, ICA, NMF, ... rows.
row_of = dict((label,k) for k,label in enumerate(dmrs))
label_of = {'FastICA':'ICA'}

for metr in metrics_sup:
    for drm in dim_reds:
        # Best round for this (method, metric) pair.
        m_max = s_max = None
        for sl in range(3):
            lbl = 'auc_'+drm+'_'+metr+'_'+str(sl)
            mm,ss = mean_dic(res,lbl,50)
            if m_max is None or mm>m_max:
                m_max = mm
                s_max = ss

        # .loc instead of chained indexing (df[col][i] = ...), which may write to a copy.
        df.loc[row_of[label_of.get(drm,drm)],metr] = tdigit(m_max,s_max)

# Best LOF variant.
mmm = sss = nnn = None
for nn in ['5','10','35']:
    mm,ss = mean_dic(res,'auc_LOF'+nn,50)
    if mmm is None or mm>mmm:
        mmm = mm
        nnn = nn
        sss = ss

# The two baselines have a single score each; it is written into the first
# metric column ('cityblock') of their rows.
df.loc[row_of['LOF'],'cityblock'] = tdigit(mmm,sss)

mm,ss = mean_dic(res,'auc_isof',50)
df.loc[row_of['i-forest'],'cityblock'] = tdigit(mm,ss)

table_gen(df,'supervised')

In [26]:
def ind_pos(nl,nc,w_l,w_b):
    """Positions for ``nc`` groups of ``nl`` slots each, as a flat array.

    Slot ``j`` of group ``i`` is placed at ``j*w_l + i*(nl-1)*w_l + i*w_b``;
    groups are emitted one after another.
    """
    positions = [j*w_l+i*(nl-1)*w_l+i*w_b
                 for i in range(nc)
                 for j in range(nl)]
    return np.array(positions)

def list_flat(lst):
    """Convert ``lst`` to a numpy array and return it reshaped to one dimension."""
    arr = np.array(lst)
    return arr.reshape(arr.size)

def ind2score(oi):
    """Convert an index ordering into scores between 0 and 1.

    The last index in ``oi`` receives 0, the first receives 1, and the indices
    in between receive evenly spaced values. Positions not listed in ``oi``
    stay at 0.
    """
    n_items = oi.shape[0]
    ramp = np.linspace(0,1,n_items)
    ranked = np.zeros(n_items)
    back_to_front = oi[::-1]
    ranked[back_to_front] = ramp
    return ranked

array([ 0. ,  0.1,  0.2,  0.3,  0.4])