In [1]:
import pandas as pd
import numpy as np

import math
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.utils import resample

from tqdm import tqdm 


In [2]:
n_estimators = 10
max_samples = 0.5
select_data = 'letter'

In [3]:
if select_data == 'letter':
    # データの取得
    df = pd.read_csv('../data/letter_recognition.csv', header=None)

    # Aのみを判定するため，Aを0，A以外を1にした．
    # 少数派のAを正常，その他を異常データと定義
    df[0] = df[0].apply(lambda x: 0 if x == 'A' else 1)

    #Xとyを入力
    X = df[range(1,17)]
    y = df[0]

elif select_data == 'wine':

    import tensorflow as tf

    dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

    # ファイルのダウンロード
    dataset_path = tf.keras.utils.get_file('wine.data', dataset_url)

    print(dataset_path)

    column_names = ['Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline' 
    ]

    raw_data = pd.read_csv(dataset_path, names=column_names)
    raw_data['y'] = raw_data.index
    raw_data = raw_data.reset_index(drop=True)

    raw_data['y'] = raw_data['y'].apply(lambda x: 0 if x == 3 else 1)

    X = raw_data.drop('y', axis=1)
    y = raw_data['y']

else:
    print('そのデータはありません')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
# 必要な関数の定義

# 共分散行列の逆行列
def inv_cov(Z):
    #標準化後のベクトルを入力する
    #標準化した後なので相関行列と分散共分散行列は一致する
    c = np.cov(Z.T)
    return np.linalg.inv(c)

#マハラノビス汎距離
def cal_MD(Z, inv_C):
    '''
    Z:標準化したベクトル
    inv_C:標準化後の共分散行列
    '''
    MD = []
    for i in range(len(Z)):
        _a = np.dot(Z[i], inv_C)
        _MD = np.dot(_a, Z[i].T)
        _MD = _MD / Z.shape[1]
        MD.append(_MD)
    return MD

# MTSを実行
def fit_MTS(X, y):
    
    # 正常データのみを使用して標準化
    scaler = StandardScaler()
    scaler.fit(X[y == 0])
    normal_Z = scaler.transform(X[y == 0])
    anomaly_Z = scaler.transform(X[y == 1])

    # 正常データのみを使用して共分散行列を計算
    inv_C = inv_cov(normal_Z)

    # いったん飛ばす，削除の基準は？削除しない方法もあるっぽい？
        #１度目の仮のマハラノビス距離を計算
        # MD_1st = cal_MD(normal_Z, inv_C)
        # もしもマハラノビス距離が余りにも大きいサンプルがあれば任意で削除する
        # 削除後のデータを使用して標準化と共分散行列を計算

    # 異常データと直交表を用いてSN比を計算
    #L8直行表
    df_l8 = pd.DataFrame([[1,1,1,1,1,1,1],[1,1,1,2,2,2,2],[1,2,2,1,1,2,2],[1,2,2,2,2,1,1],[2,1,2,1,2,1,2],[2,1,2,2,1,2,1],[2,2,1,1,2,2,1],[2,2,1,2,1,1,2]])
    l8 = (df_l8==1).values

    #異常データのマハラノビス距離
    result = np.zeros((l8.shape[0], anomaly_Z.shape[0]))
    for x, l8_row in enumerate(l8):
        result[x] = cal_MD(anomaly_Z[:, l8_row], inv_C[l8_row][:,l8_row])

    #SN比
    sn = np.zeros(l8.shape[0])
    for idx, row in enumerate(result):
        sum_MD = 0
        for i in range(len(row)):
            sum_MD += 1 / row[i]
        sn[idx] = -10 * math.log10(sum_MD / len(row))
        
    # SN比を利用し，不要と思われる変数を削除する
    #変数選択
    df_sn = pd.DataFrame(index=X.columns, columns=['SN比','残す'])
    for i, clm in enumerate(X.columns):
        df_sn.loc[df_sn.index == clm, 'SN比'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]])
        df_sn.loc[df_sn.index == clm, '残す'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]]) > 0
    select_columns = df_sn[df_sn['残す']].index
    
    if len(select_columns) > 1:
        # 選択変数でのスケーラーと共分散行列を計算
        result_scaler = StandardScaler()
        result_scaler.fit(X[select_columns][y == 0])
        result_Z = result_scaler.transform(X[select_columns][y == 0])
        result_inv_C = inv_cov(result_Z)
    else:
        select_columns = df_sn['SN比'].astype(float).idxmax()
        result_scaler = 0
        result_inv_C = 0

    # 単位空間のスケーラーと共分散行列と選択した変数を出力
    return result_scaler, result_inv_C, select_columns

# 新しいデータのマハラノビス距離を計算する
def predict_MTS(X, scaler, inv_C, select_columns):
    Z = scaler.transform(X[select_columns])
    MD = cal_MD(Z, inv_C)
    return MD

def determine_threshold(y_true, y_pred):
    df_pred = pd.DataFrame(y_true)
    df_pred['pred'] = y_pred
    df_pred = df_pred.sort_values('pred').reset_index(drop=True)


    min_gini = np.inf
    threshold = 0
    for i in range(len(df_pred)):
        
        neg = df_pred.iloc[:i+1]
        pos = df_pred.iloc[i:]

        p_neg = sum(neg[y_true.name]) / len(neg)
        gini_neg = 1 - ( p_neg ** 2 + ( 1 - p_neg ) ** 2 )

        p_pos = sum(pos[y_true.name]) / len(pos)
        gini_pos = 1 - ( p_pos ** 2 + ( 1 - p_pos ) ** 2 )

        gini_split = (len(neg) / len(df_pred) * gini_neg) + (len(pos) / len(df_pred) * gini_pos)

        if min_gini > gini_split:
            min_gini = gini_split
            threshold = df_pred.iloc[i]['pred']
            threshold_idx = i
        # print(i, gini_split, df_pred.iloc[i]['pred'])

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('Best paramater')
    print(threshold_idx, min_gini, threshold)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    print('AUC : ', roc_auc_score(y_true.values, y_pred))

    recall = df_pred.iloc[threshold_idx + 1:][y_true.name].sum() / df_pred[y_true.name].sum()
    print('recall : ', recall)

    precision = df_pred.iloc[threshold_idx + 1:][y_true.name].mean()
    print('precision :', precision)

    g_mean = np.sqrt(recall * precision)
    print('g_mean : ', g_mean)

    RS = recall / precision
    print('RS : ', RS)
    return threshold

In [6]:

# K:再標本化の回数 SIZE:再標本化されたもののサンプルサイズ
K = n_estimators
SIZE = int(len(X) * max_samples)

# 予測に必要なパラメータ
select_columns = [0] * K
result_scaler = [0] * K
result_inv_C = [0] * K
threshold = [0] * K

for i in tqdm(range(K)):
    # bootstrap sampling
    resampled_data_x, resampled_data_y = resample(X_train, y_train, n_samples = SIZE)
    random_s = random.sample(list(resampled_data_x.columns), 7)
    resampled_data_x = resampled_data_x[random_s]

    result_scaler[i], result_inv_C[i], select_columns[i] = fit_MTS(resampled_data_x, resampled_data_y)

    if result_scaler[i] != 0:
        y_pred = predict_MTS(resampled_data_x, result_scaler[i], result_inv_C[i], select_columns[i])
    else:
        y_pred = resampled_data_x[select_columns[i]]

    threshold[i] = determine_threshold(resampled_data_y, y_pred)
    

 10%|█         | 1/10 [00:09<01:22,  9.17s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
355 0.047789998660306116 0.43411119245419666
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.8859121038961039
recall :  0.9856623376623377
precision : 0.98372044794691
g_mean :  0.9846909141093937
RS :  1.001974025974026


 20%|██        | 2/10 [00:18<01:13,  9.17s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
373 0.05167271527438706 0.5984283956204773
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9275144420109076
recall :  0.9888958726168029
precision : 0.9806773322252234
g_mean :  0.9847780289011228
RS :  1.0083804734967525


 30%|███       | 3/10 [00:27<01:04,  9.14s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
447 0.055602051711504355 0.534150118068655
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.8920185424591007
recall :  0.9766836681586343
precision : 0.9823073701842546
g_mean :  0.9794914831537941
RS :  0.9942750078068074


 40%|████      | 4/10 [00:36<00:54,  9.16s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
1777 0.07139598581279125 1.0292427080788324
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.8450689981278832
recall :  0.8446065659197499
precision : 0.9856482607638044
g_mean :  0.9124061555735422
RS :  0.8569046378322044


 50%|█████     | 5/10 [00:45<00:45,  9.16s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
446 0.06214180101427473 0.584504087514846
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9055693176326967
recall :  0.9748696558915537
precision : 0.9786454516905684
g_mean :  0.9767557293046295
RS :  0.9961418143899896


 60%|██████    | 6/10 [00:55<00:36,  9.23s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
180 0.04707046831924932 0.2793961908815195
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.8758935605646133
recall :  0.9977130977130977
precision : 0.9774926163560445
g_mean :  0.9875511056427762
RS :  1.0206860706860705


 70%|███████   | 7/10 [01:04<00:27,  9.23s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
560 0.061512970784615886 0.6288478406275305
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9013422623301346
recall :  0.9676307820820716
precision : 0.9817777306918106
g_mean :  0.9746785897823335
RS :  0.9855904771849222


 80%|████████  | 8/10 [01:13<00:18,  9.21s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
251 0.04145617792301046 0.6463003168164578
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9577856328030845
recall :  0.9965621418897802
precision : 0.9813295034878949
g_mean :  0.9889164938939643
RS :  1.015522450255235


 90%|█████████ | 9/10 [01:22<00:09,  9.20s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
556 0.057006899177766676 0.8884778332703787
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9326604231703088
recall :  0.9669094693028095
precision : 0.9840093190723287
g_mean :  0.9754219233199777
RS :  0.9826222684703434


100%|██████████| 10/10 [01:31<00:00,  9.19s/it]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
481 0.050132251374080845 0.7192746037762818
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9175100952380953
recall :  0.9751688311688311
precision : 0.9861315402395461
g_mean :  0.9806348665400988
RS :  0.9888831168831169





In [7]:
print(select_columns)
# print(result_scaler)
# print(result_inv_C)
# print(threshold)

[Int64Index([8, 7, 11], dtype='int64'), Int64Index([14, 11, 13, 15], dtype='int64'), Int64Index([1, 11, 7], dtype='int64'), Int64Index([15, 4, 1, 6, 12], dtype='int64'), Int64Index([10, 7, 12], dtype='int64'), Int64Index([12, 15, 14, 13], dtype='int64'), Int64Index([7, 1, 11], dtype='int64'), Int64Index([11, 15, 7, 9], dtype='int64'), Int64Index([13, 16, 7, 10], dtype='int64'), Int64Index([7, 15, 3, 11], dtype='int64')]


## とりあえずあとは予測だけ

In [8]:
def predict_MTSBag(X, scaler, inv_C, select_columns, threshold):
    result = []
    for i in range(K):
        Z = scaler[i].transform(X[select_columns[i]])
        MD = cal_MD(Z, inv_C[i])
        if MD > threshold[i]:
            result.append(True)
        else:
            result.append(False)
    return sum(result) / K