In [104]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [105]:
import pandas as pd
import numpy as np

import math
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.utils import resample

from tqdm import tqdm 


In [106]:
n_estimators = 10
max_samples = 0.5
select_data = 'abalone'

In [107]:
if select_data == 'letter':
    # データの取得
    df = pd.read_csv('../data/letter_recognition.csv', header=None)

    # Aのみを判定するため，Aを0，A以外を1にした．
    # 少数派のAを正常，その他を異常データと定義
    df[0] = df[0].apply(lambda x: 0 if x == 'A' else 1)

    #Xとyを入力
    X = df[range(1,17)]
    y = df[0]

elif select_data == 'wine':

    import tensorflow as tf

    dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

    # ファイルのダウンロード
    dataset_path = tf.keras.utils.get_file('wine.data', dataset_url)

    print(dataset_path)

    column_names = ['Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline' 
    ]

    raw_data = pd.read_csv(dataset_path, names=column_names)
    raw_data['y'] = raw_data.index
    raw_data = raw_data.reset_index(drop=True)

    raw_data['y'] = raw_data['y'].apply(lambda x: 0 if x == 3 else 1)

    X = raw_data.drop('y', axis=1)
    y = raw_data['y']

elif select_data == 'abalone':

    dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"

    # ファイルのダウンロード
    dataset_path = tf.keras.utils.get_file('abalone.data', dataset_url)

    print(dataset_path)

    raw_data = pd.read_csv(dataset_path, names=range(8)).reset_index(drop=True)

    raw_data[7] = raw_data[7].apply(lambda x: 1 if x > 4 else 0)


    X = raw_data.drop(7, axis=1)
    y = raw_data[7]

else:
    print('そのデータはありません')

C:\Users\baseb\.keras\datasets\abalone.data


In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [109]:
# 必要な関数の定義

# 共分散行列の逆行列
def inv_cov(Z):
    #標準化後のベクトルを入力する
    #標準化した後なので相関行列と分散共分散行列は一致する
    c = np.cov(Z.T)
    return np.linalg.inv(c)

#マハラノビス汎距離
def cal_MD(Z, inv_C):
    '''
    Z:標準化したベクトル
    inv_C:標準化後の共分散行列
    '''
    MD = []
    for i in range(len(Z)):
        _a = np.dot(Z[i], inv_C)
        _MD = np.dot(_a, Z[i].T)
        _MD = _MD / Z.shape[1]
        MD.append(_MD)
    return MD

# MTSを実行
def fit_MTS(X, y):
    
    # 正常データのみを使用して標準化
    scaler = StandardScaler()
    scaler.fit(X[y == 0])
    normal_Z = scaler.transform(X[y == 0])
    anomaly_Z = scaler.transform(X[y == 1])

    # 正常データのみを使用して共分散行列を計算
    inv_C = inv_cov(normal_Z)

    # いったん飛ばす，削除の基準は？削除しない方法もあるっぽい？
        #１度目の仮のマハラノビス距離を計算
        # MD_1st = cal_MD(normal_Z, inv_C)
        # もしもマハラノビス距離が余りにも大きいサンプルがあれば任意で削除する
        # 削除後のデータを使用して標準化と共分散行列を計算

    # 異常データと直交表を用いてSN比を計算
    #L8直行表
    df_l8 = pd.DataFrame([[1,1,1,1,1,1,1],[1,1,1,2,2,2,2],[1,2,2,1,1,2,2],[1,2,2,2,2,1,1],[2,1,2,1,2,1,2],[2,1,2,2,1,2,1],[2,2,1,1,2,2,1],[2,2,1,2,1,1,2]])
    l8 = (df_l8==1).values

    #異常データのマハラノビス距離
    result = np.zeros((l8.shape[0], anomaly_Z.shape[0]))
    for x, l8_row in enumerate(l8):
        result[x] = cal_MD(anomaly_Z[:, l8_row], inv_C[l8_row][:,l8_row])

    #SN比
    sn = np.zeros(l8.shape[0])
    for idx, row in enumerate(result):
        sum_MD = 0
        for i in range(len(row)):
            sum_MD += 1 / row[i]
        sn[idx] = -10 * math.log10(sum_MD / len(row))
        
    # SN比を利用し，不要と思われる変数を削除する
    #変数選択
    df_sn = pd.DataFrame(index=X.columns, columns=['SN比','残す'])
    for i, clm in enumerate(X.columns):
        df_sn.loc[df_sn.index == clm, 'SN比'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]])
        df_sn.loc[df_sn.index == clm, '残す'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]]) > 0
    select_columns = df_sn[df_sn['残す']].index
    
    if len(select_columns) > 1:
        # 選択変数でのスケーラーと共分散行列を計算
        result_scaler = StandardScaler()
        result_scaler.fit(X[select_columns][y == 0])
        result_Z = result_scaler.transform(X[select_columns][y == 0])
        result_inv_C = inv_cov(result_Z)
    else:
        select_columns = df_sn['SN比'].astype(float).idxmax()
        result_scaler = 0
        result_inv_C = 0

    # 単位空間のスケーラーと共分散行列と選択した変数を出力
    return result_scaler, result_inv_C, select_columns

# 新しいデータのマハラノビス距離を計算する
def predict_MTS(X, scaler, inv_C, select_columns):
    Z = scaler.transform(X[select_columns])
    MD = cal_MD(Z, inv_C)
    return MD

def determine_threshold(y_true, y_pred):
    df_pred = pd.DataFrame(y_true)
    df_pred['pred'] = y_pred
    df_pred = df_pred.sort_values('pred').reset_index(drop=True)


    min_gini = np.inf
    threshold = 0
    for i in range(len(df_pred)):
        
        neg = df_pred.iloc[:i+1]
        pos = df_pred.iloc[i:]

        p_neg = sum(neg[y_true.name]) / len(neg)
        gini_neg = 1 - ( p_neg ** 2 + ( 1 - p_neg ) ** 2 )

        p_pos = sum(pos[y_true.name]) / len(pos)
        gini_pos = 1 - ( p_pos ** 2 + ( 1 - p_pos ) ** 2 )

        gini_split = (len(neg) / len(df_pred) * gini_neg) + (len(pos) / len(df_pred) * gini_pos)

        if min_gini > gini_split:
            min_gini = gini_split
            threshold = df_pred.iloc[i]['pred']
            threshold_idx = i
        # print(i, gini_split, df_pred.iloc[i]['pred'])

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('Best paramater')
    print(threshold_idx, min_gini, threshold)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    print('AUC : ', roc_auc_score(y_true.values, y_pred))

    recall = df_pred.iloc[threshold_idx + 1:][y_true.name].sum() / df_pred[y_true.name].sum()
    print('recall : ', recall)

    precision = df_pred.iloc[threshold_idx + 1:][y_true.name].mean()
    print('precision :', precision)

    g_mean = np.sqrt(recall * precision)
    print('g_mean : ', g_mean)

    RS = recall / precision
    print('RS : ', RS)
    return threshold

In [110]:

# K:再標本化の回数 SIZE:再標本化されたもののサンプルサイズ
K = n_estimators
SIZE = int(len(X) * max_samples)

# 予測に必要なパラメータ
select_columns = [0] * K
result_scaler = [0] * K
result_inv_C = [0] * K
threshold = [0] * K

for i in tqdm(range(K)):
    # bootstrap sampling
    resampled_data_x, resampled_data_y = resample(X_train, y_train, n_samples = SIZE)
    random_s = random.sample(list(resampled_data_x.columns), 7)
    resampled_data_x = resampled_data_x[random_s]

    result_scaler[i], result_inv_C[i], select_columns[i] = fit_MTS(resampled_data_x, resampled_data_y)

    if result_scaler[i] != 0:
        y_pred = predict_MTS(resampled_data_x, result_scaler[i], result_inv_C[i], select_columns[i])
    else:
        y_pred = resampled_data_x[select_columns[i]]

    threshold[i] = determine_threshold(resampled_data_y, y_pred)
    

 10%|█         | 1/10 [00:00<00:06,  1.35it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
48 0.02007691007437465 0.015
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9890626853084192
recall :  0.9897610921501706
precision : 0.9955860716037274
g_mean :  0.9926693092666878
RS :  0.9941491955143832


 20%|██        | 2/10 [00:01<00:05,  1.38it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
56 0.015364299261821035 0.175
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9949544966484931
recall :  0.9926506614404703
precision : 0.9975381585425899
g_mean :  0.9950914093134412
RS :  0.9951004409603135


 30%|███       | 3/10 [00:02<00:06,  1.15it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
42 0.018219244814686765 0.067
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.993328857421875
recall :  0.99365234375
precision : 0.9951100244498777
g_mean :  0.9943809169949617
RS :  0.99853515625


 40%|████      | 4/10 [00:03<00:05,  1.12it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
61 0.02304292496239585 1.0717866048011941
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9860134994085311
recall :  0.9824646858256211
precision : 0.9955577492596249
g_mean :  0.9889895506766591
RS :  0.9868485143692157


 50%|█████     | 5/10 [00:04<00:04,  1.06it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
51 0.018231453376316424 0.0225
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9902893066406249
recall :  0.99072265625
precision : 0.9965618860510805
g_mean :  0.9936379817951991
RS :  0.994140625


 60%|██████    | 6/10 [00:05<00:03,  1.13it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
27 0.01690078229756005 0.165
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9875136794747081
recall :  0.9961089494163424
precision : 0.9941747572815534
g_mean :  0.9951413834284932
RS :  1.0019455252918288


 70%|███████   | 7/10 [00:06<00:02,  1.19it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
35 0.024529914171019736 0.05
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.984364569961489
recall :  0.9926829268292683
precision : 0.9917153996101364
g_mean :  0.9921990452860996
RS :  1.0009756097560976


 80%|████████  | 8/10 [00:06<00:01,  1.23it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
17 0.018187259894025993 0.0145
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9903871072238418
recall :  0.9980563654033042
precision : 0.9922705314009662
g_mean :  0.9951592435720293
RS :  1.0058309037900874


 90%|█████████ | 9/10 [00:07<00:00,  1.26it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
42 0.015754330432137892 0.015
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9903398309470415
recall :  0.9912493923189111
precision : 0.9970660146699266
g_mean :  0.9941534494952996
RS :  0.9941662615459408


100%|██████████| 10/10 [00:08<00:00,  1.20it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
37 0.016529496036194895 0.046
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9948347969068356
recall :  0.9960918417195896
precision : 0.9946341463414634
g_mean :  0.9953627271836434
RS :  1.0014655593551538





In [111]:
print(select_columns)
print(result_scaler)
print(result_inv_C)
print(threshold)

[5, 1, 3, Int64Index([5, 1], dtype='int64'), 6, 1, 2, 4, 5, 3]
[0, 0, 0, StandardScaler(), 0, 0, 0, 0, 0, 0]
[0, 0, 0, array([[ 8.19243491, -7.69139938],
       [-7.69139938,  8.19243491]]), 0, 0, 0, 0, 0, 0]
[0.015, 0.175, 0.067, 1.0717866048011941, 0.0225, 0.165, 0.05, 0.0145, 0.015, 0.046]


## とりあえずあとは予測だけ
- 新しいデータをそれぞれの弱学習器に入れる
- 学習時に用いた変数を用いてマハラノビス距離を計算する
- 学習時に求めた閾値を超えたらTrue，超えなかったらFalseを各弱学習器で出力
- それらを集計（投票）して各データの出力を求める

In [112]:
def predict_MTSBag(X, scaler, inv_C, select_columns, threshold):
    result = np.ndarray((K, len(X_test)), dtype=bool)
    for i in range(K):
        if scaler[i] != 0:
            Z = scaler[i].transform(X[select_columns[i]])
            MD = cal_MD(Z, inv_C[i])
            result[i] = MD > threshold[i]
        else:
            result[i] = X[select_columns[i]] > threshold[i]
    return result.sum(axis=0) > (K/2)

In [113]:
y_pred = predict_MTSBag(X_test, result_scaler, result_inv_C, select_columns, threshold)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

from sklearn.metrics import precision_score
print(precision_score(y_test, y_pred))

from sklearn.metrics import recall_score
print(recall_score(y_test, y_pred))

g_mean = np.sqrt(recall_score(y_test, y_pred) * precision_score(y_test, y_pred))
print('g_mean : ', g_mean)

RS = recall_score(y_test, y_pred) / precision_score(y_test, y_pred)
print('RS : ', RS)



[[  9   4]
 [  7 816]]
0.9868421052631579
0.9951219512195122
0.991494532199271
g_mean :  0.9933065858563589
RS :  0.9963547995139733


In [114]:
import lightgbm
clf = lightgbm.LGBMClassifier()

In [115]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

from sklearn.metrics import precision_score
print(precision_score(y_test, y_pred))

from sklearn.metrics import recall_score
print(recall_score(y_test, y_pred))

g_mean = np.sqrt(recall_score(y_test, y_pred) * precision_score(y_test, y_pred))
print('g_mean : ', g_mean)

RS = recall_score(y_test, y_pred) / precision_score(y_test, y_pred)
print('RS : ', RS)




[[  5   8]
 [  7 816]]
0.9820574162679426
0.9902912621359223
0.991494532199271
g_mean :  0.9908927145218507
RS :  1.0012150668286757
