In [3]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np

import math
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.utils import resample

from tqdm import tqdm 

import tensorflow as tf



In [5]:
n_estimators = 10
max_samples = 0.5
select_data = 'abalone'

In [6]:
if select_data == 'letter':
    # データの取得
    df = pd.read_csv('../data/letter_recognition.csv', header=None)

    # Aのみを判定するため，Aを0，A以外を1にした．
    # 少数派のAを正常，その他を異常データと定義
    df[0] = df[0].apply(lambda x: 0 if x == 'A' else 1)

    #Xとyを入力
    X = df[range(1,17)]
    y = df[0]

elif select_data == 'wine':
    
    dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

    # ファイルのダウンロード
    dataset_path = tf.keras.utils.get_file('wine.data', dataset_url)

    print(dataset_path)

    column_names = ['Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline' 
    ]

    raw_data = pd.read_csv(dataset_path, names=column_names)
    raw_data['y'] = raw_data.index
    raw_data = raw_data.reset_index(drop=True)

    raw_data['y'] = raw_data['y'].apply(lambda x: 0 if x == 3 else 1)

    X = raw_data.drop('y', axis=1)
    y = raw_data['y']

elif select_data == 'abalone':

    dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"

    # ファイルのダウンロード
    dataset_path = tf.keras.utils.get_file('abalone.data', dataset_url)

    print(dataset_path)

    raw_data = pd.read_csv(dataset_path, names=range(8)).reset_index(drop=True)

    raw_data[7] = raw_data[7].apply(lambda x: 1 if x > 4 else 0)


    X = raw_data.drop(7, axis=1)
    y = raw_data[7]

else:
    print('そのデータはありません')

C:\Users\baseb\.keras\datasets\abalone.data


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
# 必要な関数の定義

# 共分散行列の逆行列
def inv_cov(Z):
    #標準化後のベクトルを入力する
    #標準化した後なので相関行列と分散共分散行列は一致する
    c = np.cov(Z.T)
    return np.linalg.inv(c)

#マハラノビス汎距離
def cal_MD(Z, inv_C):
    '''
    Z:標準化したベクトル
    inv_C:標準化後の共分散行列
    '''
    MD = np.zeros(len(Z))
    for i in range(len(Z)):
        _a = np.dot(Z[i], inv_C)
        _MD = np.dot(_a, Z[i].T)
        _MD = _MD / Z.shape[1]
        MD[i] = _MD
    return MD

# MTSを実行
def fit_MTS(X, y):
    
    # 正常データのみを使用して標準化
    scaler = StandardScaler()
    scaler.fit(X[y == 0])
    normal_Z = scaler.transform(X[y == 0])
    anomaly_Z = scaler.transform(X[y == 1])

    # 正常データのみを使用して共分散行列を計算
    inv_C = inv_cov(normal_Z)

    # いったん飛ばす，削除の基準は？削除しない方法もあるっぽい？
        #１度目の仮のマハラノビス距離を計算
        # MD_1st = cal_MD(normal_Z, inv_C)
        # もしもマハラノビス距離が余りにも大きいサンプルがあれば任意で削除する
        # 削除後のデータを使用して標準化と共分散行列を計算

    # 異常データと直交表を用いてSN比を計算
    #L8直行表
    l8 = np.array([
        [1,1,1,1,1,1,1],
        [1,1,1,2,2,2,2],
        [1,2,2,1,1,2,2],
        [1,2,2,2,2,1,1],
        [2,1,2,1,2,1,2],
        [2,1,2,2,1,2,1],
        [2,2,1,1,2,2,1],
        [2,2,1,2,1,1,2]
        ])
    l8 = (l8 == 1)

    #異常データのマハラノビス距離
    result = np.zeros((l8.shape[0], anomaly_Z.shape[0]))
    for i, l8_row in enumerate(l8):
        result[i] = cal_MD(anomaly_Z[:, l8_row], inv_C[l8_row][:,l8_row])

    #SN比
    sn = np.zeros(l8.shape[0])
    for idx, row in enumerate(result):
        sum_MD = 0
        for i in range(len(row)):
            sum_MD += 1 / row[i]
        sn[idx] = -10 * math.log10(sum_MD / len(row))
        
    # SN比を利用し，不要と思われる変数を削除する
    #変数選択
    df_sn = pd.DataFrame(index=X.columns, columns=['SN比','残す'])
    for i, clm in enumerate(X.columns):
        df_sn.loc[df_sn.index == clm, 'SN比'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]])
        df_sn.loc[df_sn.index == clm, '残す'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]]) > 0
    #使用した変数を保存
    select_columns = df_sn[df_sn['残す']].index
    
    if len(select_columns) > 1:
        # 選択変数でのスケーラーと共分散行列を計算
        result_scaler = StandardScaler()
        result_scaler.fit(X[select_columns][y == 0])
        result_Z = result_scaler.transform(X[select_columns][y == 0])
        result_inv_C = inv_cov(result_Z)
    else:
        select_columns = df_sn['SN比'].astype(float).idxmax()
        result_scaler = 0
        result_inv_C = 0

    # 単位空間のスケーラーと共分散行列と選択した変数を出力
    return result_scaler, result_inv_C, select_columns

# 新しいデータのマハラノビス距離を計算する
def predict_MTS(X, scaler, inv_C, select_columns):
    Z = scaler.transform(X[select_columns])
    MD = cal_MD(Z, inv_C)
    return MD

# 閾値をジニ係数が最小になるように決定する
def determine_threshold(y_true, y_pred):
    df_pred = pd.DataFrame(y_true)
    df_pred['pred'] = y_pred
    df_pred = df_pred.sort_values('pred').reset_index(drop=True)

    min_gini = np.inf
    threshold = 0
    for i in range(len(df_pred)):
        
        neg = df_pred.iloc[:i+1]
        pos = df_pred.iloc[i:]

        p_neg = sum(neg[y_true.name]) / len(neg)
        gini_neg = 1 - ( p_neg ** 2 + ( 1 - p_neg ) ** 2 )

        p_pos = sum(pos[y_true.name]) / len(pos)
        gini_pos = 1 - ( p_pos ** 2 + ( 1 - p_pos ) ** 2 )

        gini_split = (len(neg) / len(df_pred) * gini_neg) + (len(pos) / len(df_pred) * gini_pos)

        if min_gini > gini_split:
            min_gini = gini_split
            threshold = df_pred.iloc[i]['pred']
            threshold_idx = i

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('Best paramater')
    print(threshold_idx, min_gini, threshold)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

    print('AUC : ', roc_auc_score(y_true.values, y_pred))

    recall = df_pred.iloc[threshold_idx + 1:][y_true.name].sum() / df_pred[y_true.name].sum()
    print('recall : ', recall)

    precision = df_pred.iloc[threshold_idx + 1:][y_true.name].mean()
    print('precision :', precision)

    g_mean = np.sqrt(recall * precision)
    print('g_mean : ', g_mean)

    RS = recall / precision
    print('RS : ', RS)
    return threshold

In [9]:

# K:再標本化の回数 SIZE:再標本化されたもののサンプルサイズ
K = n_estimators
SIZE = int(len(X) * max_samples)

# 予測に必要なパラメータ
select_columns = [0] * K
result_scaler = [0] * K
result_inv_C = [0] * K
threshold = [0] * K

for i in tqdm(range(K)):
    # bootstrap sampling
    resampled_data_x, resampled_data_y = resample(X_train, y_train, n_samples = SIZE)
    random_s = random.sample(list(resampled_data_x.columns), 7)
    resampled_data_x = resampled_data_x[random_s]

    result_scaler[i], result_inv_C[i], select_columns[i] = fit_MTS(resampled_data_x, resampled_data_y)

    if result_scaler[i] != 0:
        y_pred = predict_MTS(resampled_data_x, result_scaler[i], result_inv_C[i], select_columns[i])
    else:
        y_pred = resampled_data_x[select_columns[i]]

    threshold[i] = determine_threshold(resampled_data_y, y_pred)
    

 10%|█         | 1/10 [00:00<00:06,  1.37it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
54 0.022349285415097658 0.0895
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.990579760741051
recall :  0.9887585532746823
precision : 0.9950811608460404
g_mean :  0.991914819422022
RS :  0.9936461388074291


 20%|██        | 2/10 [00:01<00:05,  1.38it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
48 0.020165729126899807 0.25
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9900095424288973
recall :  0.9916911045943304
precision : 0.9950956351152526
g_mean :  0.9933919113645135
RS :  0.9965786901270772


 30%|███       | 3/10 [00:02<00:05,  1.38it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
39 0.02078996342482738 0.17
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9879479507341173
recall :  0.9917194349732099
precision : 0.994140625
g_mean :  0.9929292919986367
RS :  0.9975645396980029


 40%|████      | 4/10 [00:02<00:04,  1.36it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
42 0.02049967167863024 0.0715
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9912904808635918
recall :  0.9960745829244357
precision : 0.9926650366748166
g_mean :  0.9943683484451512
RS :  1.0034347399411188


 50%|█████     | 5/10 [00:03<00:03,  1.37it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
42 0.016037828248424098 0.19
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9950873286479717
recall :  0.9926936190940088
precision : 0.9965770171149144
g_mean :  0.9946334228376383
RS :  0.9961032635168047


 60%|██████    | 6/10 [00:04<00:02,  1.36it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
37 0.020501412410631248 0.64671990156695
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.98488824101069
recall :  0.9902818270165209
precision : 0.9941463414634146
g_mean :  0.9922122027803227
RS :  0.9961127308066083


 70%|███████   | 7/10 [00:05<00:02,  1.35it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
47 0.016180707689053823 0.25
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9925113491486649
recall :  0.9931607230092818
precision : 0.9965686274509804
g_mean :  0.9948632160089061
RS :  0.9965803615046409


 80%|████████  | 8/10 [00:05<00:01,  1.35it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
29 0.015566506450514411 0.0155
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9936565935358415
recall :  0.9956246961594555
precision : 0.9951409135082604
g_mean :  0.9953827754424449
RS :  1.0004861448711717


 90%|█████████ | 9/10 [00:06<00:00,  1.35it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
45 0.014178259214331336 0.185
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9950076761244101
recall :  0.9951100244498777
precision : 0.9965719882468168
g_mean :  0.9958407380652556
RS :  0.9985330073349633


100%|██████████| 10/10 [00:07<00:00,  1.36it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Best paramater
57 0.015389842954095238 0.027
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AUC :  0.9953229911197748
recall :  0.9883040935672515
precision : 0.9990147783251232
g_mean :  0.9936450044924996
RS :  0.9892787524366471





In [10]:
print(select_columns)
print(result_scaler)
print(result_inv_C)
print(threshold)

[3, 0, 1, 3, 1, Int64Index([1, 4], dtype='int64'), 0, 5, 1, 6]
[0, 0, 0, 0, 0, StandardScaler(), 0, 0, 0, 0]
[0, 0, 0, 0, 0, array([[ 3.11232901, -2.58418404],
       [-2.58418404,  3.11232901]]), 0, 0, 0, 0]
[0.0895, 0.25, 0.17, 0.0715, 0.19, 0.64671990156695, 0.25, 0.0155, 0.185, 0.027]


## とりあえずあとは予測だけ
- 新しいデータをそれぞれの弱学習器に入れる
- 学習時に用いた変数を用いてマハラノビス距離を計算する
- 学習時に求めた閾値を超えたらTrue，超えなかったらFalseを各弱学習器で出力
- それらを集計（投票）して各データの出力を求める

In [11]:
def predict_MTSBag(X, scaler, inv_C, select_columns, threshold):
    result = np.ndarray((K, len(X_test)), dtype=bool)
    for i in range(K):
        if scaler[i] != 0:
            Z = scaler[i].transform(X[select_columns[i]])
            MD = cal_MD(Z, inv_C[i])
            result[i] = MD > threshold[i]
        else:
            result[i] = X[select_columns[i]] > threshold[i]
    return result.sum(axis=0) / K, result.sum(axis=0) > (K/2)

In [12]:
y_proba, y_pred = predict_MTSBag(X_test, result_scaler, result_inv_C, select_columns, threshold)

print('AUC : ', roc_auc_score(y_test, y_proba))

from sklearn.metrics import confusion_matrix
print('confusion_matrix : ')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import accuracy_score
print('accuracy_score : ', accuracy_score(y_test, y_pred))

from sklearn.metrics import precision_score
print('precision_score : ', precision_score(y_test, y_pred))

from sklearn.metrics import recall_score
print('recall_score : ', recall_score(y_test, y_pred))

g_mean = np.sqrt(recall_score(y_test, y_pred) * precision_score(y_test, y_pred))
print('g_mean : ', g_mean)

RS = recall_score(y_test, y_pred) / precision_score(y_test, y_pred)
print('RS : ', RS)



AUC :  0.9582216808769792
confusion_matrix : 
[[ 13   2]
 [ 16 805]]
accuracy_score :  0.9784688995215312
precision_score :  0.9975216852540273
recall_score :  0.9805115712545676
g_mean :  0.9889800579227725
RS :  0.9829476248477466


In [13]:
import lightgbm
clf = lightgbm.LGBMClassifier()

In [14]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print('AUC : ', roc_auc_score(y_test, y_proba))

from sklearn.metrics import confusion_matrix
print('confusion_matrix : ')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import accuracy_score
print('accuracy_score : ', accuracy_score(y_test, y_pred))

from sklearn.metrics import precision_score
print('precision_score : ', precision_score(y_test, y_pred))

from sklearn.metrics import recall_score
print('recall_score : ', recall_score(y_test, y_pred))

g_mean = np.sqrt(recall_score(y_test, y_pred) * precision_score(y_test, y_pred))
print('g_mean : ', g_mean)

RS = recall_score(y_test, y_pred) / precision_score(y_test, y_pred)
print('RS : ', RS)





AUC :  0.9900933820544051
confusion_matrix : 
[[ 10   5]
 [  7 814]]
accuracy_score :  0.9856459330143541
precision_score :  0.9938949938949939
recall_score :  0.9914738124238733
g_mean :  0.9926836649940765
RS :  0.9975639464068209
