In [1]:
import pandas as pd
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy.stats import chi2
import matplotlib.dates as mdates
import random
from sklearn.model_selection import train_test_split

In [2]:
df_l8 = np.array([
        [1,1,1,1,1,1,1],
        [1,1,1,2,2,2,2],
        [1,2,2,1,1,2,2],
        [1,2,2,2,2,1,1],
        [2,1,2,1,2,1,2],
        [2,1,2,2,1,2,1],
        [2,2,1,1,2,2,1],
        [2,2,1,2,1,1,2]
        ])
df_l8==1

array([[ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False, False, False],
       [ True, False, False,  True,  True, False, False],
       [ True, False, False, False, False,  True,  True],
       [False,  True, False,  True, False,  True, False],
       [False,  True, False, False,  True, False,  True],
       [False, False,  True,  True, False, False,  True],
       [False, False,  True, False,  True,  True, False]])

## データの取得

In [3]:
df = pd.read_csv('../data/letter_recognition.csv', header=None)

#Aのみを判定するため，Aを１，A以外を0にした．
df[0] = df[0].apply(lambda x: 1 if x == 'A' else 0)

#Xとyを入力
X = df[range(1,17)]
y = df[0]



In [4]:
#バギング側の話
#ブートストラップサンプリングの個数
n = 10
seed = random.randint(0, n)

#使用する7つの変数をランダムに取得する
random.seed(1)
random_s = random.sample(list(X.columns), 7)
use_X = X[random_s]

X_train, X_test, y_train, y_test = train_test_split(use_X, y, test_size=0.2)


### Xとyが入力されたら自動的にMDを求めるためのscalerとinv_covと使用する変数(select_columns)を出力するアルゴリズムを作る

In [5]:
normal_idx = y_train[y_train == 0].index.to_list()
anomaly_idx = y_train[y_train == 1].index.to_list()

#通常状態の説明変数と目的変数を定義
normal_X = X_train.loc[normal_idx]
normal_y = y_train.loc[normal_idx]
anomaly_X = X_train.loc[anomaly_idx]
anomaly_y = y_train.loc[anomaly_idx]

## 必要な関数の定義

In [6]:
#各説明変数ごとに標準化
def transform_standard(fit_X, transform_X):
    scaler = StandardScaler()
    scaler.fit(fit_X)
    return scaler.transform(transform_X)

#共分散行列の逆行列
def inv_cov(x):
    #標準化後のベクトルを入力する
    #標準化した後なので相関行列と分散共分散行列は一致する
    cov = np.cov(x.T)
    return np.linalg.inv(cov)

#マハラノビス汎距離
def cal_MD(Z, inv_C, k):
    MD = []
    for i in range(len(Z)):
        _a = np.dot(Z[i], inv_C)
        _MD = np.dot(_a, Z[i].T)
        _MD = _MD / k
        MD.append(_MD)
    return MD


In [7]:
#標準化
normal_Z = transform_standard(normal_X, normal_X)
anomaly_Z = transform_standard(normal_X, anomaly_X) #異常データも正常データの平均と分散で標準化

#共分散行列の逆行列
inv_C = inv_cov(normal_Z)

#マハラノビス距離の計算
normal_MD = cal_MD(normal_Z, inv_C, len(use_X.columns))
anomaly_MD = cal_MD(anomaly_Z, inv_C, len(use_X.columns))


## ここまではあってる

### 正常データの中の異常なデータを任意に削除することもある

In [12]:
# def del_anomaly(x):
#     #マハラノビス距離がx以上の通常データを削除する
#     df_MD = pd.DataFrame(index=normal_idx, data=normal_MD)
#     del_anomaly_idx = df_MD.loc[df_MD[0] > x].index.to_list()
#     for i in del_anomaly_idx:
#         normal_idx.remove(i)
#     return normal_idx
# len(del_anomaly(3))

In [13]:
#L8直行表
df_l8 = pd.DataFrame([[1,1,1,1,1,1,1],[1,1,1,2,2,2,2],[1,2,2,1,1,2,2],[1,2,2,2,2,1,1],[2,1,2,1,2,1,2],[2,1,2,2,1,2,1],[2,2,1,1,2,2,1],[2,2,1,2,1,1,2]])
l8 = (df_l8==1).values
l8

array([[ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False, False, False],
       [ True, False, False,  True,  True, False, False],
       [ True, False, False, False, False,  True,  True],
       [False,  True, False,  True, False,  True, False],
       [False,  True, False, False,  True, False,  True],
       [False, False,  True,  True, False, False,  True],
       [False, False,  True, False,  True,  True, False]])

In [14]:
#異常データのマハラノビス距離
result = np.zeros((l8.shape[0], anomaly_Z.shape[0]))
for x, l8_row in enumerate(l8):
    result[x] = cal_MD(anomaly_Z[:, l8_row], inv_C[l8_row][:,l8_row], sum(l8_row))
result.shape

(8, 643)

In [15]:
result

array([[0.85555235, 0.71205278, 1.12871799, ..., 0.44420539, 0.90092751,
        0.66494553],
       [1.41592954, 1.04484537, 1.05235631, ..., 0.97847154, 2.21352656,
        1.62962493],
       [0.82488977, 0.15453734, 1.46687009, ..., 0.03666925, 0.6359416 ,
        0.23641237],
       ...,
       [1.14582048, 0.87710294, 1.26584073, ..., 0.03183937, 1.61980205,
        0.5776265 ],
       [1.03486981, 0.5407778 , 0.43389558, ..., 0.91115455, 1.06356933,
        1.06356933],
       [2.30714143, 0.50968757, 1.20627759, ..., 0.97519298, 1.66895072,
        0.97519298]])

In [16]:
#SN比
sn = np.zeros(l8.shape[0])
for idx, row in enumerate(result):
    sum_MD = 0
    for i in range(len(row)):
        sum_MD += 1 / row[i]
    sn[idx] = -10 * math.log10(sum_MD / len(row))
sn

array([-1.80379223,  0.35244603, -5.23489534, -5.08561283, -6.87438667,
       -5.72722409, -2.71360926,  0.17844181])

In [17]:
#変数選択
df_sn = pd.DataFrame(index=X_train.columns, columns=['SN比','残す'])
for i, clm in enumerate(X_train.columns):
    df_sn.loc[df_sn.index == clm, 'SN比'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]])
    df_sn.loc[df_sn.index == clm, '残す'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]]) > 0
df_sn
# select_v = df_sn.reset_index()

Unnamed: 0,SN比,残す
5,3.364924,True
10,-1.197281,False
14,18.935605,True
13,-6.344734,False
2,1.733693,True
16,-0.262067,False
12,-3.751844,False


In [18]:
normal_X_d = normal_X.drop(columns=df_sn[df_sn['残す'] == False].index)
anomaly_X_d = anomaly_X.drop(columns=df_sn[df_sn['残す'] == False].index)

normal_Z_d = transform_standard(normal_X_d, normal_X_d)
anomaly_Z_d = transform_standard(normal_X_d, anomaly_X_d)

#共分散行列の逆行列
inv_C_d = inv_cov(normal_Z_d)

#マハラノビス距離の計算
normal_MD = cal_MD(normal_Z_d, inv_C_d, len(normal_X_d.columns))
anomaly_MD = cal_MD(anomaly_Z_d, inv_C_d, len(normal_X_d.columns))

In [28]:
type(df_sn.index)

pandas.core.indexes.numeric.Int64Index

In [30]:
type(df_sn.index) == pd.core.indexes.numeric.Int64Index

True

In [19]:
type(df_sn['SN比'].astype(float).idxmax()) == np.int64

True

In [20]:
Z = transform_standard(normal_X_d, X_test.drop(columns=df_sn[df_sn['残す'] == False].index))
inv_C_d = inv_cov(normal_Z_d)

MD = cal_MD(Z, inv_C_d, len(normal_X_d.columns))
MD

[1.1547789329860498,
 0.6288270505010052,
 0.8189021921438929,
 0.049951496047596404,
 1.8190458504109,
 0.20050241483935252,
 1.6594621592426693,
 0.20050241483935252,
 0.8976779900440972,
 0.9092390434650318,
 0.854690226102213,
 2.2690364755034755,
 0.5270897551209449,
 0.5105275048242489,
 1.3114522428580748,
 0.1324102048909119,
 0.8822418580261077,
 0.8946744994727616,
 0.4253010350371575,
 1.5880610065337821,
 1.2999713806279714,
 1.433596469169098,
 0.6315280352049769,
 0.5270897551209449,
 6.265695676372108,
 1.1783005258487622,
 4.290365598091508,
 1.1664989031163755,
 0.6717157769922196,
 0.1818390690880627,
 1.6323897858895589,
 0.18854972014465074,
 1.4254404486579009,
 0.3221623646022154,
 0.4842399430615192,
 0.5847016313511625,
 0.34472348374532963,
 3.195601626292364,
 6.403483492655638,
 2.208379017170231,
 0.05265730338091836,
 1.4254404486579009,
 0.25281133678401657,
 0.061458549353323476,
 1.24930019307662,
 2.340040315453004,
 0.3489736484555792,
 0.0499514960475

In [21]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test.values, MD) 

0.6904816913223052

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [23]:
clf = LogisticRegression(solver="liblinear", random_state=0).fit(use_X, y)
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

0.9134700826751783

In [24]:
confusion_matrix(y_test, clf.predict(X_test))

array([[3846,    8],
       [ 137,    9]], dtype=int64)