# グラムシュミット法によってマハラノビス距離を計算する関数を作成する
## 先生に言われた方法を試す．
- Ul = Al - sigma(tlq*Uq)
- グラムシュミット法をする際にtqlを保存できれば先生に言われた方法で計算が可能！！！

In [1]:
import pandas as pd
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy.stats import chi2
import matplotlib.dates as mdates
import random
from sklearn.model_selection import train_test_split

## データの取得

In [2]:
df = pd.read_csv('../data/letter_recognition.csv', header=None)

#Aのみを判定するため，Aを１，A以外を0にした．
df[0] = df[0].apply(lambda x: 1 if x == 'A' else 0)

#Xとyを入力
X = df[range(1,17)]
y = df[0]



In [3]:
#バギング側の話
#ブートストラップサンプリングの個数
n = 10
seed = random.randint(0, n)

#使用する7つの変数をランダムに取得する
random.seed(1)
random_s = random.sample(list(X.columns), 7)
use_X = X[random_s]

X_train, X_test, y_train, y_test = train_test_split(use_X, y, test_size=0.2)


In [4]:
normal_idx = y_train[y_train == 0].index.to_list()
anomaly_idx = y_train[y_train == 1].index.to_list()

#通常状態の説明変数と目的変数を定義
normal_X = X_train.loc[normal_idx]
normal_y = y_train.loc[normal_idx]
anomaly_X = X_train.loc[anomaly_idx]
anomaly_y = y_train.loc[anomaly_idx]

In [5]:
#各説明変数ごとに標準化
def transform_standard(fit_X, transform_X):
    scaler = StandardScaler()
    scaler.fit(fit_X)
    return scaler.transform(transform_X)

#共分散行列の逆行列
def inv_cov(x):
    #標準化後のベクトルを入力する
    #標準化した後なので相関行列と分散共分散行列は一致する
    cov = np.cov(x.T)
    return np.linalg.inv(cov)

#マハラノビス汎距離
def cal_MD(Z, inv_C, k):
    MD = []
    for i in range(len(Z)):
        _a = np.dot(Z[i], inv_C)
        _MD = np.dot(_a, Z[i].T)
        _MD = _MD / k
        MD.append(_MD)
    return MD


In [6]:
#標準化
normal_Z = transform_standard(normal_X, normal_X)
anomaly_Z = transform_standard(normal_X, anomaly_X) #異常データも正常データの平均と分散で標準化

#共分散行列の逆行列
inv_C = inv_cov(normal_Z)

#マハラノビス距離の計算
normal_MD = cal_MD(normal_Z, inv_C, len(use_X.columns))
anomaly_MD = cal_MD(anomaly_Z, inv_C, len(use_X.columns))


In [7]:
normal_X.head()

Unnamed: 0,5,10,14,13,2,16,12
8954,3,6,8,3,9,11,7
886,4,7,6,9,5,8,8
12818,6,11,8,3,9,12,8
19146,4,15,7,0,10,7,13
12605,2,9,8,2,4,8,8


In [12]:
normal_A = normal_Z.T
# グラムシュミットの直交化
# tをトレーニングデータで求めた！！
def fit_gram_schmidt(normal_A):
    normal_U = np.zeros(normal_A.shape)
    t = np.zeros((normal_A.shape[0], normal_A.shape[0]))
    for l in range(normal_A.shape[0]):
        sigma = 0 
        for q in range(l):
            t[l][q] = np.dot(normal_A[l], normal_U[q]) / np.dot(normal_U[q], normal_U[q])
            sigma +=  t[l][q] * normal_U[q]
        normal_U[l] = normal_A[l] - sigma
    return normal_U, t
normal_U, t = fit_gram_schmidt(normal_A)

(7, 15377)

In [66]:
#マハラノビス汎距離
def cal_MD(Z, inv_C):
    '''
    Z:標準化したベクトル
    inv_C:標準化後の共分散行列
    '''
    MD = np.zeros(len(Z))
    for i in range(len(Z)):
        _a = np.dot(Z[i], inv_C)
        _MD = np.dot(_a, Z[i].T)
        _MD = _MD / Z.shape[1]
        MD[i] = _MD
    return MD
cal_MD(normal_Z, inv_C)

array([0.94059372, 1.75037816, 1.61109921, ..., 0.4535942 , 1.15994396,
       1.00223367])

In [67]:
#グラムシュミット法によるマハラノビス汎距離
normal_gram_vec, _ = np.linalg.qr(normal_Z)
normal_ips = np.diag(np.cov(normal_gram_vec.T))
def gram_schmidt_cal_MD(gram_vec, ips):
    '''
    Z:標準化したベクトル
    '''
    
    k = gram_vec.shape[1]
    MD = np.zeros(gram_vec.shape[0])
    
    for i, one_gram_vec in enumerate(gram_vec):
        _MD = 0
        for q, u in enumerate(one_gram_vec):
            _MD += u**2 / ips[q]
        _MD = _MD / k
        MD[i] = _MD
    return MD

gram_schmidt_cal_MD(normal_gram_vec, normal_ips)

array([0.94059372, 1.75037816, 1.61109921, ..., 0.4535942 , 1.15994396,
       1.00223367])

In [68]:
normal_ips = np.diag(np.cov(normal_U))
gram_schmidt_cal_MD(normal_U.T, normal_ips)


array([0.94059372, 1.75037816, 1.61109921, ..., 0.4535942 , 1.15994396,
       1.00223367])

## 異常データでの逆行列MDとGramschmidtMDの比較

In [70]:
conpare_df = pd.DataFrame(cal_MD(anomaly_Z, inv_C), index=anomaly_idx, columns=['InvMD'])

def create_gramMD_one_by_one():
    MD = []
    for i in range(len(anomaly_X)):
        sample = normal_X.reset_index(drop=True)
        sample = sample.append(anomaly_X.iloc[i]).reset_index(drop=True)
        sample_Z = transform_standard(normal_X, sample)
        anomaly_gram_vec, _ = np.linalg.qr(sample_Z)
        anomaly_ips = np.diag(np.cov(anomaly_gram_vec.T))
        MD.append(gram_schmidt_cal_MD(anomaly_gram_vec, anomaly_ips)[-1])
    return pd.DataFrame(MD, index=anomaly_idx, columns=['gramMD_one_by_one'])
conpare_df = pd.concat([conpare_df, create_gramMD_one_by_one()], axis=1)

conpare_df = conpare_df[['InvMD', 'gramMD_one_by_one']]
def create_gramMD_anomaly_only():
    sample_Z = transform_standard(normal_X, anomaly_X)
    anomaly_gram_vec, _ = np.linalg.qr(sample_Z)
    anomaly_ips = np.diag(np.cov(anomaly_gram_vec.T))
    return pd.DataFrame(gram_schmidt_cal_MD(anomaly_gram_vec, anomaly_ips), index=anomaly_idx, columns=['gramMD_anomaly_only'])
    
conpare_df = pd.concat([conpare_df, create_gramMD_anomaly_only()], axis=1)

def create_gramMD_all_sample():
    sample_Z = transform_standard(normal_X, X_train)
    anomaly_gram_vec, _ = np.linalg.qr(sample_Z)
    anomaly_ips = np.diag(np.cov(anomaly_gram_vec.T))
    return pd.DataFrame(gram_schmidt_cal_MD(anomaly_gram_vec, anomaly_ips), index=X_train.index, columns=['gramMD_all_sample']).loc[anomaly_idx]

conpare_df = pd.concat([conpare_df, create_gramMD_all_sample()], axis=1)
conpare_df

Unnamed: 0,InvMD,gramMD_one_by_one,gramMD_anomaly_only,gramMD_all_sample
327,0.328947,0.328919,0.366876,0.317889
234,0.436457,0.436399,0.627702,0.423556
10671,1.170488,1.169940,1.696205,1.121944
3045,1.412758,1.411942,1.658791,1.368622
1699,0.480384,0.480310,0.786520,0.471267
...,...,...,...,...
15175,0.806054,0.805811,1.239578,0.795417
19853,0.670865,0.670704,1.208708,0.659148
19597,4.870502,4.860043,8.320051,4.679555
10247,1.236171,1.235556,2.252303,1.187392


In [74]:
anomaly_A = anomaly_Z.T
anomaly_U = np.zeros(anomaly_A.shape)
for l in range(anomaly_A.shape[0]):
    sigma = 0 
    for q in range(l):
        sigma +=  t[l][q] * anomaly_U[q]
    anomaly_U[l] = anomaly_A[l] - sigma
pd.concat([conpare_df, pd.DataFrame(gram_schmidt_cal_MD(anomaly_U.T, normal_ips), index=anomaly_idx, columns=['gram'])], axis=1)

Unnamed: 0,InvMD,gramMD_one_by_one,gramMD_anomaly_only,gramMD_all_sample,gram
327,0.328947,0.328919,0.366876,0.317889,0.328947
234,0.436457,0.436399,0.627702,0.423556,0.436457
10671,1.170488,1.169940,1.696205,1.121944,1.170488
3045,1.412758,1.411942,1.658791,1.368622,1.412758
1699,0.480384,0.480310,0.786520,0.471267,0.480384
...,...,...,...,...,...
15175,0.806054,0.805811,1.239578,0.795417,0.806054
19853,0.670865,0.670704,1.208708,0.659148,0.670865
19597,4.870502,4.860043,8.320051,4.679555,4.870502
10247,1.236171,1.235556,2.252303,1.187392,1.236171


# できた！！！

## やったこと
- 今までの逆行列MDとグラムシュミットMDが異常データにおいてどのくらい差があるかを検証した．
    - gramMD_one_by_one: 通常データに異常データを一つだけくっつけて直交化を行いGramschmidtMDを求める方法 
    - gramMD_anomaly_only: 異常データだけで直交化を行いGramschmidtMDを求める方法 
    - gramMD_all_sample: 通常データに異常データをすべてくっつけて直交化を行いGramschmidtMDを求める方法

## 結果

In [None]:
conpare_df

Unnamed: 0,InvMD,gramMD_one_by_one,gramMD_anomaly_only,gramMD_all_sample
19620,2.277088,2.274877,2.795716,2.186991
10569,0.780341,0.780115,2.797163,0.791389
4225,1.086459,1.085993,0.947095,1.057798
13702,0.742296,0.742094,0.751413,0.730570
19597,4.860878,4.850460,8.147387,4.663684
...,...,...,...,...
18377,0.905111,0.904798,1.588327,0.884619
19888,0.513390,0.513304,0.854800,0.506159
3025,0.703537,0.703358,0.883006,0.674589
9063,1.104459,1.103976,0.850638,1.053029


#### RMSE

In [None]:
from sklearn.metrics import mean_squared_error
print('gramMD_one_by_one:', np.sqrt(mean_squared_error(conpare_df['InvMD'], conpare_df['gramMD_one_by_one'])))
print('gramMD_anomaly_only:', np.sqrt(mean_squared_error(conpare_df['InvMD'], conpare_df['gramMD_anomaly_only'])))
print('gramMD_all_sample:', np.sqrt(mean_squared_error(conpare_df['InvMD'], conpare_df['gramMD_all_sample'])))

gramMD_one_by_one: 0.0012169326516005816
gramMD_anomaly_only: 0.6993212728913734
gramMD_all_sample: 0.039443899340899154


#### 計算時間

In [None]:
print('gramMD_one_by_one:', '1m 03s')
print('gramMD_anomaly_only:', '0.8s')
print('gramMD_all_sample:', '0.1s')

gramMD_one_by_one: 1m 03s
gramMD_anomaly_only: 0.8s
gramMD_all_sample: 0.1s


In [None]:
len(X_train)

16000

In [None]:
len(X_test)

4000

# できた！！！！！

In [None]:
#L8直行表
l8 = np.array([
    [1,1,1,1,1,1,1],
    [1,1,1,2,2,2,2],
    [1,2,2,1,1,2,2],
    [1,2,2,2,2,1,1],
    [2,1,2,1,2,1,2],
    [2,1,2,2,1,2,1],
    [2,2,1,1,2,2,1],
    [2,2,1,2,1,1,2]
    ])
l8 = (l8 == 1)
l8

array([[ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True, False, False, False, False],
       [ True, False, False,  True,  True, False, False],
       [ True, False, False, False, False,  True,  True],
       [False,  True, False,  True, False,  True, False],
       [False,  True, False, False,  True, False,  True],
       [False, False,  True,  True, False, False,  True],
       [False, False,  True, False,  True,  True, False]])

In [None]:
#異常データのマハラノビス距離
result = np.zeros((l8.shape[0], anomaly_Z.shape[0]))
for i, l8_row in enumerate(l8):
    result[i] = cal_MD(anomaly_Z[:, l8_row], inv_C[l8_row][:,l8_row])
result


array([[0.54897735, 0.80594764, 0.25822561, ..., 0.81127319, 1.34173595,
        1.12818194],
       [0.81414475, 1.66691331, 0.81414475, ..., 1.17608226, 0.72392134,
        3.02201274],
       [0.78876927, 0.34304566, 0.16598893, ..., 0.3258575 , 0.24768548,
        1.7340828 ],
       ...,
       [0.33831351, 0.30805763, 0.11758412, ..., 1.10111187, 0.24692022,
        2.20757785],
       [0.31521762, 0.9182262 , 0.43578507, ..., 0.37073917, 0.74006328,
        1.04321135],
       [0.48867644, 1.11251288, 0.36091576, ..., 1.24554322, 2.67686101,
        3.2350077 ]])

In [None]:
#異常データのマハラノビス距離
result = np.zeros((l8.shape[0], anomaly_Z.shape[0]))

for i, l8_row in enumerate(l8):
    _gram_vec, _ = np.linalg.qr(anomaly_Z[:, l8_row])
    result[i] = gram_schmidt_cal_MD(_gram_vec, normal_ips[l8_row])
result


array([[16.39907068, 13.18763491,  6.47958334, ..., 18.24982226,
        50.79338466, 21.38352322],
       [ 9.43740552, 20.90304841,  9.43740552, ..., 37.0755016 ,
        16.4307721 , 32.49439143],
       [24.94408966, 12.37357899,  7.20399173, ...,  8.50702906,
        18.3343913 , 43.59116868],
       ...,
       [ 9.66673412, 16.73425211,  5.76037926, ..., 34.17653701,
        16.73557682, 39.58413969],
       [ 3.65433844, 11.4164381 ,  6.70474677, ...,  5.4139151 ,
        21.75214394, 12.53635637],
       [ 4.86159423, 10.49390799,  3.72307593, ..., 22.83763947,
        58.70907407, 45.4223399 ]])

In [None]:
 #SN比
sn = np.zeros(l8.shape[0])
for idx, row in enumerate(result):
    sum_MD = 0
    for i in range(len(row)):
        sum_MD += 1 / row[i]
    sn[idx] = -10 * math.log10(sum_MD / len(row))
sn

array([11.87847325, 11.75508921, 10.78825781,  8.59781842,  9.72059755,
        9.9537332 ,  9.3703629 , 10.79446388])

In [None]:
X_train

Unnamed: 0,5,10,14,13,2,16,12
9877,1,7,7,2,0,9,8
8430,4,6,8,2,7,8,6
322,5,11,6,4,10,9,7
14003,3,14,9,2,10,11,6
3407,3,12,9,2,13,6,5
...,...,...,...,...,...,...,...
450,2,10,9,5,4,7,6
18925,0,7,10,3,0,8,8
5789,8,6,8,8,8,6,8
1555,1,7,8,2,3,8,7


In [None]:
# SN比を利用し，不要と思われる変数を削除する
#変数選択
df_sn = pd.DataFrame(index=X_train.columns, columns=['SN比'])
for i, clm in enumerate(X_train.columns):
    df_sn.loc[df_sn.index == clm, 'SN比'] = sum(sn[l8.T[i]]) - sum(sn[~l8.T[i]])
#使用した変数を保存
select_columns = df_sn[df_sn['SN比'] > 0].index
df_sn
    

Unnamed: 0,SN比
5,3.180481
10,3.75699
14,4.737982
13,0.656587
2,3.97106
16,-0.87609
12,-3.258021


In [None]:
df_sn.loc[select_columns]

Unnamed: 0,SN比
5,3.180481
10,3.75699
14,4.737982
13,0.656587
2,3.97106


In [None]:
weight = (df_sn[df_sn['SN比'] > 0] / df_sn[df_sn['SN比'] > 0].sum()).values
weight

array([[0.19508443552654312],
       [0.23044636152888157],
       [0.2906184784888307],
       [0.040273739076915734],
       [0.24357698537882888]], dtype=object)

In [None]:
weight[0] = weight[0] * 2 + 5

In [None]:
weight

array([[5.390168871053087],
       [0.23044636152888157],
       [0.2906184784888307],
       [0.040273739076915734],
       [0.24357698537882888]], dtype=object)

In [None]:
weight[0] * 2 + 5

array([15.780337742106173], dtype=object)

In [None]:
select_columns

Int64Index([5, 10, 14, 13, 2], dtype='int64')

## このSN比を重みとして加重マハラノビス距離を計算する

In [None]:
result_scaler = StandardScaler()
result_scaler.fit(X_train[select_columns][y == 0])
result_Z = result_scaler.transform(X_train[select_columns][y == 0])
result_Z

  result_scaler.fit(X_train[select_columns][y == 0])
  result_Z = result_scaler.transform(X_train[select_columns][y == 0])


array([[-1.14824355, -0.5155805 , -0.94884703, -0.44846422, -2.1298689 ],
       [ 0.21215746, -0.91363479, -0.28023823, -0.44846422, -0.01223632],
       [ 0.66562446,  1.07663663, -1.61745583,  0.39931897,  0.8953205 ],
       ...,
       [ 2.02602547, -0.91363479, -0.28023823,  2.09488534,  0.29028262],
       [-1.14824355, -0.5155805 , -0.28023823, -0.44846422, -1.22231208],
       [-1.14824355, -0.5155805 , -0.28023823, -1.29624741,  0.8953205 ]])

In [None]:
weight

array([[5.390168871053087],
       [0.23044636152888157],
       [0.2906184784888307],
       [0.040273739076915734],
       [0.24357698537882888]], dtype=object)

In [None]:
#グラムシュミット法による加重マハラノビス汎距離
def weighted_gram_schmidt_cal_MD(Z, weight, ips):
    '''
    Z:標準化したベクトル
    '''
    gram_vec, _ = np.linalg.qr(Z)
    k = gram_vec.shape[1]
    MD = np.zeros(len(Z))
    
    for i, one_gram_vec in enumerate(gram_vec):
        _MD = 0
        for q, u in enumerate(one_gram_vec):
            _MD += weight[q] * (u**2 / ips[q])
        _MD = _MD / k
        MD[i] = _MD
    return MD
Z = result_scaler.transform(X_test[select_columns])
MD = weighted_gram_schmidt_cal_MD(Z, weight, normal_ips)
MD

array([10.73647859,  2.90167245,  3.14508558, ...,  2.17492582,
        2.96060483,  2.60504555])

In [None]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test.values, weighted_gram_schmidt_cal_MD(Z, weight, normal_ips)))
print(roc_auc_score(y_test.values, gram_schmidt_cal_MD(Z, normal_ips)))

0.5420499930636176
0.5439766645492446


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
clf = LogisticRegression(solver="liblinear", random_state=0).fit(use_X, y)
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

0.8606407757065982

In [None]:
confusion_matrix(y_test, clf.predict(X_test))

array([[3847,   11],
       [ 136,    6]], dtype=int64)