# グラムシュミット直交化法完成

In [1]:
import pandas as pd
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy.stats import chi2
import matplotlib.dates as mdates
import random
from sklearn.model_selection import train_test_split

In [2]:
# Load the letter-recognition data; the file is read without a header row,
# so columns are addressed by integer position.
df = pd.read_csv('../data/letter_recognition.csv', header=None)

# Only the letter 'A' is to be detected, so 'A' is encoded as 1 and every other letter as 0.
df[0] = df[0].apply(lambda x: 1 if x == 'A' else 0)

# Define X (columns 1..16) and y (column 0).
X = df[range(1,17)]
y = df[0]
# Bagging-related settings.
# Number of bootstrap samples.
n = 10
# NOTE(review): `seed` is drawn before random.seed(1) below and is not used
# anywhere in the visible code — confirm whether it was meant to be passed
# to train_test_split.
seed = random.randint(0, n)

# Randomly pick the 7 variables to use (seeded, so the pick is repeatable).
random.seed(1)
random_s = random.sample(list(X.columns), 7)
use_X = X[random_s]

# NOTE(review): no random_state is passed here, so the 80/20 split presumably
# differs from run to run — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(use_X, y, test_size=0.2)




In [3]:
# Index labels of the training rows whose target is 0 (normal) and 1 (anomaly).
normal_idx = y_train[y_train == 0].index.to_list()
anomaly_idx = y_train[y_train == 1].index.to_list()

# Explanatory variables and targets of the normal group and of the anomaly group.
normal_X = X_train.loc[normal_idx]
normal_y = y_train.loc[normal_idx]
anomaly_X = X_train.loc[anomaly_idx]
anomaly_y = y_train.loc[anomaly_idx]

In [28]:
# Standardize each explanatory variable.
def transform_standard(fit_X, transform_X):
    """Standardize ``transform_X`` with a scaler fitted on ``fit_X``.

    Args:
        fit_X: Data the StandardScaler is fitted on.
        transform_X: Data to be transformed with the fitted scaler.

    Returns:
        The result of ``StandardScaler.transform`` applied to ``transform_X``.
    """
    # fit() returns the scaler itself, so fitting and transforming can be chained.
    return StandardScaler().fit(fit_X).transform(transform_X)


# Gram-Schmidt orthogonalization
def fit_gram_schmidt(normal_Z):
    """Fit Gram-Schmidt coefficients and per-variable variances on normal data.

    The input is transposed so that each row is one variable; the rows are
    then orthogonalized in order with the classical Gram-Schmidt recurrence.

    Args:
        normal_Z: 2-D array-like of standardized normal samples. It is
            transposed internally, so it is expected as
            (samples, variables).

    Returns:
        A tuple ``(t, ips)`` where

        * ``t`` is a (variables, variables) array whose entry ``t[l][q]``
          (q < l) is the projection coefficient of variable ``l`` onto the
          q-th orthogonalized variable; all other entries are 0.
        * ``ips`` is a 1-D array with the sample variance (ddof=1) of each
          orthogonalized variable, i.e. the diagonal of ``np.cov`` of the
          orthogonalized rows.

    Note:
        If an orthogonalized variable is the zero vector (linearly dependent
        input), the coefficient division is 0/0 and propagates NaN.
    """
    normal_A = np.asarray(normal_Z, dtype=float).T
    n_features = normal_A.shape[0]

    # Compute t while building the orthogonalized rows one at a time.
    t = np.zeros((n_features, n_features))
    normal_U = np.zeros(normal_A.shape)
    for l in range(n_features):
        sigma = 0
        for q in range(l):
            t[l][q] = np.dot(normal_A[l], normal_U[q]) / np.dot(normal_U[q], normal_U[q])
            sigma += t[l][q] * normal_U[q]
        normal_U[l] = normal_A[l] - sigma

    # Compute ips. Only the diagonal of the covariance matrix is needed, so
    # take the row-wise variance directly; unlike np.diag(np.cov(...)) this
    # also works when there is a single variable (np.cov then returns a 0-d
    # array that np.diag rejects).
    ips = np.var(normal_U, axis=1, ddof=1)

    return t, ips

def create_gram_vec_U(Z, t):
    """Build the Gram-Schmidt feature vectors U from already-fitted coefficients.

    Args:
        Z: Standardized sample matrix. It is transposed internally, so each
            row of the working matrix is one column of ``Z``.
        t: Gram-Schmidt coefficients; only entries ``t[l][q]`` with q < l
            are read.

    Returns:
        Array with the same shape as ``Z.T``. Row ``l`` equals row ``l`` of
        ``Z.T`` minus the sum of ``t[l][q] * U[q]`` over all earlier rows q.
    """
    variables = Z.T
    ortho = np.zeros(variables.shape)
    n_rows = variables.shape[0]
    for row in range(n_rows):
        # Accumulate the components along the rows that were built before this one.
        projection = 0
        for prev in range(row):
            projection = projection + t[row][prev] * ortho[prev]
        ortho[row] = variables[row] - projection
    return ortho

def gram_schmidt_cal_MD(U, ips, feature_weight=None):
    """Compute the Gram-Schmidt Mahalanobis distance of each sample.

    For every row of ``U`` the result is
    ``sum_q feature_weight[q] * U[i, q]**2 / ips[q]`` divided by the number
    of columns of ``U``.

    Args:
        U: 2-D array-like with one row per sample and one column per
            orthogonalized variable (the visible callers pass the transpose
            of the ``create_gram_vec_U`` output).
        ips: Per-variable variances of the orthogonalized normal data, as
            returned by ``fit_gram_schmidt``; the first ``U.shape[1]``
            entries are used.
        feature_weight: Optional per-variable weights. ``None`` (default)
            weights every variable by 1, for any number of variables. When
            given, it must hold at least ``U.shape[1]`` entries; extra
            entries are ignored.

    Returns:
        1-D float array with one distance per row of ``U``.

    Raises:
        ValueError: If ``feature_weight`` has fewer entries than ``U`` has
            columns.
    """
    U = np.asarray(U, dtype=float)
    k = U.shape[1]
    ips = np.asarray(ips, dtype=float)[:k]

    if feature_weight is None:
        # Build the all-ones default per call, sized to the data, instead of
        # relying on a shared fixed-length default list.
        weights = np.ones(k)
    else:
        weights = np.asarray(feature_weight, dtype=float)
        if weights.shape[0] < k:
            raise ValueError(
                f"feature_weight has {weights.shape[0]} entries but U has {k} columns"
            )
        weights = weights[:k]

    # Weighted, variance-scaled squared components, averaged over the k variables.
    return (weights * U ** 2 / ips).sum(axis=1) / k

In [29]:
# Standardize; the scaler is fitted on the normal data in both calls.
normal_Z = transform_standard(normal_X, normal_X)
anomaly_Z = transform_standard(normal_X, anomaly_X)
# Fit the Gram-Schmidt coefficients and variances on the normal data only.
t, ips = fit_gram_schmidt(normal_Z)

# Gram-Schmidt MD of the normal samples.
normal_U = create_gram_vec_U(normal_Z, t)
print(gram_schmidt_cal_MD(normal_U.T, ips))

# Gram-Schmidt MD of the anomaly samples, using the coefficients fitted on the normal data.
anomaly_U = create_gram_vec_U(anomaly_Z, t)
print(gram_schmidt_cal_MD(anomaly_U.T, ips))



[0.90691008 1.68090017 0.16039911 ... 0.17040539 1.0268533  1.53254715]
[3.90929685 0.53442815 0.24454832 0.681618   0.49473021 0.57484444
 0.57484444 1.12762751 1.03190149 0.99078452 1.80144238 1.87599604
 1.14385371 0.42729832 0.48027337 0.61906806 0.38981255 0.61617635
 0.77385691 0.73153365 0.85327579 0.5366331  0.75010157 0.34973859
 0.33757394 0.89387869 0.2250977  0.72781946 0.22932327 0.38473634
 1.01286837 1.21042555 0.47128176 0.5734787  1.23465234 0.3516622
 0.81407166 2.28142705 0.68394122 0.59425482 0.82121875 0.87771352
 2.52431634 0.40309037 0.33538216 0.69717097 0.9517382  2.26996747
 1.12762751 0.68897353 2.62446671 0.76694173 0.80589805 1.07009231
 1.31308896 0.84988683 2.56336858 1.12762751 0.45851269 2.92318222
 0.681618   1.22201743 0.85327579 0.35271846 3.13683487 0.46492325
 0.67135822 0.79471171 0.56707749 0.73153365 0.61603553 0.48027337
 0.8149603  0.76700103 0.72682732 0.53200966 0.52408178 0.59455064
 0.49473021 0.8531975  0.56021819 0.79419809 0.88623013 0.

In [33]:
# Two-level table with 8 rows and 7 columns (levels 1 and 2).
# NOTE(review): presumably the L8 orthogonal array, one column per variable — confirm.
l8 = np.array([
        [1,1,1,1,1,1,1],
        [1,1,1,2,2,2,2],
        [1,2,2,1,1,2,2],
        [1,2,2,2,2,1,1],
        [2,1,2,1,2,1,2],
        [2,1,2,2,1,2,1],
        [2,2,1,1,2,2,1],
        [2,2,1,2,1,1,2]
        ])
# Convert to a boolean mask: True where the level is 1, False where it is 2.
l8 = (l8 == 1)

In [35]:
# Keep only the columns of normal_U.T for which the second row of the l8 mask is True.
normal_U.T[:, l8[1]]

array([[ 0.67020631, -0.86420426, -0.25699468],
       [-1.14758923,  2.1646827 , -1.67072684],
       [-0.69314034,  0.21546102, -0.28326795],
       ...,
       [ 0.67020631,  0.32779567, -0.28924871],
       [ 1.57910408, -0.39198118, -0.2717332 ],
       [ 1.1246552 ,  1.9545738 ,  0.3336002 ]])

In [37]:
# Entries of ips selected by the second row of the l8 mask.
ips[l8[1]]

array([1.00006512, 0.99327554, 0.99933321])

In [17]:
# Transposed standardized anomaly data; kept because it is a module-level name.
anomaly_A = anomaly_Z.T
# create_gram_vec_U transposes its input itself, so it must receive the
# untransposed anomaly_Z (passing anomaly_A would transpose twice and index
# t by sample number instead of by variable number).
anomaly_U = create_gram_vec_U(anomaly_Z, t)
gram_schmidt_cal_MD(anomaly_U.T, ips)

array([3.90929685, 0.53442815, 0.24454832, 0.681618  , 0.49473021,
       0.57484444, 0.57484444, 1.12762751, 1.03190149, 0.99078452,
       1.80144238, 1.87599604, 1.14385371, 0.42729832, 0.48027337,
       0.61906806, 0.38981255, 0.61617635, 0.77385691, 0.73153365,
       0.85327579, 0.5366331 , 0.75010157, 0.34973859, 0.33757394,
       0.89387869, 0.2250977 , 0.72781946, 0.22932327, 0.38473634,
       1.01286837, 1.21042555, 0.47128176, 0.5734787 , 1.23465234,
       0.3516622 , 0.81407166, 2.28142705, 0.68394122, 0.59425482,
       0.82121875, 0.87771352, 2.52431634, 0.40309037, 0.33538216,
       0.69717097, 0.9517382 , 2.26996747, 1.12762751, 0.68897353,
       2.62446671, 0.76694173, 0.80589805, 1.07009231, 1.31308896,
       0.84988683, 2.56336858, 1.12762751, 0.45851269, 2.92318222,
       0.681618  , 1.22201743, 0.85327579, 0.35271846, 3.13683487,
       0.46492325, 0.67135822, 0.79471171, 0.56707749, 0.73153365,
       0.61603553, 0.48027337, 0.8149603 , 0.76700103, 0.72682

In [20]:
# Inverse of the covariance matrix
def inv_cov(x):
    """Return the inverse of the covariance matrix of the columns of ``x``.

    Args:
        x: Standardized data; it is transposed before being handed to
            ``np.cov``, so each column of ``x`` is treated as one variable.
            The original author notes that, because the input is already
            standardized, the correlation matrix and the covariance matrix
            coincide.

    Returns:
        ``np.linalg.inv`` of that covariance matrix.
    """
    return np.linalg.inv(np.cov(np.transpose(x)))

# Mahalanobis distance
def cal_MD(Z, inv_C):
    """Compute the scaled quadratic form ``z @ inv_C @ z`` for every row of ``Z``.

    Args:
        Z: Standardized sample matrix, one sample per row.
        inv_C: Square matrix used as the kernel of the quadratic form; the
            visible caller passes the output of ``inv_cov``.

    Returns:
        1-D array with one value per row of ``Z``: the quadratic form divided
        by the number of columns of ``Z``.
    """
    n_features = Z.shape[1]
    distances = np.zeros(len(Z))
    for row_no, sample in enumerate(Z):
        weighted = np.dot(sample, inv_C)
        distances[row_no] = np.dot(weighted, sample.T) / n_features
    return distances

# Classical Mahalanobis distance: invert the covariance of the normal data,
# then score the normal and the anomaly samples with it.
# NOTE(review): presumably a cross-check against the Gram-Schmidt MD above — confirm.
inv_C = inv_cov(normal_Z)
cal_MD(normal_Z, inv_C)
cal_MD(anomaly_Z, inv_C)

array([3.90929685, 0.53442815, 0.24454832, 0.681618  , 0.49473021,
       0.57484444, 0.57484444, 1.12762751, 1.03190149, 0.99078452,
       1.80144238, 1.87599604, 1.14385371, 0.42729832, 0.48027337,
       0.61906806, 0.38981255, 0.61617635, 0.77385691, 0.73153365,
       0.85327579, 0.5366331 , 0.75010157, 0.34973859, 0.33757394,
       0.89387869, 0.2250977 , 0.72781946, 0.22932327, 0.38473634,
       1.01286837, 1.21042555, 0.47128176, 0.5734787 , 1.23465234,
       0.3516622 , 0.81407166, 2.28142705, 0.68394122, 0.59425482,
       0.82121875, 0.87771352, 2.52431634, 0.40309037, 0.33538216,
       0.69717097, 0.9517382 , 2.26996747, 1.12762751, 0.68897353,
       2.62446671, 0.76694173, 0.80589805, 1.07009231, 1.31308896,
       0.84988683, 2.56336858, 1.12762751, 0.45851269, 2.92318222,
       0.681618  , 1.22201743, 0.85327579, 0.35271846, 3.13683487,
       0.46492325, 0.67135822, 0.79471171, 0.56707749, 0.73153365,
       0.61603553, 0.48027337, 0.8149603 , 0.76700103, 0.72682