In [1]:
import warnings
# warnings.filterwarnings("ignore", message="numpy.dtype size changed")
# warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import KFold
# from MTS import *
# from orthogonal_array import *
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

In [2]:
def break_even(a, y_test, verbose=True):
    """Compute the break-even accuracy of a set of anomaly scores.

    Samples are ranked by decreasing anomaly score. For every cut position
    ``i`` the ``i`` highest-scoring samples are treated as flagged anomalous
    and the remaining ones as judged normal, giving two curves:

    * ``coverage[i]``  -- fraction of all anomalous samples found among the
      ``i`` flagged samples (anomalous-sample accuracy);
    * ``detection[i]`` -- fraction of all normal samples found among the
      remaining ``n - i`` samples (normal-sample accuracy).

    The break-even point is the first cut position at which
    ``coverage >= detection``.

    Parameters
    ----------
    a : array-like of shape (n,)
        Anomaly scores; a larger value means "more anomalous".
    y_test : array-like of shape (n,)
        Labels, 1 for anomalous and 0 for normal samples (boolean values are
        accepted as well). The argument is not modified.
    verbose : bool, default True
        When true, print the score threshold together with the coverage and
        detection values at the break-even point.

    Returns
    -------
    (break_even_point, c_score) : tuple
        The anomaly score of the sample at the break-even position and the
        coverage reached there.
    (coverage, detection) : tuple of numpy.ndarray
        The two accuracy curves, each of length ``n``.

    Raises
    ------
    ValueError
        If ``a`` and ``y_test`` differ in length, or if ``y_test`` does not
        contain at least one anomalous and one normal sample (both accuracies
        would otherwise require a division by zero).
    """
    scores = np.asarray(a)
    # Work on a private integer copy so the caller's labels (and their index,
    # when a pandas Series is passed) are left untouched.
    labels = np.asarray(y_test).astype(np.int64)

    n_total = len(labels)
    if len(scores) != n_total:
        raise ValueError(
            "a and y_test must have the same length "
            "(got {} and {})".format(len(scores), n_total)
        )
    n_anom = int(labels.sum())
    n_norm = n_total - n_anom
    if n_anom == 0 or n_norm == 0:
        raise ValueError(
            "y_test must contain at least one anomalous and one normal sample"
        )

    idx = scores.argsort()[::-1]  # positions sorted by decreasing score
    sorted_labels = labels[idx]

    # Number of anomalies among the i highest-scoring samples, for i = 0..n-1
    # (an exclusive prefix sum replaces the per-iteration re-summation).
    anomalies_flagged = np.concatenate(([0], np.cumsum(sorted_labels)[:-1]))
    # The n - i unflagged samples contain (n_anom - anomalies_flagged)
    # anomalies; everything else in that tail is a correctly kept normal.
    n_unflagged = n_total - np.arange(n_total)
    normals_kept = n_unflagged - (n_anom - anomalies_flagged)

    coverage = anomalies_flagged / n_anom
    detection = normals_kept / n_norm

    # First cut position where coverage catches up with detection. With both
    # classes present this always exists: at the last position detection is at
    # most 1 / n_norm while coverage is at least (n_anom - 1) / n_anom.
    thresh = int(np.argmax(coverage >= detection))
    break_even_point = scores[idx[thresh]]
    c_score = coverage[thresh]
    d_score = detection[thresh]
    if verbose:
        print(break_even_point, c_score, d_score)
    return (break_even_point, c_score), (coverage, detection)

In [3]:
# Load the dataset. The file is a comma-separated table whose first row holds
# the column names; these are pandas' read_csv defaults, so no extra keyword
# arguments are needed.
IN_FILE = '../data/SkillCraft1_Dataset.csv'
df = pd.read_csv(IN_FILE)

In [4]:
# Drop every row that contains a '?' entry in at least one column.
df = df.loc[~df.eq('?').any(axis=1)]

In [5]:
# Shuffle the rows (fixed seed for reproducibility), then renumber the index
# from zero.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

In [6]:
# Build the dataset: X holds the features, y the binary label.
target_col = 'LeagueIndex'
target = 1
del_col = [target_col, 'GameID']
# Features are every column except the label column and the game identifier.
X = df.drop(columns=del_col)
# y is 1 for rows whose LeagueIndex equals `target`, 0 for all other rows.
y = df[target_col].eq(target).astype(np.int32)
print(X.shape)

(3338, 18)


# 異常検出（5分割交差検証による評価）

In [7]:
%%time
# Cross-validation: for each of the 5 folds, fit a kernel density estimate on
# the normal (y == 0) training samples, score the held-out fold, and record the
# break-even accuracy returned by break_even().
scores = []
# Candidate KDE bandwidths: 21 log-spaced values from 0.1 to 10.
params = {'bandwidth': np.logspace(-1, 1, 21)}
# No `scoring` argument is given, so the grid search ranks bandwidths with
# KernelDensity's own score method.
grid = GridSearchCV(KernelDensity(), params)
kf = KFold(n_splits=5, random_state=0, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Earlier note: anomaly detection with a Gaussian mixture -> average 0.3957
    # NOTE(review): presumably the result of a previous experiment; this cell
    # does not use a Gaussian mixture -- confirm and remove if stale.
    # Run the kernel density estimation on the normal training samples only.
    grid.fit(X_train[y_train==0])
    kde = grid.best_estimator_
    # Anomaly score = negated score_samples output, so a larger value means
    # the sample is less likely under the fitted density.
    anomaly_score = -kde.score_samples(X_test)
    (break_even_point, score), (coverage, detection) = break_even(anomaly_score, y_test)
    # Scores of the anomalous test samples; not used further within this cell.
    anomaly_score_abnormal = anomaly_score[y_test==1]
    scores.append(score)
# Mean break-even accuracy over the folds.
print('average: {}'.format(sum(scores)/len(scores)))

66.02823561707095 0.6666666666666666 0.664576802507837
65.81452228564487 0.6285714285714286 0.6050552922590837
66.03896416875386 0.6875 0.6871069182389937
65.87276846342043 0.6666666666666666 0.637223974763407
66.29873459576447 0.7297297297297297 0.7142857142857143
average: 0.6758268983268982
Wall time: 1min 6s
