In [10]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import KFold
from sklearn.mixture import BayesianGaussianMixture
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor

In [17]:
# 近傍有効距離の計算
def reachability_dist(knn, xs, xs_idx, xt, xt_idx):
    xs_distk, xs_idxk = knn.kneighbors([xs])
    xt_distk, xt_idxk = knn.kneighbors([xt])
    if xs_idx in xt_idxk and xt_idx in xs_idxk:
        return np.max(xt_distk)
    else:
        return np.sum((xs - xt)**2)**0.5

def local_outlier_factor(X_train, X_test, num_k=5):
    # k近傍法
    knn = NearestNeighbors(n_neighbors=num_k)
    knn.fit(X_train)
    # テストデータの計算
    anom_score = []
    for i, x1 in X_test.iterrows():
        # x1 のk近傍計算
        x1_distk, x1_idxk = knn.kneighbors([x1])
        # 有効近傍距離の計算
        numerator = 0
        for j in x1_idxk[0]:
            x2 = X_train.iloc[j]
            numerator += reachability_dist(knn, x1, i,  x2, j)
        numerator /= num_k
        denominator = 0
        for j in x1_idxk[0]:
            x2 = X_train.iloc[j]
            x2_distk, x2_idxk = knn.kneighbors([x2])
            for k in x2_idxk[0]:
                x3 = X_train.iloc[k]
                denominator += numerator / reachability_dist(knn, x2, j,  x3, k) / num_k
        a_x1 = denominator / num_k
        anom_score.append(a_x1)
    return anom_score

def break_even(a, y_test):
    """ 分岐点精度の計算 """
    # 正常標本精度と異常標本精度の計算
    y_test.reset_index(drop=True, inplace=True) # インデックスリセット
    idx = a.argsort()[::-1] # 降順のインデックス計算
    n_total = len(y_test)
    n_anom = sum(y_test)
    n_norm = n_total - n_anom
    coverage = np.zeros(n_total) # 異常標本精度
    detection = np.zeros(n_total) # 正常標本精度
    for i in range(n_total):
        n_detected_anom = sum(y_test[idx][:i])
        n_detected_norm = n_total - i - sum(y_test[idx][i:])
        coverage[i] = n_detected_anom / n_anom
        detection[i] = n_detected_norm / n_norm

    # 分岐点精度の計算
    thresh = 0
    for i, (c_score, d_score) in enumerate(zip(coverage, detection)):
        if c_score >= d_score:
            thresh = i
            break
    break_even_point = a[idx][thresh]
    print(break_even_point, c_score, d_score)
    return (break_even_point, c_score), (coverage, detection)

In [5]:
# ファイル読み込み
IN_FILE = '../python_codes/data/SkillCraft1_Dataset.csv'
df = pd.read_csv(IN_FILE, header=0, index_col=None, sep=',')

In [6]:
# '?'を含む行を削除
df = df[(df=='?').sum(axis=1)==0]

In [7]:
# シャッフルとインデックス更新
df = df.sample(frac=1, random_state=0) # 行シャッフル
df = df.reset_index(drop=True) # インデックスの更新

In [8]:
# データセットの作成
target_col = 'LeagueIndex'
target = 1
del_col = [target_col, 'GameID']
X = df.drop(del_col, axis=1)
y = (df[target_col]==target).astype(np.int32)
print(X.shape)

(3338, 18)


# 異常検出（5分割交差検証による評価）

In [7]:
%%time
# 交差確認
scores = []
kf = KFold(n_splits=5, random_state=0, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # 混合正規分布による異常検知
    # →平均
    anomaly_score = local_outlier_factor(X_train[y_train==0].astype(np.float64), X_test.astype(np.float64), 15)
    (break_even_point, score), (coverage, detection) = break_even(np.array(anomaly_score), y_test)
    scores.append(score)
print('average: {}'.format(sum(scores)/len(scores)))

0.9510966448883394 0.6666666666666666 0.6426332288401254
0.9441844571560967 0.6285714285714286 0.627172195892575
1.0022840928188186 0.71875 0.7154088050314465
0.9302788944687234 0.5757575757575758 0.5615141955835962
0.9826327257592379 0.7027027027027027 0.6936507936507936
average: 0.6584896747396747
Wall time: 7min 12s


## scikit-learnのLOF

In [15]:
lof = LocalOutlierFactor(n_neighbors=20, algorithm='auto', leaf_size=30)
lof.fit(X)
lof.fit_predict(X)
lof._decision_function(X)

array([-0.99181867, -1.01836697, -1.05624588, ..., -2.17461449,
       -0.99931323, -0.98966341])

In [20]:
%%time
# 交差確認
scores = []
lof = LocalOutlierFactor(n_neighbors=20, algorithm='auto', leaf_size=30)
kf = KFold(n_splits=5, random_state=0, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # 混合正規分布による異常検知
    # →平均
    lof.fit(X_train)
    anomaly_score = lof._decision_function(X)
    (break_even_point, score), (coverage, detection) = break_even(np.array(anomaly_score), y_test)
    scores.append(score)
print('average: {}'.format(sum(scores)/len(scores)))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


-0.9305283238616845 nan nan
-0.9436144461929311 nan nan
-0.9424459876360711 nan nan
-0.9389112543888773 nan nan
-0.9513977332961348 nan nan
average: nan
Wall time: 4.64 s
