# IsolationForestについて

- ラベルなし異常検知アルゴリズムの一つ

  - 与えたデータからランダムに特徴を選び，ランダムに分割する点を選ぶ，という手順でデータを再帰的に分割していくことで，異常値を他のデータから分離(isolate)するアルゴリズム．異常値は少ない分割回数で分離されやすい．

  - Random Forestと同様に，データをサブサンプリングして木を作る，木は複数個(default:100)作る＝森，というモデルをロバストにする工夫もしているアルゴリズム．

- 最終的には，各データ点に対して，ルートノードからの距離（木の深さ）を，全ての木（森）にわたって平均する．

    この値が異常スコアに対応している．

## 異常スコアの計算方法
    平均深さをそのまま使っているのではなく，同じサンプル数の二分探索木で探索が失敗する場合の平均経路長で正規化している．

    このように正規化することで，異常スコアを(0,1)区間に収めることができて嬉しい．

- Isolation Forestは教師あり学習ではないけれど，訓練データと評価データが存在する．

    - 訓練では，IsolationForestの森を作成する．
    
    - 評価では，データを森に入れて，各データ点の異常スコアを取得する．
    
## 今回の実験

  - 50個(内2個経年劣化)のFPGAデータを使用.
  
  - データはそれぞれ4884個の残差特徴量を持つ．
    
  - IsolationForestの木の数を100,1000と変えて実験.

In [3]:
from sklearn.ensemble import IsolationForest

In [4]:
import pandas as pd
import generate_nnr_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import seaborn as sns
from statistics import mean, variance
from scipy import stats

In [5]:
def csv_to_data(directory, data_n):
    """Load the non-aged sample CSV files into a single array.

    Reads ``<directory>/s3.csv`` through ``<directory>/s<data_n>.csv``
    (file numbering starts at 3), parsing each file without a header row.

    Args:
        directory: Directory path (string) containing the ``s<i>.csv`` files.
        data_n: Index of the last file to read (inclusive).

    Returns:
        ``np.array`` built from the per-file value arrays, in file-index
        order. Empty when ``data_n < 3``.
    """
    file_indices = range(3, data_n + 1)
    per_file_values = [
        pd.read_csv(directory + '/s' + str(idx) + '.csv', header=None).values
        for idx in file_indices
    ]
    return np.array(per_file_values)


def csv_to_aged_data(directory, aged_data_n):
    """Load the aged sample CSV files into a single array.

    Reads ``<directory>/s1_aged.csv`` through
    ``<directory>/s<aged_data_n>_aged.csv``, parsing each file without a
    header row.

    Args:
        directory: Directory path (string) containing the ``s<i>_aged.csv``
            files.
        aged_data_n: Index of the last aged file to read (inclusive).

    Returns:
        ``np.array`` built from the per-file value arrays, in file-index
        order. Empty when ``aged_data_n < 1``.
    """
    file_indices = range(1, aged_data_n + 1)
    per_file_values = [
        pd.read_csv(directory + '/s' + str(idx) + '_aged.csv', header=None).values
        for idx in file_indices
    ]
    return np.array(per_file_values)


def generate_data(directory, data_n, aged_data_n):
    """Load both the non-aged and the aged measurements from ``directory``.

    Args:
        directory: Directory path passed through to both loaders.
        data_n: Last file index forwarded to ``csv_to_data``.
        aged_data_n: Last file index forwarded to ``csv_to_aged_data``.

    Returns:
        Tuple ``(data, aged_data)`` with the two loaders' results.
    """
    # The non-aged files are read first, then the aged ones.
    return (
        csv_to_data(directory, data_n),
        csv_to_aged_data(directory, aged_data_n),
    )


def generate_residual_data(data_n, data):
    """
    Build residual data: measured value minus estimated value.

    The estimate for a cell is the mean of its 4-connected neighbours
    (up, right, down, left) that lie inside the grid. Grid bounds are
    taken from each sample's own shape instead of a fixed 148 x 33, so
    grids of any size are handled correctly.

    Args:
        data_n: Number of leading samples of ``data`` to process.
        data: Array indexed as ``data[sample, row, col]``.

    Returns:
        Array with the same shape and dtype as ``data`` holding the
        residuals. Samples at index ``data_n`` and beyond are left at zero.

    Raises:
        statistics.StatisticsError: If a cell has no in-grid neighbour
            (a 1 x 1 grid), because the mean of an empty list is undefined.
    """
    # (dy, dx) offsets in the order up, right, down, left.
    neighbour_offsets = ((-1, 0), (0, 1), (1, 0), (0, -1))

    residual_data = np.zeros_like(data)

    for i in range(data_n):
        n_rows, n_cols = data[i].shape
        for j in range(n_rows):
            for k in range(n_cols):
                # Border cells simply have fewer neighbours to average.
                neighbours = [
                    data[i, j + dy, k + dx]
                    for dy, dx in neighbour_offsets
                    if 0 <= j + dy < n_rows and 0 <= k + dx < n_cols
                ]
                residual_data[i, j, k] = data[i, j, k] - mean(neighbours)

    return residual_data


def generate_nnr(data_n=50, aged_data_n=2):
    """
    Generate the residual data sets for the non-aged and aged samples.

    Loads the measurements from the ``fresh_aged_ieice`` directory and
    converts both sets to neighbour-mean residuals.

    Args:
        data_n: Last non-aged file index to load.
        aged_data_n: Number of aged files to load.

    Returns:
        Tuple ``(residual_data, aged_residual_data)``.
    """
    fresh, aged = generate_data('fresh_aged_ieice', data_n, aged_data_n)

    # Non-aged files are numbered from 3, so only data_n - 2 were loaded.
    fresh_count = data_n - 2

    return (
        generate_residual_data(fresh_count, fresh),
        generate_residual_data(aged_data_n, aged),
    )

In [6]:
import csv  # csv module, used to write the feature matrix

residual_data, aged_residual_data = generate_nnr()

# One row per sample: each residual grid is flattened to a 1-D feature
# vector. Non-aged samples come first, aged samples are appended last.
data = np.array(
    [sample.flatten() for sample in residual_data]
    + [sample.flatten() for sample in aged_residual_data]
)

# Mode 'w' overwrites an existing new.csv, so pick an unused file name.
# newline='' is what the csv module requires to avoid extra blank rows on
# some platforms; the with-block closes the file even if writing fails.
with open('new.csv', 'w', newline='') as file:
    csv.writer(file).writerows(data)

In [7]:
# Reload the feature matrix written to new.csv (no header row in the file).
dataset = pd.read_csv("./new.csv", header=None)
# Keep an independent copy of the loaded frame.
orig = dataset.copy(deep=True)
# Bare expression: displays the frame as the notebook cell output.
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4874,4875,4876,4877,4878,4879,4880,4881,4882,4883
0,2.48775,-2.279433,1.000967,1.4769,1.874133,-0.688033,-0.436433,39.998233,-38.753767,0.0,...,1.116833,0.281233,0.8139,-0.6626,38.373933,-38.3024,0.0,0.0,0.0,0.0
1,1.7678,-1.852233,0.907767,1.061767,1.8017,0.4512,-0.556533,41.515533,-40.9928,0.0,...,0.792267,1.103433,0.470733,-0.791733,40.1131,-39.741767,0.0,0.0,0.0,0.0
2,2.7298,-3.1781,1.0412,1.7057,1.934533,-0.177067,-0.194433,41.583867,-41.2845,0.0,...,0.695167,0.254133,0.8565,-0.9051,41.089233,-40.769633,0.0,0.0,0.0,0.0
3,2.05035,-2.0777,1.0933,1.207767,1.7112,-0.326367,0.855867,41.642467,-41.111167,0.0,...,0.851633,0.9651,-0.296233,-0.008233,40.1906,-40.1376,0.0,0.0,0.0,0.0
4,2.4741,-2.858067,1.2018,1.825367,0.946067,0.558767,-0.3139,40.102167,-39.803533,0.0,...,0.710367,0.497367,1.3157,-1.2069,38.6519,-38.844967,0.0,0.0,0.0,0.0
5,1.9491,-2.672767,0.540733,1.6046,1.0587,0.546367,0.1041,41.160467,-40.400567,0.0,...,0.113367,0.972233,0.340067,-0.315167,40.714967,-40.276667,0.0,0.0,0.0,0.0
6,2.05115,-2.647867,1.028033,1.819867,0.940633,-0.602833,1.095267,41.167667,-40.393567,0.0,...,0.980067,0.644433,0.588333,-0.6806,39.7294,-39.423933,0.0,0.0,0.0,0.0
7,1.92485,-3.477433,1.3499,1.650067,1.081167,-0.278867,0.710567,40.5503,-40.063567,0.0,...,2.073833,0.043633,1.409867,-1.5563,40.0436,-39.8195,0.0,0.0,0.0,0.0
8,3.07685,-3.6063,2.017067,0.612733,1.4275,0.0392,0.848433,42.117433,-41.815267,0.0,...,0.945567,0.62,0.901867,-1.173767,41.296833,-41.053133,0.0,0.0,0.0,0.0
9,1.6182,-1.199567,0.1319,1.164067,1.8041,-0.1241,-0.138267,38.7844,-38.057433,0.0,...,1.6713,-0.498433,0.7215,-0.5959,37.2603,-37.2764,0.0,0.0,0.0,0.0


In [25]:
# Isolation Forest with 100 trees. The `behaviour` argument was removed
# from scikit-learn (passing it now raises TypeError), so it is not given.
# random_state=None means the scores differ from run to run.
iforest = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination='auto',
    max_features=1.0,
    bootstrap=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
)
iforest.fit(dataset)
# decision_function is centred on zero: negative values are the outlier side.
iforest_result = iforest.decision_function(dataset)
iforest_result

array([ 0.06225912,  0.04909049,  0.0295603 ,  0.0663096 ,  0.06197065,
        0.05211347,  0.0490397 ,  0.01427283,  0.01082967,  0.02922558,
        0.0425606 ,  0.03061205,  0.02006688,  0.05838746,  0.04358934,
        0.04523684,  0.05218248,  0.01444962,  0.02366167,  0.04765974,
        0.05919141,  0.03663748, -0.00034809,  0.03761842,  0.01932865,
        0.01939375,  0.04143669,  0.04264991,  0.05192083,  0.07036606,
        0.03561843,  0.05644769,  0.04381236,  0.03282705,  0.03386597,
        0.00130951,  0.03273071,  0.04014182,  0.05405859,  0.04899911,
        0.05436678,  0.03776946,  0.0679537 ,  0.05553679,  0.05782823,
        0.03583699,  0.03426211,  0.03334055, -0.01487604, -0.01151067])

In [43]:
# Ground-truth labels for the 50 rows: 0 everywhere except the last two
# rows, which are the aged samples appended last when new.csv was built.
x = np.zeros(50)
x[-2:] = 1

# Attach the Isolation Forest score and the label to a copy of the features.
iforest_orig = dataset.copy()
iforest_orig['if'] = iforest_result
iforest_orig['class'] = x

In [44]:
# The five rows with the lowest decision-function value (most anomalous).
iforest_top5_data = iforest_orig.sort_values(by='if').head(5)
iforest_top5_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4876,4877,4878,4879,4880,4881,4882,4883,if,class
48,0.9939,-1.215267,-0.099033,1.8056,0.568867,0.379567,-0.027667,35.704933,-35.528967,0.0,...,-0.4752,0.6657,34.700167,-34.7951,0.0,0.0,0.0,0.0,-0.034015,1.0
49,1.4739,-2.339467,1.256933,1.066133,0.84,0.793833,-0.733867,36.33,-35.965567,0.0,...,0.8381,-0.301367,35.695367,-35.931133,0.0,0.0,0.0,0.0,-0.006634,1.0
35,1.3136,-1.756367,0.181367,2.235533,0.535567,-0.020867,0.3971,36.3062,-36.2232,0.0,...,0.524833,-0.9138,35.882933,-35.760067,0.0,0.0,0.0,0.0,0.000264,0.0
8,3.07685,-3.6063,2.017067,0.612733,1.4275,0.0392,0.848433,42.117433,-41.815267,0.0,...,0.901867,-1.173767,41.296833,-41.053133,0.0,0.0,0.0,0.0,0.01207,0.0
25,2.7093,-2.7346,0.876633,1.856867,0.4452,0.395833,-0.474767,38.422633,-37.313767,0.0,...,1.285233,-0.842167,35.7533,-35.883567,0.0,0.0,0.0,0.0,0.017907,0.0


In [45]:
# How many of the five most anomalous rows are labelled as aged (class 1).
int((iforest_top5_data['class'] == 1).sum())

2

## 100結果
- 48, 49が異常だと判定しており，概ね正しい．
- しかし，IsolationForestがランダム値を扱うアルゴリズムなので，測定ごとに結果は異なる．
- 48, 49は上位５位に大体入るが，ばらつきはある．(詳細に調べていない)

In [50]:
# Same experiment with 1000 trees. The `behaviour` argument was removed
# from scikit-learn (passing it now raises TypeError), so it is not given.
# random_state=None means the scores differ from run to run.
iforest = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination='auto',
    max_features=1.0,
    bootstrap=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
)
iforest.fit(dataset)
# decision_function is centred on zero: negative values are the outlier side.
iforest_result = iforest.decision_function(dataset)

# Attach the score and the ground-truth label (last two rows are the aged
# samples) to a copy of the features.
iforest_orig = dataset.copy()
iforest_orig['if'] = iforest_result
x = np.zeros(50)
x[-2:] = 1
iforest_orig['class'] = x

# The five rows with the lowest score (most anomalous).
iforest_top5_data = iforest_orig.sort_values(by=['if'], ascending=True)[:5]
iforest_top5_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4876,4877,4878,4879,4880,4881,4882,4883,if,class
48,0.9939,-1.215267,-0.099033,1.8056,0.568867,0.379567,-0.027667,35.704933,-35.528967,0.0,...,-0.4752,0.6657,34.700167,-34.7951,0.0,0.0,0.0,0.0,-0.033764,1.0
49,1.4739,-2.339467,1.256933,1.066133,0.84,0.793833,-0.733867,36.33,-35.965567,0.0,...,0.8381,-0.301367,35.695367,-35.931133,0.0,0.0,0.0,0.0,-0.006005,1.0
35,1.3136,-1.756367,0.181367,2.235533,0.535567,-0.020867,0.3971,36.3062,-36.2232,0.0,...,0.524833,-0.9138,35.882933,-35.760067,0.0,0.0,0.0,0.0,-0.003122,0.0
8,3.07685,-3.6063,2.017067,0.612733,1.4275,0.0392,0.848433,42.117433,-41.815267,0.0,...,0.901867,-1.173767,41.296833,-41.053133,0.0,0.0,0.0,0.0,0.00868,0.0
17,0.62275,-1.2865,0.6921,0.7453,1.715633,0.033433,-0.188533,37.577767,-36.732733,0.0,...,0.025167,-0.083233,35.073067,-35.889267,0.0,0.0,0.0,0.0,0.010476,0.0


In [47]:
# How many of the five most anomalous rows are labelled as aged (class 1).
int((iforest_top5_data['class'] == 1).sum())

2

## 1000結果
- ほぼ完璧に48と49を当てる．

## 疑問点
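     異常スコアの値は[0,1]の間におさまるはずだが，おさまっていないデータがあった．（0以下の値）
     
     そのデータを異常値と判定している．

     補足: scikit-learnの`decision_function`は論文の異常スコアs(x)そのものではなく，`score_samples`（= -s(x)）から`offset_`を引いた値を返す．`contamination='auto'`のとき`offset_`は-0.5なので，返り値は0.5 - s(x)となり(-0.5, 0.5)の範囲をとる．したがって負の値（s(x) > 0.5）が異常側に対応しており，0以下の値が出ること自体は正常な挙動である．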