# XP教師なし異常検知

# k-means
    k-means法は,データをまず適当なクラスタに分けたのち,各クラスタの平均(重心)に基づいてデータを割り当て直すことを繰り返してクラスタリングするアルゴリズム.
    任意のk個のクラスタを作成するアルゴリズムなので、k-means法(k点平均法)と呼ばれる.

## 具体的な手順
1. クラスタ数kを決める
2. データが含まれる空間にランダムにk個の点(セントロイド)を置く
3. 各データがセントロイドのうちどれに最も近いかを計算して、そのデータが所属するクラスタとする
4. セントロイドの位置をそのクラスタに含まれるデータの重心になるように移動する
5. 各セントロイドの位置が変わらなくなるまで3, 4を繰り返す

## k-meansの問題点
    ランダムな初期値に依存する．　-> 効率的にクラスタリングできない可能性がある．
    
----------------------------------------------------------------
    
# k-means++
    初期のクラスタ中心同士を,確率的に互いに離れた位置に設置するという発想.
    k-meansと異なる点は，セントロイドの初期値.
   
## 具体的な初期値決定手順

1. 始めにデータ点をランダムに選び1つ目のセントロイドとする．
2. 全てのデータ点とその最近傍のセントロイドの距離を求める．
3. その距離の二乗に比例した確率で,選ばれていないデータ点をセントロイドとしてランダムに選ぶ．
4. k個のセントロイドが選ばれるまで2, 3を繰り返す．

### 初期値が決まったのちは,通常のk-meansアルゴリズムを適用

-------------------------------------------------------------------

## XP手法
    XPの教師なし異常検知では，シルエットプロットを用いて最適なクラスタ数を決定する．
    測定したいFPGAそれぞれについて周波数を計測し，k-meansを用いてクラスタリングしたのち，シルエットプロットで最適なクラスタ数を決定する．
    この最適なクラスタ数が，新しいFPGAでは少なく，使用済みFPGAは多くなるという仮説のもと，実験を進める．
    最適なクラスタ数を求めたのち，適切なクラスタ数閾値を設定して，使用済みFPGAを見分ける．


In [27]:
"""ライブラリ"""
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import seaborn as sns
from statistics import mean, variance
from scipy import stats
from scipy.stats import norm
from collections import Counter
import copy

class FPGA:
    """Bundle the clustering result of one device.

    The three constructor arguments are stored unchanged as attributes:
    ``frequency`` (the values that were clustered), ``cluster`` (the
    per-value cluster assignment) and ``center`` (the cluster centres).
    """

    def __init__(self, frequency, cluster, center):
        self.frequency, self.cluster, self.center = frequency, cluster, center

"""データ生成関数"""
def generate_data(directory, data_n, aged_data_n):
    """Load the per-device CSV grids stored in *directory*.

    Reads ``s1.csv`` .. ``s<data_n>.csv`` and then ``s1_aged.csv`` ..
    ``s<aged_data_n>_aged.csv`` (all without a header row).

    Returns:
        ``(data, aged_data)``: two NumPy arrays holding the loaded grids in
        file-number order.
    """
    def load(file_name):
        # Every file is read header-less and reduced to its raw value array.
        return pd.read_csv(directory + '/' + file_name, header=None).values

    data = np.array([load('s' + str(i) + '.csv') for i in range(1, data_n + 1)])
    aged_data = np.array(
        [load('s' + str(i) + '_aged.csv') for i in range(1, aged_data_n + 1)]
    )
    return data, aged_data

# Runs as soon as this cell executes: reads s1..s50.csv and s1_aged..s2_aged.csv
# from the 'fresh_aged_ieice' directory.
newdata, ageddata = generate_data('fresh_aged_ieice', 50, 2)


"""0の数数える関数 二次元入れろ"""
def count_zero(data):
    """Return, for each grid along the first axis of *data*, its zero count.

    Every element ``data[i]`` must be (at least) two-dimensional; the count
    is ``rows * columns`` minus the number of non-zero entries.
    """
    grids = (data[index] for index in range(data.shape[0]))
    return [
        grid.shape[0] * grid.shape[1] - np.count_nonzero(grid)
        for grid in grids
    ]


"""一次元にする関数"""
def change_flatten(data):
    """Flatten every element along the first axis of *data* to one dimension.

    Returns a new NumPy array built from the flattened elements.
    """
    return np.array([data[index].flatten() for index in range(data.shape[0])])


"""0を消す関数 flatteしたやつ入れろ"""
def delete_zero(data):
    """Drop the zero entries from every row of *data*.

    Args:
        data: array whose first axis enumerates the rows (pass the flattened
            grids here).

    Returns:
        A regular 2-D array when every row keeps the same number of values.
        Otherwise a 1-D ``object`` array whose elements are the per-row 1-D
        arrays. Rows can still be read as ``result[i]`` and counted with
        ``result.shape[0]`` in both cases.
    """
    # Boolean-mask indexing already returns a fresh array, so no explicit
    # copy of the row is needed.
    kept = []
    for index in range(data.shape[0]):
        row = np.asarray(data[index])
        kept.append(row[row != 0])

    if len({len(row) for row in kept}) <= 1:
        return np.array(kept)

    # Rows of different length: np.array(kept) raises ValueError on
    # NumPy >= 1.24, so fill an object array element by element instead.
    ragged = np.empty(len(kept), dtype=object)
    for index, row in enumerate(kept):
        ragged[index] = row
    return ragged

"""９２０この値を消す"""
def delete_920(data, check):
    """Return a copy of *data* with the listed grid positions set to zero.

    Args:
        data: 3-D array indexed as ``[device, row, column]``.
        check: iterable of ``(row, column)`` pairs (lists or tuples) to
            clear on every device. Pairs that fall outside the grid are
            ignored.

    Returns:
        A new array of the same shape and dtype; *data* is not modified.
    """
    new = np.array(data, copy=True)

    # One (n, 2) index table replaces a per-cell ``[j, k] in check`` list scan.
    positions = np.asarray(list(check), dtype=int).reshape(-1, 2)
    rows, cols = positions[:, 0], positions[:, 1]
    # Discard out-of-grid pairs so they neither raise nor wrap around as
    # negative indices.
    inside = (rows >= 0) & (rows < new.shape[1]) & (cols >= 0) & (cols < new.shape[2])

    new[:, rows[inside], cols[inside]] = 0
    return new
                    

"""東西南北残差作り出す関数"""
def _neighbour_residual(data, offsets):
    """Return |value - mean of its non-zero neighbours| for every non-zero cell.

    Args:
        data: 3-D array indexed as ``[device, row, column]``; zero marks a
            cell without a measurement.
        offsets: ``(row_step, column_step)`` pairs, each step in -1..1,
            naming the neighbours to average.

    Returns:
        Array of the same shape. Cells that are zero in *data*, or that have
        no non-zero neighbour, are left at 0. Floating-point input keeps its
        dtype; any other dtype is returned as float64 so the residual is not
        truncated.
    """
    data = np.asarray(data)
    out_dtype = data.dtype if np.issubdtype(data.dtype, np.floating) else np.float64
    _, height, width = data.shape

    # A one-cell border of zeros lets neighbours beyond the grid edge be read
    # like "no measurement" cells, whatever the grid size is.
    padded = np.pad(data, ((0, 0), (1, 1), (1, 1)))

    neighbour_sum = np.zeros(data.shape, dtype=np.float64)
    neighbour_count = np.zeros(data.shape, dtype=np.intp)
    for row_step, col_step in offsets:
        shifted = padded[
            :,
            1 + row_step:1 + row_step + height,
            1 + col_step:1 + col_step + width,
        ]
        neighbour_sum += shifted  # zero cells contribute nothing to the sum
        neighbour_count += shifted != 0

    usable = (data != 0) & (neighbour_count > 0)
    residual = np.zeros(data.shape, dtype=out_dtype)
    residual[usable] = np.abs(
        data[usable] - neighbour_sum[usable] / neighbour_count[usable]
    )
    return residual


def EWSN_residual(data):
    """Residual of each cell against its four edge-adjacent neighbours.

    For every non-zero cell of the 3-D array *data* (``[device, row,
    column]``) the result holds the absolute difference between the cell and
    the mean of its non-zero north/east/south/west neighbours. Zero cells and
    cells without any non-zero neighbour yield 0.
    """
    return _neighbour_residual(data, [(-1, 0), (0, 1), (1, 0), (0, -1)])


def king_residual(data):
    """Residual of each cell against its eight surrounding neighbours.

    Same as ``EWSN_residual`` but the mean is taken over the non-zero cells
    among all eight neighbours (a chess king's moves).
    """
    return _neighbour_residual(
        data,
        [(-1, -1), (-1, 0), (-1, 1), (0, 1), (1, 1), (1, 0), (1, -1), (0, -1)],
    )

    
"""リストを繋げる関数"""
def connect(a, b):
    """Return a new array holding the elements of *a* followed by those of *b*.

    Both arguments are walked along their first axis.
    """
    joined = [a[index] for index in range(a.shape[0])]
    joined.extend(b[index] for index in range(b.shape[0]))
    return np.array(joined)

In [23]:
# Load the device grids; the trailing shape notes are the original author's.
fresh, aged = generate_data('fresh_aged_ieice', 50, 2) # (50, 148, 33) (2, 148, 33)

# All devices in a single array, fresh ones first.
newdata = connect(fresh, aged) # (52, 148, 33)
# Collect every [row, column] position that is zero in the first fresh device.
check_FPGA = []
for i in range(148):
    for j in range(33):
        if fresh[0, i, j] == 0:
            check_FPGA.append([i,j])
# Copy of newdata with those positions forced to zero on every device.
freshaged = delete_920(newdata, check_FPGA) #(52, 148, 33)

# One flat row per device, for the raw and for the masked data.
flat_freshaged = change_flatten(newdata) # (52, 4884)
new_freshaged = change_flatten(freshaged) # (52, 4884)

# Same rows with the zero entries removed.
new_nonzero = delete_zero(flat_freshaged) # (52, roughly 3964 each)
nonzero_freshaged = delete_zero(new_freshaged) # (52, 3964)

In [24]:
def kmeans_plus_plus(data, cluster_num):
    """Pick initial centroids for 1-D k-means with k-means++ seeding.

    The first centroid is drawn uniformly from *data*. Every further centroid
    is drawn with probability proportional to the squared distance between a
    point and its nearest already-chosen centroid, so points that coincide
    with a chosen centroid are never picked again.

    Args:
        data: 1-D sequence of numbers.
        cluster_num: number of centroids to return.

    Returns:
        Float array of length *cluster_num* whose values are taken from
        *data*. The result is deterministic: a private generator with a fixed
        seed is used, and the global ``np.random`` state is left untouched.
    """
    seeds = 50
    rng = np.random.RandomState(seeds)

    data = np.asarray(data, dtype=float).ravel()
    centers = np.zeros(cluster_num)

    centers[0] = rng.choice(data)
    # Squared distance from each point to its nearest centroid chosen so far.
    nearest_sq = (data - centers[0]) ** 2

    for k in range(1, cluster_num):
        total = nearest_sq.sum()
        if total > 0:
            centers[k] = rng.choice(data, p=nearest_sq / total)
        else:
            # Every point already sits on a centroid; the weights would be
            # 0/0, so fall back to a uniform draw.
            centers[k] = rng.choice(data)
        nearest_sq = np.minimum(nearest_sq, (data - centers[k]) ** 2)

    return centers


def wrap_k_means(data, k=3):
    """Cluster *data* into *k* groups: k-means++ seeding, then k-means.

    Returns the ``(assignments, centres)`` pair produced by ``k_means``.
    """
    initial_centers = kmeans_plus_plus(data, k)  # k-means++ starting points
    return k_means(data, initial_centers)


def k_means(data, cluster, max_iter=1000):
    """Run one-dimensional k-means starting from the given centroids.

    Args:
        data: 1-D sequence of numbers to cluster.
        cluster: initial centroid values. They are sorted ascending before
            the first round, so cluster indices follow that order.
        max_iter: safety cap on the number of assign/update rounds.

    Returns:
        ``(prof, cluster)``: ``prof[i]`` is the index (stored as a float) of
        the centroid nearest to ``data[i]``, the lowest index winning ties;
        ``cluster`` holds the final centroid values. A centroid that attracts
        no point keeps its value.

    Raises:
        ValueError: if *cluster* is empty.
    """
    data = np.array(data, dtype=float)
    cluster = np.sort(np.array(cluster, dtype=float))
    if cluster.size == 0:
        raise ValueError('k_means needs at least one initial centroid')

    prof = np.zeros(len(data))  # cluster index of every element of data
    for _ in range(max_iter):
        # Assignment: nearest centroid by squared distance.
        squared = (data[:, np.newaxis] - cluster[np.newaxis, :]) ** 2
        prof = squared.argmin(axis=1).astype(float)

        # Update: move each centroid to the mean of its members. Emptiness is
        # decided by the member count, not by the sum, so a cluster whose
        # values sum to zero is still updated.
        updated = cluster.copy()
        for j in range(cluster.size):
            members = data[prof == j]
            if members.size:
                updated[j] = members.mean()

        # Converged once no centroid moved. Comparing against the previous
        # round's centroids also terminates when a cluster is empty.
        if np.array_equal(updated, cluster):
            break
        cluster = updated

    return prof, cluster


def choose_center(i, x, tmp_list):
    """Return the members of the cluster nearest to cluster *x*.

    Args:
        i: object exposing ``frequency`` (values), ``cluster`` (per-value
            cluster index) and ``center`` (one entry per cluster).
        x: index of the cluster to find a neighbour for; it is excluded from
            the search.
        tmp_list: the values that belong to cluster *x*.

    Returns:
        1-D array with the values of the other cluster whose mean absolute
        difference to the values in *tmp_list* is smallest (the lowest
        cluster index wins ties). Clusters without members are skipped; if no
        other cluster has members, an empty array is returned.
    """
    frequency = np.asarray(i.frequency)
    labels = np.asarray(i.cluster)
    own = np.asarray(tmp_list)

    nearest_members = np.array([], dtype=float)
    nearest_distance = float('inf')
    for j in range(len(i.center)):
        if j == x:
            continue
        members = frequency[labels == j]
        if members.size == 0:
            # An empty cluster has no distance; it would otherwise yield NaN.
            continue
        # Mean over all (member, own) pairs. This equals averaging, over the
        # own values, the mean distance from that value to the members.
        distance = np.abs(members[:, np.newaxis] - own[np.newaxis, :]).mean()
        if distance < nearest_distance:
            nearest_distance = distance
            nearest_members = members

    # Clusters are kept as separate arrays throughout, because clusters of
    # different sizes cannot be stacked into one rectangular array.
    return nearest_members

In [25]:
def _mean_cluster_silhouette(fpga):
    """Return the mean over clusters of the per-cluster mean silhouette value.

    For each value the silhouette is ``(nci - oci) / max(nci, oci)``, where
    ``oci`` is the mean absolute difference to the other values of its own
    cluster and ``nci`` the mean absolute difference to the values of the
    cluster returned by ``choose_center``. A cluster with a single value
    scores 0. If any cluster (or its neighbour) has no values, NaN is
    returned so that this cluster count is not selected later.
    """
    frequency = np.asarray(fpga.frequency)
    labels = np.asarray(fpga.cluster)

    cluster_scores = []
    for j in range(len(fpga.center)):
        own = frequency[labels == j]
        if own.size == 0:
            return float('nan')
        nearest = np.asarray(choose_center(fpga, j, own))
        if nearest.size == 0:
            return float('nan')
        if own.size == 1:
            # No within-cluster distance exists; avoid dividing by zero.
            cluster_scores.append(0.0)
            continue

        coefficients = np.empty(own.size)
        for idx, value in enumerate(own):
            oci = np.abs(own - value).sum() / (own.size - 1)
            nci = np.abs(nearest - value).sum() / nearest.size
            scale = max(nci, oci)
            coefficients[idx] = (nci - oci) / scale if scale > 0 else 0.0
        cluster_scores.append(coefficients.mean())

    return float(np.mean(cluster_scores))


def main():
    """Compute silhouette scores for 2..8 clusters per device and pickle them.

    Loads the device grids from 'fresh_aged_ieice', flattens them, drops the
    zero entries, clusters every device with ``wrap_k_means`` for each
    cluster count from 2 to 8 and stores the resulting table (one list per
    cluster count, one score per device) in 'ACN_list_4.binaryfile'.
    """
    # Imported here because the file-level ``import pickle`` only appears
    # after the call to main().
    import pickle

    fresh, aged = generate_data('fresh_aged_ieice', 50, 2)
    freshaged = connect(fresh, aged)
    flat_freshaged = change_flatten(freshaged)
    nonzero_freshaged = delete_zero(flat_freshaged)

    every_cluster_list = []  # one inner list per cluster count
    for x in range(2, 9):
        x_cluster_list = []  # one score per device
        for index in range(nonzero_freshaged.shape[0]):
            frequency = nonzero_freshaged[index]
            prof, cluster = wrap_k_means(frequency, x)
            x_cluster_list.append(
                _mean_cluster_silhouette(FPGA(frequency, prof, cluster))
            )

        every_cluster_list.append(x_cluster_list)
        print(f'{x}回目')

    # Open the output only once the results exist, and close it even if
    # dumping fails, so a crash cannot leave a truncated file behind.
    with open('ACN_list_4.binaryfile', 'wb') as f:
        pickle.dump(every_cluster_list, f)

In [None]:
# Run the full experiment; this writes 'ACN_list_4.binaryfile', which the next
# cell reads back.
main()

In [32]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only read files that
# this notebook wrote itself.
with open('ACN_list_4.binaryfile', 'rb') as f:
    aged_acn_list = pickle.load(f)

# After the transpose each row is one device and each column one candidate
# cluster count (column 0 corresponds to 2 clusters).
aged_acn_list = np.array(aged_acn_list, dtype=float).T

index_list = np.arange(1, 53)
acn_list = []
average_list = []
for x in aged_acn_list:
    if np.all(np.isnan(x)):
        # No usable score for this device: keep 0 as the "none" marker.
        average_list.append(float('nan'))
        acn_list.append(0)
        continue
    # First maximum wins ties. Negative maxima are valid silhouette values,
    # so they are not replaced by 0.
    best = int(np.nanargmax(x))
    average_list.append(float(x[best]))
    acn_list.append(best + 2)

status_list = ['unused'] * 50 + ['aged'] * 2

print(acn_list)
print(len(acn_list))

# Object dtype keeps floats, ints and strings as they are instead of turning
# the whole table into strings.
test = np.array([average_list, acn_list, status_list], dtype=object).T
df = pd.DataFrame(
    {
        'Maximum Average Silhouette value': average_list,
        'Appropriate Cluster Number': acn_list,
        'Status': status_list,
    },
    index=index_list,
)

print(df)
        


EOFError: Ran out of input