아래 코드를 참고해, 수정 사용하였습니다.

https://github.com/gmattl/Thesis_OpenMax

In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings(action='ignore')

# 전처리가 완료된 데이터

In [83]:
train = pd.read_csv('./train_done.csv', index_col=0)
test = pd.read_csv('./test_done.csv', index_col=0)
validation = pd.read_csv('./validation_done.csv')

In [4]:
train_text = list(train['full_log'])
train_level = np.array(train['level'])

test_text = list(test['full_log'])

valid_text = list(validation['full_log'])

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 200

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
print(len(tokenizer.word_index))

9171


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = tokenizer.texts_to_sequences(train_text)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print(X.shape)

(472550, 250)


In [7]:
Y = to_categorical(train_level)

# 학습된 모델 불러오기
- simpleRNN
- activation = 'linear'

In [8]:
import tensorflow_addons as tfa
F1_MACRO = tfa.metrics.F1Score(num_classes=7, average='macro')

In [9]:
from tensorflow.keras.optimizers import SGD, Adam

opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

def my_categorical_crossentropy(y_true, y_pred):
    return K.categorical_crossentropy(y_true, y_pred, from_logits=True)

In [10]:
from tensorflow.keras.models import load_model
import keras.losses

model = load_model('model_RNN_02.h5', compile=False)
model.compile(optimizer=opt, loss=my_categorical_crossentropy, metrics=[F1_MACRO])

# 필요한 함수 정의

In [11]:
def softmax(vector):
    e = np.exp(vector)
    return e / e.sum()

# foo = [1, 3, 2]
# foo_result = softmax(foo)
# foo_result

In [45]:
def my_predict(X):
    result = model.predict(X)
    
    # Activation vector
    act = result.copy()
    
    # predicted label
    for n in range(X.shape[0]):
        result[n] = softmax(result[n])
    pred = [np.argmax(i) for i in result]
    
    return act, pred

In [28]:
def data_setting(act, pred, true_labels):
    '''
    올바르게 분류된 train 데이터만 활용합니다.
    act, pred: my_predict 함수의 리턴값
    true_labels : to_categorical(Y)
    '''
    true_labels = [np.argmax(i) for i in Y]

    idx = 0
    mismatch = []

    for i in pred:
        if i != true_labels[idx]:
            mismatch.append(idx)
        idx = idx + 1
        
    # 클래스가 잘못 분류된 인덱스를 삭제
    for wrong_idx in mismatch:
        del pred[wrong_idx]
        del true_labels[wrong_idx]
        
    act = np.delete(act, mismatch, axis=0)
    
    return act, pred, true_labels

In [36]:
import libmr
import scipy.spatial.distance as spd

# cython 설치 후 limbr 설치


def compute_mav_distances(activations, predictions, true_labels):
    """
    각 클래스별 MAV(mean_activations)와 MAV까지의 유클리디안-코사인 거리(eucos_dist)를 구합니다.
    :activations: 각 데이터의 logit
    :predictions: 각 데이터의 예측 클래스
    :true_labels: 각 데이터의 실제 클래스
    """
    
    mean_activations = list()
    eucos_dist = np.zeros(activations.shape[0])

    for cl in range(7): # 클래스 개수
        print('위험도 등급', cl)

        # 현재 클래스에 해당하는 인덱스
        cl_index = [] 
        for i in range(0, len(predictions)): 
            if predictions[i] == cl: 
                cl_index.append(i)

        # 현재 클래스의 인덱스에 해당하는 AV    
        cl_activations = []
        for idx in cl_index:
            act = activations[idx]
            cl_activations.append(act)
        print('올바르게 분류된 데이터 개수', len(cl_index))

        # 현재 클래스의 AV 평균(MAV)
        mean_act = []
        for c in range(7):
            act_sum = 0
            for i in range(len(cl_activations)):
                act_sum += cl_activations[i][c]
            m = act_sum/len(cl_activations)
            mean_act.append(m)
        mean_activations.append(mean_act)

        # MAV까지의 거리
        for col in cl_index:
            eucos_dist[col] = spd.euclidean(mean_act, activations[col])/200. + spd.cosine(mean_act, activations[col])
    
    return mean_activations, eucos_dist

In [38]:
def weibull_tailfitting(eucos_dist, mean_activations, taillength=20):
    """
    각 클래스별 weibull 모델을 생성합니다.
    :eucos_dist: MAV와 데이터 간 거리
    :mean_activations: 클래스별 MAV
    :taillength: weibull 모델 생성시 활용할 상위 거리값 개수
    """

    weibull_model = {}
    
    for cl in range(7):

        # 현재 클래스에 해당하는 인덱스
        label_indexes = [] 
        for i in range(0, len(predictions)): 
            if predictions[i] == cl: 
                label_indexes.append(i)

        eucos_dist_list = []
        for idx in label_indexes:
            eucos = eucos_dist[idx]
            eucos_dist_list.append(eucos)
        
        # weibull_model : 클래스별 eucos_distances, mean_vec, weibull_model
        weibull_model[cl] = {}
        weibull_model[cl]['eucos_distances'] = eucos_dist_list
        weibull_model[cl]['mean_vec'] = mean_activations[cl]
        weibull_model[cl]['weibull_model'] = []

        mr = libmr.MR(verbose=True)
        tailtofit = sorted(weibull_model[cl]['eucos_distances'])[-taillength:]
        mr.fit_high(tailtofit, len(tailtofit))
        weibull_model[cl]['weibull_model'] = mr
    
    return weibull_model

In [47]:
def compute_open_max_probability(openmax_known_score, openmax_unknown_score):
    """
    OpenMax 확률을 구합니다
    :openmax_known_score: 기존 클래스에 대한 weibull 값
    :openmax_unknown_score: unknown 클래스에 대한 weibull 값
    :return: 정규화(softmax)가 완료된 최종 OpenMax 확률
    """

    prob_closed, prob_open, scores = [], [], []

    # 소프트맥스
    for category in range(7):
        scores += [np.exp(openmax_known_score[category])]
    total_denominator = np.sum(np.exp(openmax_known_score)) + np.exp(openmax_unknown_score)

    prob_closed = np.array([scores / total_denominator])
    prob_open = np.array([np.exp(openmax_unknown_score) / total_denominator])

    probs = np.append(prob_closed.tolist(), prob_open)

    assert len(probs) == 8
    return probs

# OpenMax : 준비

In [20]:
act, pred = my_predict(X)

In [29]:
activations, predictions, true_labels = data_setting(act, pred, Y)

In [37]:
mean_activations, eucos_dist = compute_mav_distances(activations, predictions, true_labels)

위험도 등급 0
올바르게 분류된 데이터 개수 333997
위험도 등급 1
올바르게 분류된 데이터 개수 132126
위험도 등급 2
올바르게 분류된 데이터 개수 11
위험도 등급 3
올바르게 분류된 데이터 개수 4128
위험도 등급 4
올바르게 분류된 데이터 개수 10
위험도 등급 5
올바르게 분류된 데이터 개수 2160
위험도 등급 6
올바르게 분류된 데이터 개수 5


In [39]:
weibull_model = weibull_tailfitting(eucos_dist, mean_activations)

# OpenMax : 실행

In [41]:
test_X = tokenizer.texts_to_sequences(test_text)
test_X = pad_sequences(test_X, maxlen=MAX_SEQUENCE_LENGTH)

In [42]:
valid_X = tokenizer.texts_to_sequences(valid_text)
valid_X = pad_sequences(valid_X, maxlen=MAX_SEQUENCE_LENGTH)

In [123]:
act_test, pred_test = my_predict(test_X)
act_valid, pred_valid = my_predict(valid_X)

In [124]:
open_probs = []


for i in range(len(act_test)):
    activation = act_test[i] # 입력 데이터를 하나씩 처리합니다

    alpharank = 1
    num_labels = 7

    # 클래스별 가중치 부여 
    ranked_list = np.argsort(activation)
    ranked_list = ranked_list[::-1]
    alpha_weights = [((alpharank + 1) - i) / float(alpharank) for i in range(1, alpharank + 1)]
    ranked_alpha = np.zeros(num_labels)

    for i in range(0, len(alpha_weights)):
        ranked_alpha[ranked_list[i]] = alpha_weights[i]

    # OpenMax 확률 계산
    openmax_penultimate, openmax_penultimate_unknown = [], []

    for categoryid in range(num_labels):
        label_weibull = weibull_model[categoryid]['weibull_model']  # 클래스별 weibull 모델을 불러옵니다
        label_mav = weibull_model[categoryid]['mean_vec']    # 클래스별 MAV를 불러옵니다
        text_dist = spd.euclidean(label_mav, activation[categoryid])/200. + spd.cosine(label_mav, activation[categoryid])

        weibull_score = label_weibull.w_score(text_dist)

        # activation layer 업데이트
        modified_layer_act = activation[categoryid] * (1 - weibull_score * ranked_alpha[categoryid])
        openmax_penultimate += [modified_layer_act]  
        openmax_penultimate_unknown += [activation[categoryid] - modified_layer_act] 
        
    # 업데이트 된 activation layer로 softmax
    openmax_closedset_logit = np.asarray(openmax_penultimate)
    openmax_openset_logit = np.sum(openmax_penultimate_unknown)

    openmax_probab = compute_open_max_probability(openmax_closedset_logit, openmax_openset_logit)
    openmax_probab

    open_probs.append(list(openmax_probab))


In [132]:
final = [np.argmax(i) for i in open_probs]

In [133]:
final_results = np.reshape(final, (test.shape[0],))

In [134]:
submission = pd.DataFrame()

In [135]:
submission['openmax'] = final_results
submission.value_counts()

level
0        1003349
7         395766
3          12974
5           6510
1            283
2             34
dtype: int64

# compare: threshold

In [96]:
for p in range(len(act_test)):
    act_test[p] = softmax(act_test[p])
act_test

array([[9.9990696e-01, 2.3572773e-05, 1.3501169e-05, ..., 1.5104749e-05,
        1.8109453e-05, 1.6713690e-05],
       [9.9999440e-01, 1.7117198e-06, 7.1506111e-07, ..., 9.8989847e-07,
        1.3949143e-06, 4.5457637e-07],
       [3.6310050e-06, 9.9998266e-01, 2.0094726e-06, ..., 2.8261841e-06,
        3.6840543e-06, 2.8195018e-06],
       ...,
       [6.7222541e-06, 9.9997789e-01, 1.9162346e-06, ..., 2.8178063e-06,
        5.5395553e-06, 3.1374154e-06],
       [9.9999535e-01, 1.1570390e-06, 6.8094238e-07, ..., 9.4036602e-07,
        1.1755834e-06, 3.7139927e-07],
       [9.9999440e-01, 1.7117181e-06, 7.1506025e-07, ..., 9.8989767e-07,
        1.3949129e-06, 4.5457594e-07]], dtype=float32)

In [97]:
test_result = [np.argmax(i) for i in act_test]
test_results = np.reshape(test_result, (test.shape[0], ))
test_results[np.where(np.max(act_test, axis=1) < 0.90)] = 7

In [98]:
submission['threshold'] = test_results

In [99]:
submission.threshold_level.value_counts()

0    1001103
1     395656
3      12869
5       6303
7       2985
Name: threshold_level, dtype: int64

In [100]:
submission

Unnamed: 0_level_0,level,threshold_level
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000000,0,0
1000001,0,0
1000002,7,1
1000003,0,0
1000004,7,1
...,...,...
2418911,0,0
2418912,0,0
2418913,7,1
2418914,0,0
