In [65]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the imported
# libraries — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings(action='ignore')

# 전처리된 데이터 읽어오기

In [66]:
# Load the preprocessed train / test / validation splits and the submission template.
# NOTE(review): `validation` is the only frame read without index_col=0 — confirm
# that its CSV really has no leading index column.
train = pd.read_csv('./data/cleaned_train_special.csv', index_col=0)
test = pd.read_csv('./data/cleaned_test_special.csv', index_col=0)
validation = pd.read_csv('./data/cleaned_validation_special.csv')
submission = pd.read_csv('./data/sample_submission.csv', index_col=0)

In [67]:
# Remove the outlier row found in risk level 2 (index label 365348).
# errors='ignore' keeps this cell idempotent: re-running it after the row has
# already been dropped no longer raises KeyError.
train = train.drop(index=365348, errors='ignore')

In [68]:
# Pull the raw log text of every split into plain Python lists (tokenizer input)
# and the training `level` column into a NumPy array (classification target).
train_text = train['full_log'].tolist()
train_level = train['level'].to_numpy(copy=True)

test_text = test['full_log'].tolist()

valid_text = validation['full_log'].tolist()

# Tokenizer

In [69]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Every encoded log is padded/truncated to this many tokens (used by pad_sequences).
MAX_SEQUENCE_LENGTH = 250
# Size of each word vector produced by the Embedding layer.
EMBEDDING_DIM = 200

# Build the word -> index vocabulary from the training text only, then report its size.
# NOTE(review): no oov_token is configured, so words that never occur in the
# training text are presumably dropped when test/validation logs are encoded.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
print(len(tokenizer.word_index))

8945


In [70]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode each training log as a sequence of word indices and bring every
# sequence to MAX_SEQUENCE_LENGTH in one step.
X = pad_sequences(tokenizer.texts_to_sequences(train_text), maxlen=MAX_SEQUENCE_LENGTH)
print(X.shape)

(472971, 250)


In [71]:
# Target vector: the training labels extracted from the `level` column.
Y = train_level

# 모델 학습

In [72]:
# Hold out 30% of the samples for evaluation; the split is stratified on the
# label and seeded so it is reproducible.
X_train, X_eval, Y_train, Y_eval = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=train_level,
)

In [73]:
# One-hot encode the labels for the categorical cross-entropy loss.
# num_classes is pinned to 7 (the width of the softmax layer and of the F1 metric)
# so both splits always get 7 columns, even if the highest label were ever
# missing from one of them.
# NOTE: this cell is not idempotent — running it twice one-hot encodes the
# already encoded arrays. Re-run the split cell first.
Y_train = to_categorical(Y_train, num_classes=7)
Y_eval = to_categorical(Y_eval, num_classes=7)

In [74]:
# Sanity-check the shapes of the four arrays produced by the split.
for split in (X_train, Y_train, X_eval, Y_eval):
    print(split.shape)

(331079, 250)
(331079, 7)
(141892, 250)
(141892, 7)


In [75]:
import tensorflow_addons as tfa
# Macro-averaged F1 over the 7 classes; passed to model.compile as the tracked metric.
F1_MACRO = tfa.metrics.F1Score(num_classes=7, average='macro')

- 훈련 데이터의 label(target)이 one-hot vector 이면 CategoricalCrossentropy
- 훈련 데이터의 label(target)이 정수이면 SparseCategoricalCrossentropy


- Embedding()의 입력으로 각 단어는 이미 정수 인코딩이 된 상태여야 한다. Embedding()은 정수 인코딩이 된 상태의 입력을 받아서 임베딩을 수행한다.
- ```Embedding(7, 2, input_length=5)```
- (vocab size, embedding dimension, sequence length)

Lr Scheduler는 미리 학습 일정을 정해두고, 그 일정에 따라 학습률을 조정하는 방법입니다. 일반적으로는 warmup이라는 파라미터를 정하고 현재 step이 warmup보다 낮을 경우는 learning rate를 linear하게 증가 시키고, warmup 후에는 각 Lr Scheduler에서 정한 방법대로 learning rate를 update합니다.
https://ai4nlp.tistory.com/8

In [76]:
from tensorflow.keras.optimizers import SGD, Adam

# Adam with a learning rate of 1e-4.
# Alternative kept for reference: tf.keras.optimizers.SGD(lr=1e-4, momentum=0.9)
opt = Adam(learning_rate=1e-4)

In [77]:
# Import from tensorflow.keras (not the standalone `keras` package) so the
# callbacks come from the same Keras implementation as the model and optimizer.
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, ModelCheckpoint

# Callbacks — all three monitor the validation macro-F1, which must be maximised.

# Write the model to disk every time val_f1_score reaches a new best.
cb_checkpoint = ModelCheckpoint('model05_RNN_relu.h5', monitor='val_f1_score', mode='max', verbose=1, save_best_only=True)
# Stop training after 10 epochs without a new best val_f1_score.
cb_early_stopping = EarlyStopping(monitor='val_f1_score', mode='max', patience=10)
# Multiply the learning rate by 0.1 after 3 epochs without improvement.
# mode='max' is required here: the default mode='auto' only selects "max" for
# monitor names containing 'acc', so 'val_f1_score' would be treated as a
# quantity to minimise and the LR would be cut while F1 is still rising.
cb_reduceLR = ReduceLROnPlateau(monitor='val_f1_score', mode='max', factor=0.1, patience=3)
# Alternative learning-rate sweep kept for reference:
# cb_lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-8 * 10**(epoch / 20))

In [79]:
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# One extra slot on top of the fitted vocabulary for the padding index.
vocab_size = 1 + len(tokenizer.word_index)

# Embedding -> SimpleRNN(32) -> 7-way softmax classifier.
model = Sequential([
    Embedding(vocab_size, EMBEDDING_DIM, input_length=X.shape[1]),
    SimpleRNN(32),
    Dense(7, activation='softmax'),
])

# One-hot targets, hence categorical cross-entropy; macro-F1 is the tracked metric.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=[F1_MACRO])

# validation_split carves 20% of X_train off for the per-epoch validation
# scores that the callbacks monitor.
history = model.fit(
    X_train,
    Y_train,
    epochs=30,
    batch_size=128,
    validation_split=0.2,
    callbacks=[cb_early_stopping, cb_checkpoint, cb_reduceLR],
)

Epoch 1/30

Epoch 00001: val_f1_score did not improve from 0.18840
Epoch 2/30

Epoch 00002: val_f1_score did not improve from 0.18840
Epoch 3/30

Epoch 00003: val_f1_score improved from 0.18840 to 0.20462, saving model to model05_RNN_relu.h5
Epoch 4/30

Epoch 00004: val_f1_score improved from 0.20462 to 0.23639, saving model to model05_RNN_relu.h5
Epoch 5/30

Epoch 00005: val_f1_score improved from 0.23639 to 0.23657, saving model to model05_RNN_relu.h5
Epoch 6/30

Epoch 00006: val_f1_score improved from 0.23657 to 0.23934, saving model to model05_RNN_relu.h5
Epoch 7/30

Epoch 00007: val_f1_score improved from 0.23934 to 0.23947, saving model to model05_RNN_relu.h5
Epoch 8/30

Epoch 00008: val_f1_score improved from 0.23947 to 0.23954, saving model to model05_RNN_relu.h5
Epoch 9/30

Epoch 00009: val_f1_score improved from 0.23954 to 0.23960, saving model to model05_RNN_relu.h5
Epoch 10/30

Epoch 00010: val_f1_score improved from 0.23960 to 0.23972, saving model to model05_RNN_relu.h5
E

KeyboardInterrupt: 

In [None]:
# Training vs. validation macro-F1 per epoch, drawn through the explicit Axes API.
fig, ax = plt.subplots()
ax.set_title('F1 SCORE')
ax.plot(history.history['f1_score'], label='training')
ax.plot(history.history['val_f1_score'], label='evaluation')
ax.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch.
# The title used to read 'val loss' although both curves are drawn; it now
# names the quantity only, matching the 'F1 SCORE' figure.
plt.title('LOSS')
plt.plot(history.history['loss'], label='training')
plt.plot(history.history['val_loss'], label='evaluation')
plt.legend()
plt.show()

In [None]:
# Reload the best checkpoint written by the ModelCheckpoint callback.
# The callback saves to 'model05_RNN_relu.h5'; the name used here before
# ('model05_RNN.h5') is not written by any visible cell of this notebook, so a
# fresh run would fail or silently pick up a stale model.
model = load_model("model05_RNN_relu.h5")

In [None]:
# evaluate() returns [loss, metric, ...]. The model built in this notebook is
# compiled with macro-F1 as its only metric, so index 1 is F1 — the label used
# to say "Accuracy", which misreported the number.
accr = model.evaluate(X_eval, Y_eval)
print('Test set\n  Loss: {:0.3f}\n  F1 (macro): {:0.3f}'.format(accr[0], accr[1]))

전처리 방식에 따라 ... (모델은 SimpleRNN)
- 현주님 전처리 : 0.891
- 현주님 전처리 & 내 전처리 
    + 특수문자 있음 : 0.939 (아까보다 조금 떨어졌다?)
    + 특수문자 없음 : 0.937

현주님 전처리 + 내 전처리
- 기본 RNN
- 기본 RNN + decay learning rate : 너무 미미하게 바뀐다
- 기본 RNN + ReduceLROnPlateau : **(patience 4: score 0.944)**
- 기본 RNN + ReduceLROnPlateau + "Attention Is All You Need" lr scheduler : 자꾸 오류 나서 보류
- 기본 RNN + ReduceLROnPlateau + Exponential Lr Scheduler : 충돌 발생(float() argument must be a string or a number, not 'ExponentialDecay')
- word2vec + RNN 
- CNN

- 기본 RNN + smaller embedding dim(100) : 0.946 (근데 성능은 그 전이 나은 것 같기도. test/validation 예측 결과가 심상치 않음)
- 기본 RNN + smaller embedding dim(100) + SGD : f1 스코어가 엄청 작게 시작하고 (0.07) 너무 조금씩 올라간다
- 기본 RNN + smaller embedding dim(100) + adam(0.0001) : 한 여덟번째 epoch부터 과적합 낌새가 나타남 -> score 0.946, 예측 완전 별로임

- 기본 RNN + Dense + adam(0.0001) + ReduceLROnPlateau

# 예측

In [None]:
# Encode the test logs with the tokenizer fitted on the training text, then
# pad them to the same length the model was trained on.
seq = tokenizer.texts_to_sequences(test_text)
padded = pad_sequences(sequences=seq, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Model output for every padded test sequence.
# Presumably one row of 7 class probabilities per sample (softmax head) — confirm
# for the checkpoint that was actually loaded.
pred = model.predict(padded)

In [None]:
# Index of the highest-scoring class for each test sample, as a Python list.
temp = list(map(np.argmax, pred))

In [None]:
# Turn the list of predicted classes into a 1-D array with one entry per test row.
results = np.asarray(temp).reshape(test.shape[0])

In [None]:
# Samples whose best class probability is below 0.9 are relabelled as 7 —
# presumably the "unknown" level, since the model itself only outputs 0-6.
low_confidence = pred.max(axis=1) < 0.9
results[low_confidence] = 7

In [None]:
# Threshold 0.90
# Store the thresholded predictions in the submission frame and show how often
# each distinct row value occurs.
submission['level']=results
submission.value_counts()

In [None]:
# submission.to_csv('./submission/submission_model03_simpleRNN.csv')

# 위험도 7 검증

In [None]:
# Encode the validation logs with the training tokenizer and pad them to the
# length the model expects.
valid_seq = tokenizer.texts_to_sequences(valid_text)
valid_padded = pad_sequences(sequences=valid_seq, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Model output for the padded validation sequences (same model as for the test set).
valid_pred = model.predict(valid_padded)

In [None]:
# Display the raw model output for the validation samples.
valid_pred

In [None]:
# Index of the highest-scoring class for each validation sample, as a Python list.
valid_temp = list(map(np.argmax, valid_pred))

In [None]:
# Flatten the predicted classes into a 1-D array.
# -1 lets NumPy infer the length, so this no longer breaks when the validation
# file has a row count other than the previously hard-coded 3.
valid_results = np.reshape(valid_temp, (-1,))

In [None]:
# Validation samples whose best class probability is below 0.9 are relabelled as 7.
valid_low_confidence = valid_pred.max(axis=1) < 0.9
valid_results[valid_low_confidence] = 7

In [None]:
# Display the final validation labels after thresholding.
valid_results

In [None]:
# Display the model output for the first validation sample.
valid_pred[0,]

In [None]:
from tensorflow.keras.models import Model

# Sub-model that shares the trained model's input but stops at its
# second-to-last layer, then runs the padded test sequences through it.
# NOTE(review): for the architecture built in this notebook
# (Embedding -> SimpleRNN -> Dense), layers[-2] is the SimpleRNN layer, so
# `scores` would hold the RNN output vector rather than pre-softmax logits —
# confirm this is what is wanted.
new_model = Model(inputs=model.input, outputs=model.layers[-2].output)
scores = new_model.predict(padded)
scores

In [None]:
# Display the intermediate-layer output for the first test sample.
scores[0]