###### 2020-11-24 화요일

# 02_Bidirect LSTM + K-fold
 - 구글 collab으로 실행하였습니다

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
import pandas as pd
import numpy as np
import re
import json

from tqdm import tqdm_notebook
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold, train_test_split

## 1. 데이터불러오기 & 합치기

In [4]:
feature = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/03_K-Fold_O_[label3개]/data/feature.csv', index_col=[0])
target = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/03_K-Fold_O_[label3개]/data/target.csv', index_col=[0])

word_index_json = open('/content/drive/MyDrive/[final_project]_악플원정대/03_K-Fold_O_[label3개]/data/word_index_vocab.json', 'r').read()
word_index_vocab = json.loads(word_index_json)

## 2. 데이터토큰화 & 패딩

#### 텍스트 음절단위로 자르기

In [5]:
feature_split = feature['댓글'].apply(list).tolist()

#### 정수인덱싱

In [6]:
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab

feature_sequences = tokenizer.texts_to_sequences(feature_split)

### 패딩

In [7]:
feature_input = pad_sequences(feature_sequences, padding='post', maxlen=400)

## 3. SMOTE를 이용한 라벨 불균형 해소

In [8]:
target.value_counts()

악플여부
0       67435
1       22505
dtype: int64

 - 라벨 0은 446162, 1은 2897, 2는 4993으로 균형이 잡혀있지않아 recall(재현율)이 떨어질 가능성이 높다
 - 그래서 SMOTE를 이용하여 라벨이 1과 2인 경우를 복제하여 라벨학습의 균형을 맞추어 주는 것이다
 - 이것을 `오버샘플링`이라 한다.

In [9]:
from imblearn.over_sampling import SMOTE

In [10]:
smote = SMOTE(random_state=123)
feature_over, target_over = smote.fit_sample(feature_input, target)

In [11]:
pd.Series(target_over).value_counts()

1    67435
0    67435
dtype: int64

In [12]:
target_over = target_over.reshape(-1, 1)

## 4. Bidirect LSTM + K-Fold 구현

In [13]:
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import f1_score

In [14]:
vocab_size = len(word_index_vocab)
embedding_dim = 32
max_length = 400
n_class = 2

In [15]:
# LSTM 모델 생성

def get_model():

    from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

    model = Sequential()
    model.add(Embedding(vocab_size + 1, embedding_dim, input_length=max_length))
    model.add(Bidirectional(LSTM(32, return_sequences=True)))
    model.add(Bidirectional(LSTM(16, return_sequences=False)))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

    return model

In [16]:
# 3-Fold 시행
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=3, random_state=123, shuffle=True)

In [17]:
validation_pred = np.zeros((feature_over.shape[0], 2))


k = 0
for train_idx, val_idx in tqdm_notebook(cv.split(feature_over, target_over)) :

    print('{}번째 Fold 시작' .format(k+1))

    x_train = feature_over[train_idx]
    y_train = target_over[train_idx]

    x_validation = feature_over[val_idx]
    y_validation = target_over[val_idx]

    LSTM = get_model()
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

    LSTM.fit(x_train, y_train,
             callbacks          = [es],
             epochs             = 20,
             batch_size         = 64,
             validation_data    = (x_validation, y_validation))
    
    validation_pred[val_idx, : ] = LSTM.predict(x_validation) 
    print('')

    k += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

1번째 Fold 시작
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 00013: early stopping

2번째 Fold 시작
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 00014: early stopping

3번째 Fold 시작
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 00014: early stopping




In [18]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score,  confusion_matrix

## 5. 모델 평가

In [19]:
def model_evaluate(y_pred, y_test):
    print('accuracy_score : ', accuracy_score(y_pred, y_test))
    print('recall         : ', recall_score(y_pred, y_test))
    print('precision      :', precision_score(y_pred, y_test))
    print('Confusion Matrix : \n', confusion_matrix(y_pred, y_test))

In [22]:
pred = np.argmax(validation_pred, axis=1)

In [23]:
model_evaluate(pred, target_over)

accuracy_score :  0.9020686587083858
recall         :  0.9101208573459787
precision      : 0.8922517980277304
Confusion Matrix : 
 [[61493  7266]
 [ 5942 60169]]


## 6. 최종모델 학습 및 테스트

In [24]:
final_model = get_model()

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
ms = ModelCheckpoint('LSTM_best.h5', monitor='val_loss', mode='min', save_best_only=True)


final_model.fit(feature_over, target_over, 
                callbacks                 = [es, ms],
                epochs                    = 20,
                batch_size                = 64,
                validation_split          = 0.3)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 00008: early stopping


<tensorflow.python.keras.callbacks.History at 0x7faba4d1a710>

In [26]:
loaded_model = load_model('LSTM_best.h5')

In [31]:
def model_test(comment, threshold) :
    comment_list = [list(comment)]
    comment_label = tokenizer.texts_to_sequences(comment_list)
    comment_pad = pad_sequences(comment_label, padding='post', maxlen=400)
    pred = loaded_model.predict(comment_pad)

    y_class = np.argmax(pred, axis=1)

    if y_class == 0 :
        result = '악플이 아닙니다'
    else :
        result = '악플 입니다'

    print(result)
    print('라벨확률 :', pred)

    return pred

In [41]:
comment = ''''''
threshold = 0.5

prob = model_test(comment, threshold)

악플이 아닙니다
라벨확률 : [[0.5645445 0.4354555]]
