###### 2020-11-23 월요일

# 02_Bidirect LSTM
 - 구글 collab으로 실행하였습니다

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import pandas as pd
import numpy as np
import re
import json
import time

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

## 1. 데이터불러오기 & 합치기

In [5]:
X_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_train.csv', index_col=[0])
X_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_test.csv', index_col=[0])
y_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_train.csv', index_col=[0])
y_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_test.csv', index_col=[0])

word_index_json = open('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/word_index_vocab.json', 'r').read()
word_index_vocab = json.loads(word_index_json)

## 2. 데이터토큰화 & 패딩

#### 텍스트 음절단위로 자르기

In [6]:
X_train_split = X_train['댓글'].apply(list).tolist()
X_test_split = X_test['댓글'].apply(list).tolist()

#### 정수인덱싱

In [7]:
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab

X_train_sequences = tokenizer.texts_to_sequences(X_train_split)
X_test_sequences = tokenizer.texts_to_sequences(X_test_split)

### 패딩

In [8]:
train = pad_sequences(X_train_sequences, padding='post', maxlen=400)
test = pad_sequences(X_test_sequences, padding='post', maxlen=400)

## 3. SMOTE를 이용한 라벨 불균형 해소

In [9]:
y_train.value_counts()

악플여부
0       55644
1       23944
dtype: int64

 - 라벨 0은 54741, 1은 21807으로 균형이 잡혀있지않아 recall(재현율)이 떨어질 가능성이 높다
 - 그래서 SMOTE를 이용하여 라벨이 1인 경우를 복제하여 라벨학습의 균형을 맞추어 주는 것이다
 - 이것을 `오버샘플링`이라 한다.

In [10]:
from imblearn.over_sampling import SMOTE

In [11]:
start = time.time()

smote = SMOTE(random_state=123)
train_over, y_train_over = smote.fit_sample(train, y_train)

print("걸린시간 :", time.time() - start)

걸린시간 : 239.83352994918823


In [12]:
pd.Series(y_train_over).value_counts()

1    55644
0    55644
dtype: int64

In [None]:
# train_over = train
# y_train_over = y_train.values

## 4. Bidirect LSTM 구현

In [13]:
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import f1_score

In [14]:
# 음절 단어사전의 크기
vocab_size = len(word_index_vocab)
embedding_dim = 32
comment_len = 400

In [19]:
model = Sequential()
model.add(Embedding(vocab_size+1, embedding_dim, input_length=comment_len))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Bidirectional(LSTM(32, return_sequences=False)))

model.add(Dropout(0.2))
model.add(Dense(32, 'relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

In [20]:
# 성능의 변화가 없을때 멈추는 기능
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

# 지금까지 가장 좋은 성능이 나왔을때, 노드의 가중치를 저장하는 함수
mc = ModelCheckpoint('LSTM_best.h5', monitor= 'val_loss', mode='min', save_best_only=True)

In [21]:
y_train_over = y_train_over.reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

In [22]:
history = model.fit(train_over, y_train_over, 
                    callbacks        = [es, mc],
                    epochs           = 20,
                    batch_size       = 32, 
                    validation_data = (test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 00004: early stopping


## 5. Test Set 예측

In [None]:
# 저장한 LSTM 모델을 불러오고 예측해보자

loaded_model = load_model('LSTM_best.h5')
y_pred = loaded_model.predict(test)

y_class = y_pred > 0.5
confusion_matrix(y_class, y_test)

array([[12836,  1182],
       [  941,  4755]])

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score

print('recall : ', recall_score(y_class, y_test))
print('precision : ', precision_score(y_class, y_test))
print('f1_score : ', f1_score(y_class, y_test))
print('roc_auc_score : ', roc_auc_score(y_test, y_pred))

recall :  0.8347963483146067
precision :  0.8009095502779181
f1_score :  0.817501934152841
roc_auc_score :  0.938645487521959


## 5. 모델 테스트

In [None]:
def model_test(comment):
    comment_list = [list(comment)]
    comment_label = tokenizer.texts_to_sequences(comment_list)

    comment_pad = pad_sequences(comment_label, padding='post', maxlen=400)

    pred = loaded_model.predict(comment_pad)

    if pred < 0.5 :
        result = '악플이 아닙니다'
    else :
        result = '악플입니다'
    
    print(result)
    print('라벨확률 : ', pred)
    
    return result, pred

In [None]:
comment = ''''''

result, prob = model_test(comment)

악플입니다
라벨확률 :  [[0.9998801]]
