###### 2021-01-26


# Very Deep CNN (resNet)

In [1]:
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import json
import time

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. 전처리 완료된 데이터 불러오기

In [4]:
X_train = pd.read_csv('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/data/X_train.csv', index_col=[0])
X_test = pd.read_csv('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/data/X_test.csv', index_col=[0])
y_train = pd.read_csv('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/data/y_train.csv', index_col=[0])
y_test = pd.read_csv('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/data/y_test.csv', index_col=[0])

word_index_json = open('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/data/word_index_vocab.json', 'r').read()
word_index_vocab = json.loads(word_index_json)

In [5]:
X_train_split = X_train['댓글'].apply(list).tolist()
X_test_split = X_test['댓글'].apply(list).tolist()

## 2. 단어 사전을 이용한 토큰화와 패딩화

In [6]:
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab

X_train_sequences = tokenizer.texts_to_sequences(X_train_split)
X_test_sequences = tokenizer.texts_to_sequences(X_test_split)

In [7]:
train = pad_sequences(X_train_sequences, padding='post', maxlen=400)
test = pad_sequences(X_test_sequences, padding='post', maxlen=400)

In [8]:
# 데이터가 추가되었는지 확인하고 진행하기 바람
# X_train.csv / X_test.csv / y_train.csv / y_test.csv 가 업데이트 되면 런타임을 재실행 한 후 불러와야 적용이됨
# 꼭 확인하고 진행하기를

y_train.value_counts()

악플여부
0       61416
1       30439
dtype: int64

In [9]:
train_over = train
y_train_over = y_train

##4. 1D_CNN

In [10]:
import tensorflow.keras.backend as K
# Mish 활성화 함수
def mish(x):
    return x * K.tanh(K.softplus(x))

In [11]:
# 음절 단어사전의 크기
vocab_size = len(word_index_vocab)
embedding_dim = 128
comment_len = 400

In [13]:
from tensorflow.keras.layers import Input, Embedding, Dense, SpatialDropout1D
from tensorflow.keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D, BatchNormalization
from tensorflow.keras.models import Model

def conv1_input_layer(input, vocab_size, embedding_dim, max_len):
    x = Embedding(vocab_size+1, embedding_dim, input_length=max_len)(input)
    x = SpatialDropout1D(0.5)(x)
    return x


def conv1_layer_v1(input_layer, filters, kernel_size, padding, activation, strides):
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(input_layer)
    x = BatchNormalization()(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(x)
    x = BatchNormalization()(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(x)
    x = BatchNormalization()(x)
    return x



def conv1_layer_v2(upper_layer, filters, kernel_size, padding, activation, strides):
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(upper_layer)
    x = BatchNormalization()(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(x)
    x = BatchNormalization()(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(x)
    x = BatchNormalization()(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(x)
    x = BatchNormalization()(x)

    temp         = upper_layer + x
    return_layer = MaxPool1D()(temp)
    return return_layer



def conv1_layer_v3(upper_layer, filters, kernel_size, padding, activation, strides):
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(upper_layer)
    x = BatchNormalization()(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(x)
    x = BatchNormalization()(x)
    x = Conv1D(filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, strides=strides)(x)
    x = BatchNormalization()(x)

    temp         = upper_layer + x
    return_layer = GlobalAveragePooling1D()(temp)
    return return_layer


def output_layer(upper_layer, activation, node_cnt):
    x = Dense(node_cnt, activation=activation)(upper_layer)
    x = Dense(node_cnt, activation=activation)(x)
    output = Dense(1, activation='sigmoid')(x)
    return output

In [15]:
node_size = 50
activation = mish



input = Input(shape=(comment_len, ))

input_layer     = conv1_input_layer(input, vocab_size, embedding_dim, comment_len)
hidden_layer_01 = conv1_layer_v1(input_layer, node_size, 3, 'same', activation, 1)
hidden_layer_02 = conv1_layer_v2(hidden_layer_01, node_size, 3, 'same', activation, 1)
hidden_layer_03 = conv1_layer_v2(hidden_layer_02, node_size, 3, 'same', activation, 1)
hidden_layer_04 = conv1_layer_v2(hidden_layer_03, node_size, 3, 'same', activation, 1)
hidden_layer_05 = conv1_layer_v2(hidden_layer_04, node_size, 3, 'same', activation, 1)
hidden_layer_06 = conv1_layer_v2(hidden_layer_05, node_size, 3, 'same', activation, 1)
hidden_layer_07 = conv1_layer_v2(hidden_layer_06, node_size, 3, 'same', activation, 1)
hidden_layer_08 = conv1_layer_v2(hidden_layer_07, node_size, 3, 'same', activation, 1)
hidden_layer_09 = conv1_layer_v3(hidden_layer_08, node_size, 3, 'same', activation, 1)
output          = output_layer(hidden_layer_09, 'relu', node_size)

CNN = Model(input, output)
CNN.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

In [16]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/VDCNN.h5', monitor='val_loss', mode='min', save_best_only=True, restore_best_weights=True)

In [17]:

history = CNN.fit(train_over, y_train_over, 
                    callbacks        = [es, mc],
                    epochs           = 50,
                    batch_size       = 32, 
                    validation_data  = (test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 00014: early stopping


In [18]:
# 저장한 CNN 모델을 불러오고 예측해보자

get_custom_objects().update({ 'mish' : mish })

loaded_model = load_model('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/VDCNN.h5')
y_pred = loaded_model.predict(test)

y_class = y_pred > 0.5


In [19]:
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, accuracy_score

print('accuracy      : ', accuracy_score(y_class, y_test))
print('recall        : ', recall_score(y_class, y_test))
print('precision     : ', precision_score(y_class, y_test))
print('f1_score      : ', f1_score(y_class, y_test))
print('roc_auc_score : ', roc_auc_score(y_test, y_pred))
print(confusion_matrix(y_class, y_test))

accuracy      :  0.9469169134297161
recall        :  0.9455922865013774
precision     :  0.8928339185849915
f1_score      :  0.9184560840189978
roc_auc_score :  0.982819063261032
[[14880   824]
 [  395  6865]]


In [None]:
########### 1. 기본적인 1D-CNN
  # 악플원정대 프로젝트때 사용했던 최종 모델의 성능입니다.
## Threshold=0.5 / SMOTE X / 띄어쓰기 O
accuracy      :  0.944521860303083
recall        :  0.9543844737214903
precision     :  0.876186760306932
f1_score      :  0.9136154054787089
roc_auc_score :  0.9768911568144515
[[14953   952]
 [  322  6737]]


############ 2. Very Deep CNN (resNet)
  # 오늘 새롭게 공부한 CNN모델의 성능입니다.
accuracy      :  0.9469169134297161
recall        :  0.9455922865013774
precision     :  0.8928339185849915
f1_score      :  0.9184560840189978
roc_auc_score :  0.982819063261032
[[14880   824]
 [  395  6865]]


# 확실히.. 더 좋긴하네 ㅎ

# 5. 악플탐지모델 테스트

In [20]:
def model_test(comment):
    loaded_model = load_model('/content/drive/MyDrive/악성댓글_필터링_봇_모델/04_최종모델 [1D-CNN]/VDCNN.h5')
    comment_list = [list(comment)]
    comment_label = tokenizer.texts_to_sequences(comment_list)

    comment_pad = pad_sequences(comment_label, padding='post', maxlen=400)

    pred = loaded_model.predict(comment_pad)

    if pred < 0.5 :
        result = '악플이 아닙니다'
    else :
        result = '악플입니다'
    
    print(result)
    print('라벨확률 : ', pred)
    
    return result, pred

In [26]:
comment = '''개시발롬'''

result, prob = model_test(comment)

악플입니다
라벨확률 :  [[0.86677617]]
