## 1. 데이터처리

In [1]:
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import json
import time

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

In [5]:
X_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_train.csv', index_col=[0])
X_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_test.csv', index_col=[0])
y_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_train.csv', index_col=[0])
y_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_test.csv', index_col=[0])

word_index_json = open('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/word_index_vocab.json', 'r').read()
word_index_vocab = json.loads(word_index_json)

In [6]:
X_train_split = X_train['댓글'].apply(list).tolist()
X_test_split = X_test['댓글'].apply(list).tolist()

In [7]:
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab

X_train_sequences = tokenizer.texts_to_sequences(X_train_split)
X_test_sequences = tokenizer.texts_to_sequences(X_test_split)

In [8]:
train = pad_sequences(X_train_sequences, padding='post', maxlen=400)
test = pad_sequences(X_test_sequences, padding='post', maxlen=400)

In [9]:
y_train.value_counts()

악플여부
0       55965
1       32454
dtype: int64

In [10]:
from imblearn.over_sampling import SMOTE

In [11]:
start = time.time()

smote = SMOTE(random_state=123)
train_over, y_train_over = smote.fit_sample(train, y_train)

print("걸린시간 :", time.time() - start)

걸린시간 : 261.47045040130615


## 2. lightGBM

In [None]:
# train_over = train
# y_train_over = y_train

In [21]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

In [26]:
NB = BernoulliNB()
NB.fit(train_over, y_train_over)
y_class = NB.predict(test)
y_prob = NB.predict_proba(test)[:,1]


In [27]:
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

print('recall : ', recall_score(y_class, y_test))
print('precision : ', precision_score(y_class, y_test))
print('f1_score : ', f1_score(y_class, y_test))
print('roc_auc_score : ', roc_auc_score(y_test.values.reshape(1, -1)[0], y_prob))
print(confusion_matrix(y_class, y_test))

recall :  0.4259316291961811
precision :  0.16996436032935971
f1_score :  0.24297259311314126
roc_auc_score :  0.5370084702462878
[[12104  6754]
 [ 1864  1383]]


# 3. 모델저장

In [29]:
import pickle
from sklearn.externals import joblib

In [30]:
joblib.dump(NB, 'Bernoulli_naive_bayes.pkl')

['Bernoulli_naive_bayes.pkl']

In [44]:
def model_test(comment):
    loaded_model = joblib.load('Bernoulli_naive_bayes.pkl')
    comment_list = [list(comment)]
    comment_label = tokenizer.texts_to_sequences(comment_list)

    comment_pad = pad_sequences(comment_label, padding='post', maxlen=400)

    pred = loaded_model.predict_proba(comment_pad)[:, 1]

    if pred < 0.5 :
        result = '악플이 아닙니다'
    else :
        result = '악플입니다'
    
    print(result)
    print('라벨확률 : ', pred)
    
    return result, pred

In [50]:
comment = '''병신아'''

result, prob = model_test(comment)

악플이 아닙니다
라벨확률 :  [0.13372898]
