## 1. 데이터처리

In [1]:
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import json
import time

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

In [5]:
X_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_train.csv', index_col=[0])
X_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_test.csv', index_col=[0])
y_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_train.csv', index_col=[0])
y_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_test.csv', index_col=[0])

word_index_json = open('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/word_index_vocab.json', 'r').read()
word_index_vocab = json.loads(word_index_json)

In [6]:
X_train_split = X_train['댓글'].apply(list).tolist()
X_test_split = X_test['댓글'].apply(list).tolist()

In [7]:
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab

X_train_sequences = tokenizer.texts_to_sequences(X_train_split)
X_test_sequences = tokenizer.texts_to_sequences(X_test_split)

In [8]:
train = pad_sequences(X_train_sequences, padding='post', maxlen=400)
test = pad_sequences(X_test_sequences, padding='post', maxlen=400)

In [9]:
y_train.value_counts()

악플여부
0       55965
1       32454
dtype: int64

In [10]:
from imblearn.over_sampling import SMOTE

In [11]:
start = time.time()

smote = SMOTE(random_state=123)
train_over, y_train_over = smote.fit_sample(train, y_train)

print("걸린시간 :", time.time() - start)

걸린시간 : 254.27505135536194


## 2. Support Vector Machine

In [None]:
# train_over = train
# y_train_over = y_train

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

In [13]:
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

In [None]:
param_grid = [
    {'kernel' : ['linear'], 'C' : [10, 30, 100, 300, 1000, 3000, 10000, 30000]},
    {'kernel' : ['rbf'], 'C' : [100, 300, 1000],
    'gamma'   : [0.01, 0.03, 0.1]},
]

SVC_model = SVC()
SVC_grid = GridSearchCV(SVC_model, param_grid=param_grid, cv=cv, scoring='f1', verbose=2)
SVC_grid.fit(train_over, y_train_over)

print('final params', SVC_grid.best_params_)
print('best score', SVC_grid.best_score_)

# final params {'Metric': 'binary', 'boosting': 'dart', 'learning_rate': 0.1, 'max_depth': 15, 'num_iterations': 2000}
# best score 0.8076588623298664


Fitting 5 folds for each of 17 candidates, totalling 85 fits
[CV] C=10, kernel=linear .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
print('final params', SVC_grid.best_params_)
print('best score', SVC_grid.best_score_)

final params {'Metric': 'binary', 'boosting': 'dart', 'learning_rate': 0.1, 'max_depth': 10, 'num_iterations': 1400}
best score 0.7963017250403392


In [None]:
y_class = SVC_grid.best_estimator_.predict(test)
y_prob = SVC_grid.best_estimator_.predict_proba(test)[:, 1]


In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

print('recall : ', recall_score(y_class, y_test))
print('precision : ', precision_score(y_class, y_test))
print('f1_score : ', f1_score(y_class, y_test))
print('roc_auc_score : ', roc_auc_score(y_test.values.reshape(1, -1)[0], y_prob))
print(confusion_matrix(y_class, y_test))

recall :  0.6707407407407407
precision :  0.602061170212766
f1_score :  0.6345480028030833
roc_auc_score :  0.8217425769113452
[[12103  2394]
 [ 1778  3622]]


# 3. 모델 저장

In [None]:
from sklearn.externals import joblib

In [None]:
joblib.

array([1, 0, 0, ..., 1, 0, 1])

In [None]:
def model_test(comment):
    loaded_model = load_model('1D_CNN_best.h5')
    comment_list = [list(comment)]
    comment_label = tokenizer.texts_to_sequences(comment_list)

    comment_pad = pad_sequences(comment_label, padding='post', maxlen=400)

    pred = loaded_model.predict(comment_pad)

    if pred < 0.5 :
        result = '악플이 아닙니다'
    else :
        result = '악플입니다'
    
    print(result)
    print('라벨확률 : ', pred)
    
    return result, pred

In [None]:
comment = ''''''

result, prob = model_test(comment)

악플입니다
라벨확률 :  [[0.72722226]]
