## 1. 데이터처리

In [1]:
import warnings 
warnings.filterwarnings(action='ignore')

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import json
import time

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm_notebook

In [5]:
X_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_train.csv', index_col=[0])
X_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/X_test.csv', index_col=[0])
y_train = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_train.csv', index_col=[0])
y_test = pd.read_csv('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/y_test.csv', index_col=[0])

word_index_json = open('/content/drive/MyDrive/[final_project]_악플원정대/01_K-Fold_X/data/word_index_vocab.json', 'r').read()
word_index_vocab = json.loads(word_index_json)

In [6]:
X_train_split = X_train['댓글'].apply(list).tolist()
X_test_split = X_test['댓글'].apply(list).tolist()

In [7]:
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab

X_train_sequences = tokenizer.texts_to_sequences(X_train_split)
X_test_sequences = tokenizer.texts_to_sequences(X_test_split)

In [8]:
train = pad_sequences(X_train_sequences, padding='post', maxlen=400)
test = pad_sequences(X_test_sequences, padding='post', maxlen=400)

In [9]:
y_train.value_counts()

악플여부
0       55965
1       32454
dtype: int64

In [10]:
from imblearn.over_sampling import SMOTE

In [11]:
start = time.time()

smote = SMOTE(random_state=123)
train_over, y_train_over = smote.fit_sample(train, y_train)

print("걸린시간 :", time.time() - start)

걸린시간 : 257.852915763855


## 2. lightGBM : GridSearchCV

In [13]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

In [None]:
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

In [None]:
param_grid = {
    'learning_rate'    : [0.1],
    'num_iterations'   : [1400, 2000],
    'max_depth'        : [10, 15],
    'Metric'           : ['binary'],
    'boosting'         : ['dart']  
}

LGBM = LGBMClassifier()
LGBM_grid = GridSearchCV(LGBM, param_grid=param_grid, cv=cv, scoring='f1', verbose=2)
LGBM_grid.fit(train_over, y_train_over)

print('final params', LGBM_grid.best_params_)
print('best score', LGBM_grid.best_score_)

# final params {'Metric': 'binary', 'boosting': 'dart', 'learning_rate': 0.1, 'max_depth': 15, 'num_iterations': 2000}
# best score 0.8076588623298664


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400, total= 4.6min
[CV] Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.6min remaining:    0.0s


[CV]  Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400, total= 4.8min
[CV] Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400 
[CV]  Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400, total= 4.9min
[CV] Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400 
[CV]  Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400, total= 4.5min
[CV] Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400 
[CV]  Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=1400, total= 4.9min
[CV] Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=2000 
[CV]  Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=2000, total= 7.2min
[CV] Metric=binary, boosting=dart, learning_rate=0.1, max_depth=10, num_iterations=2000 
[CV]  Metric=binary, boosting=dart,

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 120.6min finished


final params {'Metric': 'binary', 'boosting': 'dart', 'learning_rate': 0.1, 'max_depth': 15, 'num_iterations': 2000}
best score 0.8076588623298664


In [None]:
print('final params', LGBM_grid.best_params_)
print('best score', LGBM_grid.best_score_)

final params {'Metric': 'binary', 'boosting': 'dart', 'learning_rate': 0.1, 'max_depth': 10, 'num_iterations': 1400}
best score 0.7963017250403392


#3. Test Set 예측

In [12]:
best_params = {'Metric': 'binary', 
               'boosting': 'dart', 
               'learning_rate': 0.1, 
               'max_depth': 10, 
               'num_iterations': 1400}

In [23]:
model = LGBMClassifier(Metric         = 'binary',
                       max_depth      = 10,
                       boosting       = 'dart',
                       learning_rate  = 0.1,
                       num_iterations = 1400)
model.fit(train_over, y_train_over)

LGBMClassifier(Metric='binary', boosting='dart', boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, importance_type='split',
               learning_rate=0.1, max_depth=10, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_iterations=1400, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [26]:
y_class = model.predict(test)
y_prob  = model.predict_proba(test)[:, 1]

In [27]:
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

print('recall : ', recall_score(y_class, y_test))
print('precision : ', precision_score(y_class, y_test))
print('f1_score : ', f1_score(y_class, y_test))
print('roc_auc_score : ', roc_auc_score(y_test.values.reshape(1, -1)[0], y_prob))
print(confusion_matrix(y_class, y_test))

recall :  0.8043478260869565
precision :  0.686616689197493
f1_score :  0.7408340515812505
roc_auc_score :  0.8734402673024568
[[12609  2550]
 [ 1359  5587]]


# 4. 모델저장



In [28]:
import pickle
from sklearn.externals import joblib

In [30]:
joblib.dump(model, 'lightGBM.pkl')

['lightGBM.pkl']

In [31]:
def model_test(comment):
    loaded_model = joblib.load('lightGBM.pkl')
    comment_list = [list(comment)]
    comment_label = tokenizer.texts_to_sequences(comment_list)

    comment_pad = pad_sequences(comment_label, padding='post', maxlen=400)

    pred = loaded_model.predict_proba(comment_pad)[:,1]

    if pred < 0.5 :
        result = '악플이 아닙니다'
    else :
        result = '악플입니다'
    
    print(result)
    print('라벨확률 : ', pred)
    
    return result, pred

In [36]:
comment = '''개새끼야'''

result, prob = model_test(comment)

악플이 아닙니다
라벨확률 :  [0.27372783]
