In [2]:
import pandas as pd
import numpy as np

civious_df = pd.read_csv('./data/Dataset.csv.txt', delimiter='\t')

#### **데이터 전처리**

1. puntuation 제거 (숫자로 비속어를 표현하는 경우도 있기때문에 숫자는 남겨둠)

null 값의 label을 직접 입력해줌

하는 김에 label도 int로 바꿔줌

In [3]:
for i in civious_df[civious_df['lable'].isnull()].index :
    textNlabel = civious_df.loc[i, 'content'].split('\t')
    civious_df.loc[i, 'content'] = textNlabel[0]
    civious_df.loc[i, 'lable'] = int(textNlabel[1])

In [4]:
civious_df['content'] = civious_df['content'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 0-9]","", regex=True)
# civious_df['content'] = civious_df['content'].str.replace("[:punct:]","", regex=True)

In [5]:
civious_df['lable'] = civious_df['lable'].apply(lambda x : int(x))

In [6]:
civious_df['lable'] = civious_df['lable'].replace([1,0],[0,1])

In [7]:
civious_df.head(5)

Unnamed: 0,content,lable
0,이종석 한효주 나오는 드라마 이후로 드라마 안봤다 2년전인가 좀 신선했었지 근데 이...,1
1,씨바알노무노무 술프노 오늘 저녁은 꽂등심이다ㅠㅜ,1
2,짱깨 꺼라ㅡ패쓰,1
3,그들의 사생활 고인이된 설리를 위해서라도 모두 조용하길 지금 누굴 탓한다고 무슨소...,0
4,아무리 법이 뭣같아도 무슨 자격으로 개인의 신상정보를 불특정 다수에게 공개하는지 도...,0


2. 중복 제거

In [8]:
civious_df.nunique()

content    9982
lable         2
dtype: int64

In [9]:
civious_df.drop_duplicates(subset=['content'], inplace=True, ignore_index=True)

In [10]:
civious_df.shape

(9982, 2)

3. puntuation 제거 후 빈 문자열이 된 데이터가 있는지 확인

In [11]:
civious_df['content'].replace('', np.nan, inplace=True)

In [12]:
civious_df.shape

(9982, 2)

In [13]:
print(civious_df.isnull().sum())

content    0
lable      0
dtype: int64


content column에 빈 문자열은 없음!

#### **토큰화**

In [14]:
stopwords = []

#stopwords text file 중 '씨, 문제' 제거
stopwords_file = open('./data/korean_stopwords.txt', 'r', encoding='UTF8')
stopwords_lines = stopwords_file.readlines()
for line in stopwords_lines :
    stopwords.append(line.split('\t')[0])

In [15]:
import jpype #이거 안쓰면 에러남!!!!
from konlpy.tag import Okt

okt = Okt()
token_list = []

In [16]:
toknized_df = pd.DataFrame(columns=['text', 'label'])

In [None]:
for i in civious_df.index :
    sentence = civious_df.loc[i, 'content']
    temp_X = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
    toknized_df.loc[i, 'text'] = temp_X

In [None]:
toknized_df['label'] = civious_df['lable']

In [None]:
toknized_df.head(5)

#### **인코딩**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(toknized_df['text'])

In [None]:
threshold = 3
total_cnt = len(tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if (value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

3회 미만으로 등장하는 단어가 8.9퍼센트를 차지한다.

In [None]:
vocab_size = total_cnt - rare_cnt + 1
print('단어 집합의 크기 :',vocab_size)

In [None]:
tokenizer = Tokenizer(vocab_size) 
tokenizer.fit_on_texts(toknized_df['text'])
toknized_df['encoding'] = tokenizer.texts_to_sequences(toknized_df['text'])

In [None]:
toknized_df.head(5)

문장을 구성하는 모든 단어가 희귀 단어여서 빈 encoding sequence를 생성하는 경우를 제거

In [None]:
drop_index = [index for index, sentence in enumerate(toknized_df['encoding']) if len(sentence) < 1]

In [None]:
len(drop_index)

In [None]:
toknized_df.drop(index=drop_index, axis=0, inplace=True)
# toknized_df = np.delete(toknized_df, drop_index, axis=0)
toknized_df.reset_index(inplace=True, drop=True)

In [None]:
toknized_df

#### **패딩**
샘플 길이 맞춰주는 과정

In [None]:
import matplotlib.pyplot as plt

print('시퀀스 최대 길이 :',max(len(l) for l in toknized_df['encoding']))
print('시퀀스 평균 길이 :',sum(map(len, toknized_df['encoding']))/len(toknized_df['encoding']))
plt.hist([len(s) for s in toknized_df['encoding']], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

최적의 시퀀스 길이(대부분의 텍스트가 내용이 잘리지 않도록 할 수 있는 최적의 max_len의 값)는?

In [None]:
def below_threshold_len(max_len, nested_list):
  cnt = 0
  for s in nested_list:
    if(len(s) <= max_len):
        cnt = cnt + 1
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, (cnt / len(nested_list))*100))

In [None]:
max_len = 40
below_threshold_len(max_len, toknized_df['encoding'])

전체 데이터 중 약 97%의 샘플이 40 이하의 길이를 가짐. 모든 시퀀스 길이를 40으로 맞추기로 결정.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

encoded_text = pad_sequences(toknized_df['encoding'], maxlen = max_len)

In [None]:
encoded_df = pd.DataFrame(data=encoded_text)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(encoded_df, toknized_df['label'], random_state=42)

### **Deep learning**

2. LSTM

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np
# fix random seed for reproducibility 
seed = 7 
np.random.seed(seed)

def create_model(dropout_rate=0.01):
#     lstm_model = Sequential()
#     lstm_model.add(Embedding(vocab_size, 100))
#     lstm_model.add(Bidirectional(LSTM(128, return_sequences=True)))
#     lstm_model.add(Dropout(dropout_rate))
#     lstm_model.add(Bidirectional(LSTM(25, return_sequences=True)))
#     lstm_model.add(Dense(1, activation='sigmoid'))
    lstm_model = Sequential()
    lstm_model.add(Embedding(vocab_size, 100))
    lstm_model.add(LSTM(128))
    lstm_model.add(Dense(1, activation='sigmoid'))
    lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return lstm_model

model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=100, verbose=0)

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

results = cross_val_score(model, X_train, y_train, cv=kfold)

In [None]:
results

In [None]:
import numpy
def create_model():
    lstm_model = Sequential()
    lstm_model.add(Embedding(vocab_size, 100))
    lstm_model.add(LSTM(128))
    lstm_model.add(Dense(1, activation='sigmoid'))
    lstm_model.compile(loss='binary_crossentropy', metrics=['accuracy'])
    return lstm_model

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# create model
model = KerasClassifier(build_fn=create_model, verbose=0)
# define the grid search parameters
batch_size = [50, 100, 300]
epochs = [10, 20, 30]
optimizer = ['Nadam', 'Adam', 'RMSProp']
param_grid = dict(batch_size=batch_size, epochs=epochs, optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [65]:
search_params = {
    "batch_size": [20, 30, 40],
    "time_steps": [30, 60, 90], 
    "lr": [0.01, 0.001, 0.0001],
    "epochs": [30, 50, 70]
}

def eval_model():
    lstm_model = Sequential()
    lstm_model.add(Embedding(vocab_size, 100))
    lstm_model.add(Bidirectional(LSTM(128, return_sequences=True)))
    lstm_model.add(Dropout(dropout_rate))
    lstm_model.add(Bidirectional(LSTM(25, return_sequences=True)))
    lstm_model.add(Dense(1, activation='sigmoid'))
    return lstm_model

def get_all_combinations(params):
    all_names = params.keys()
    combinations = it.product(*(params[name] for name in all_names))
    return list(combinations)

def run_search(mat, params):
    param_combs = get_all_combinations(params) # list of tuples
    logging.info("Total combinations to try = {}".format(len(param_combs)))
    for i, combination in enumerate(param_combs):
        logging.info("Trying combo no. {} {}".format(i, combination))
        eval_model(mat, combination, i)

run_search(X_train, search_params)

NameError: name 'it' is not defined

In [49]:
search_params = {
    'batch_size': [348,256,200,128],
    'lstm1_nodes': list(range(70, 150, 10)),
    'lstm1_dropouts': [0.1,0.2,0.4,0.6],
    'learning_rate': [0.001,0.01,0.1,0.0001],
    "epochs": list(range(30, 50, 10)),
    "optimizer": ['Nadam', 'Adam', 'RMSProp'],
}

In [52]:
grid_param_LSTM = {
    'batch_size': [348,256,200,128],
    'epochs': [15,30],   
    'learning_rate':[0.001,0.01,0.1,0.0001],
    'optimizer': ['Nadam', 'Adam', 'RMSProp'],
    'loss': ['logcosh', 'mae', 'mse', 'hinge','squared_hinge'],
    'activation': ['relu', 'linear','sigmoid','hard_sigmoid', 'tanh'],
    'dropout_rate':[0.1,0.2,0.4,0.6]
}

In [50]:
lstm_model = Sequential()
lstm_model.add(Embedding(vocab_size, 100))
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation='sigmoid'))

In [63]:
from keras.wrappers.scikit_learn import KerasClassifier
# Function to create model, required for KerasClassifier

def getModel(optimizer):
    lstm_model = Sequential()
    lstm_model.add(Embedding(vocab_size, 100))
    lstm_model.add(LSTM(128))
    lstm_model.add(Dense(1, activation='sigmoid'))
    return lstm_model

grid_param_LSTM = {
    'batch_size': [348,256,200,128],
    'epochs': [15,30],   
    'learning_rate':[0.001,0.01,0.1,0.0001],
    'optimizer': ['Nadam', 'Adam', 'RMSProp'],
    'loss': ['logcosh', 'mae', 'mse', 'hinge','squared_hinge'],
}

Kmodel = KerasClassifier(build_fn=getModel, verbose=1)
grid = GridSearchCV(estimator=Kmodel, param_grid=grid_param_LSTM, scoring='accuracy', n_jobs=-1, refit='boolean')
grid_result = grid.fit(X_train, y_train)

ValueError: learning_rate is not a legal parameter

In [56]:
from sklearn.model_selection import GridSearchCV
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=lstm_model, param_grid=param_grid, n_jobs=-1, scoring="accuracy")
grid_result = grid.fit(X_train, y_train)

TypeError: Cannot clone object '<tensorflow.python.keras.engine.sequential.Sequential object at 0x000002201888FB20>' (type <class 'tensorflow.python.keras.engine.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

In [55]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

rf_random_search = RandomizedSearchCV(
        estimator=lstm_model,
        param_distributions = grid_param_LSTM,
        n_iter = 20,
        cv = 5,
        scoring="accuracy")

rf_random_search.fit(X_train, y_train)

TypeError: Cannot clone object '<tensorflow.python.keras.engine.sequential.Sequential object at 0x000002201888FB20>' (type <class 'tensorflow.python.keras.engine.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

In [40]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc_lstm = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [41]:
lstm_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
lstm_history = lstm_model.fit(X_train, y_train, epochs=15, callbacks=[es, mc_lstm], batch_size=60, validation_split=0.2)

Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.79264, saving model to best_model.h5
Epoch 2/15

Epoch 00002: val_acc improved from 0.79264 to 0.81605, saving model to best_model.h5
Epoch 3/15

Epoch 00003: val_acc improved from 0.81605 to 0.83077, saving model to best_model.h5
Epoch 4/15

Epoch 00004: val_acc did not improve from 0.83077
Epoch 5/15

Epoch 00005: val_acc did not improve from 0.83077
Epoch 6/15

Epoch 00006: val_acc did not improve from 0.83077
Epoch 00006: early stopping


In [42]:
lstm_loaded_model = load_model('best_model.h5')
print("\n 테스트 정확도: %.4f" % (lstm_loaded_model.evaluate(X_valid, y_valid)[1]))


 테스트 정확도: 0.8419


In [46]:
def preprocess_text(new_sentence):
    new_sentence = okt.morphs(new_sentence, stem=True) # 토큰화
    new_sentence = [word for word in new_sentence if not word in stopwords] # 불용어 제거
    encoded = tokenizer.texts_to_sequences([new_sentence]) # 정수 인코딩
    pad_new = pad_sequences(encoded, maxlen = max_len) # 패딩
    return pad_new

In [47]:
def lstm_predict(new_sentence):
  processed_text = preprocess_text(new_sentence)
  score = float(lstm_loaded_model.predict(processed_text)) # 예측
  if(score > 0.5):
    print("{:.2f}% 확률로 악의적인 글입니다.\n".format(score * 100))
  else:
    print("{:.2f}% 확률로 악의적인 글이 아닙니다.\n".format((1 - score) * 100))

In [48]:
lstm_predict('못된 넘들...남의 고통을 즐겼던 넘들..이젠 마땅한 처벌을 받아야지')

94.76% 확률로 악의적인 글이 아닙니다.



In [49]:
lstm_predict("개소리야 니가 빨갱이를 옹호하고 드루킹을 ㅇㅇ짓이라고 말못해서 삐진거야 빨갱아")

99.97% 확률로 악의적인 글입니다.



In [50]:
lstm_predict('이 영화 핵노잼 ㅠㅠ')

99.37% 확률로 악의적인 글이 아닙니다.



In [51]:
lstm_predict('이딴게 영화냐 ㅉㅉ')

79.05% 확률로 악의적인 글입니다.

