In [39]:
import os 
import json
import pickle
from datetime import datetime
from copy import deepcopy

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Bidirectional, Attention, Concatenate, Dense
from keras.callbacks import EarlyStopping

from konlpy.tag import Okt

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline

matplotlib_inline.backend_inline.set_matplotlib_formats("png2x") # svg, retina, png2x ...
mpl.style.use("seaborn-v0_8")
mpl.rcParams.update({"figure.constrained_layout.use": True})
sns.set_context("paper") 
sns.set_palette("Set2") 
sns.set_style("whitegrid") 

plt.rc("font", family = "Malgun Gothic")
plt.rcParams["axes.unicode_minus"] = False

# 파일 저장시 파일명의 용이성
def now_time():
    now = datetime.now()
    return now.strftime('%Y%m%d_%H_%M_%S')

### 불러오기

In [17]:
with open("data_prep/data/_2_after_prep/corpus_method_1/sentences.pkl","rb") as f:
    sentences = pickle.load(f)

with open("data_prep/data/_2_after_prep/corpus_method_1/labels.pkl","rb") as f:
    labels = pickle.load(f)

with open("data_prep/data/_2_after_prep/corpus_method_1/sentences_corpus_word_index.json","r") as f:
    sentences_corpus_word_index = json.load(f)

with open("data_prep/data/_2_after_prep/corpus_method_1/label_corpus_word_index.json","r") as f:
    label_corpus_word_index = json.load(f)

### seq and dense 모델 테스트

In [18]:
df = pd.read_csv('data_prep/data/_1_before_prep/all_data_unsmile.csv', index_col=0)

In [19]:
max_text_len = max([len(i) for i in sentences])
text_padded = pad_sequences(sentences, maxlen=max_text_len, padding='post')

In [20]:
sentences_corpus_index_word = {sentences_corpus_word_index[key]:key for key in sentences_corpus_word_index}
label_corpus_index_word = {label_corpus_word_index[key]:key for key in label_corpus_word_index}

In [21]:
padded_all = pad_sequences(sentences, maxlen=max_text_len, padding='post')

In [22]:
df.iloc[5]['문장']

'고향가서 피방가면 동네 부럴 친구들이랑은 뭐 거르는 거 없이 이야기하니까 막 말하게 되더라 당연히 키보드를 치거나 그러지는 않는데 말하는게 많이 거칠어지긴 해 반성해야겠네'

In [23]:
for i in padded_all[5]:
    print(sentences_corpus_index_word[i] , end='')

고향가서피방가면동네부럴친구들이랑은뭐거르는거없이이야기하니까막말하게되더라당연히키보드를치거나그러지는않는데말하는게많이거칠어지긴해반성해야겠네paddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpaddingpadding

In [24]:
padded_all[5]

array([107, 108, 109, 110, 111, 112, 113, 114,  79, 115,  58, 116,  11,
        63, 117, 118, 119, 120, 102, 121, 122, 123, 124, 125, 126, 127,
       128, 129, 102, 130, 131, 132, 133, 134, 135,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [25]:
padded_all.shape

(18742, 77)

In [40]:
X_train, X_test,y_train, y_test = train_test_split(padded_all, df[df.columns[1:]],
                                                                      train_size=15005, random_state=42)

# 이렇게 분리 하는 이유 (기존의 트레인 테스트의 tsv를 하나로 합쳤기 때문)
X_train, X_test = padded_all[:15005], padded_all[15005:] 
y_train, y_test = df[df.columns[1:]][:15005], df[df.columns[1:]][15005:]

In [41]:
# 문제와 정답 라벨 확인
(input_points, input_shape_ ), (output_points, output_shape_)= X_train.shape, y_train.shape
input_shape_, output_shape_ , input_points == output_points

(77, 11, True)

In [42]:
corpus_size = len(sentences_corpus_word_index)
corpus_size

38712

In [43]:
# 모듈화한 모델 불러오기
from model._2_encoder_simple_model import encoder_simple_model

model = encoder_simple_model(input_shape_, corpus_size,output_shape_,)

In [45]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001)

history_model = model.fit(X_train,y_train,
                        validation_split=0.1,
                        epochs=10,
                        batch_size=64,
                        callbacks=[early_stopping],
                        verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [None]:
_ = load_model('checkpoint__\epoch_0004_metrics_0.0783,0.9062,0.2187,0.5793.h5')
model.set_weights(_.get_weights())

In [46]:
pred = model.predict(X_test) > 0.9



In [47]:
from sklearn.metrics import confusion_matrix, classification_report

In [53]:
for i in np.arange(0,1,0.1):
    pred = model.predict(X_test,verbose=0) > i

    number_of_count = 0
    correct = 0
    for j in pred == y_test.to_numpy():
        if False not in j:
            correct +=1
        number_of_count +=1
    print(f'트레쉬 홀드가 {i:0.2f}일때 정확도 {correct/number_of_count:0.2f}',)

트레쉬 홀드가 0.00일때 정확도 0.00
트레쉬 홀드가 0.10일때 정확도 0.40
트레쉬 홀드가 0.20일때 정확도 0.45
트레쉬 홀드가 0.30일때 정확도 0.46
트레쉬 홀드가 0.40일때 정확도 0.46
트레쉬 홀드가 0.50일때 정확도 0.44
트레쉬 홀드가 0.60일때 정확도 0.43
트레쉬 홀드가 0.70일때 정확도 0.40
트레쉬 홀드가 0.80일때 정확도 0.37
트레쉬 홀드가 0.90일때 정확도 0.31


```py
for i in np.arange(0.2,0.31,0.001):
    pred = model.predict(X_test,verbose=0) > 0.2

    number_of_count = 0
    correct = 0
    for j in pred == y_test.to_numpy():
        if False not in j:
            correct +=1
        number_of_count +=1
    print(f'트레쉬 홀드가 {i}일때 정확도',correct/number_of_count)

# incoder 와 Dense 층만 있는 모델의 정확도
# 트레쉬 홀드가 0.00일때 정확도 0.00
# 트레쉬 홀드가 0.10일때 정확도 0.40
# 트레쉬 홀드가 0.20일때 정확도 0.45
# 트레쉬 홀드가 0.30일때 정확도 0.46
# 트레쉬 홀드가 0.40일때 정확도 0.46
# 트레쉬 홀드가 0.50일때 정확도 0.44
# 트레쉬 홀드가 0.60일때 정확도 0.43
# 트레쉬 홀드가 0.70일때 정확도 0.40
# 트레쉬 홀드가 0.80일때 정확도 0.37
# 트레쉬 홀드가 0.90일때 정확도 0.31
```

- 정확도를 50를 넘기지 못한다.

In [None]:
print(classification_report(y_test, pred,target_names=df.columns[1:]));

              precision    recall  f1-score   support

       여성/가족       0.79      0.50      0.61       601
          남성       0.83      0.71      0.76       492
        성소수자       0.86      0.70      0.77       437
       인종/국적       0.78      0.64      0.70       662
          연령       0.90      0.53      0.67       221
          지역       0.91      0.76      0.83       403
          종교       0.83      0.78      0.80       467
       기타 혐오       0.67      0.15      0.24       208
       악플/욕설       0.44      0.45      0.44      1116
       clean       0.59      0.60      0.59      1410
        개인지칭       0.00      0.00      0.00       112

   micro avg       0.68      0.58      0.62      6129
   macro avg       0.69      0.53      0.58      6129
weighted avg       0.68      0.58      0.62      6129
 samples avg       0.56      0.59      0.57      6129



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
