In [1]:
from jamo import h2j, j2hcj
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
vocab = ['ㄷ', 'ㅏ', 'ㄴ', 'ㄱ', 'ㅜ', 'ㅎ', 'ㄹ', 'ㅇ', 'ㅂ', 'ㅓ', 'ㅈ', 'ㅣ', ' ',
 'ㅡ', 'ㅢ', 'ㅁ', 'ㅗ', 'ㅅ', 'ㅔ', 'ㅕ', 'ㅑ', ';', 'B', 'J', '.', 'P', 'G',
 'ㄸ', 'ㅟ', 'ㅃ', 'ㅌ', '[', '1', ':', '8', '2', '3', '0', ']', 'V', 'L',
 'I', 'E', 'ㅋ', 'ㅖ', '(', 'ㅠ', ')', '5', 'ㅝ', 'ㅐ', 'ㅆ', "'", 'ㅀ', 'ㅊ',
 't', 'x', 'ㅙ', 'ㅚ', 'ㅉ', 'ㅍ', 'ㅄ', '?', 'g', 'i', 'f', 'ㅛ', '6', '7', '☀',
 'ㄲ', 'v', 's', 'ㅘ', '!', 'ㄶ', 'p', 'c', 'ㄼ', '\u3000', 'k', '4', '9', ',',
 'ㅞ', 'ㅒ', '“', '”', 'N', '‘', '’', 'T', 'O', 'a', 'r', 'm', 'S', '+', 'o', 'd',
 'l', 'u', '·', '~', '/', 'ㄻ', '^', 'ㄺ', 'e', 'n', 'A', '-', 'D', '&', 'C',
 'F', 'j', 'M', 'K', '"', '_', 'Z', 'X', 'U', '…', 'ㄾ', 'w', '=', 'z',
 '>', '<', 'b', 'H', '@', '*', 'W', 'y', 'h', 'R', '%', 'ㄽ', '．',
 'ｊ', 'ｐ', 'ｇ', 'ㄵ', '{', '}', 'q', 'Y', 'Q',
 '$', 'ㄿ', '？', 'ㆍ', 'ㄳ', '⋅', '—']

vocab_dict = {c: i for i, c in enumerate(vocab, 1)}

In [3]:
UNK = 0
PAD = len(vocab_dict) + 1

def preprocessing(char_list):
    ret = [vocab_dict[char] if char in vocab_dict else UNK for char in char_list]
    if len(ret) <= 100:
        ret += [PAD] * (100 - len(ret))
    else:
        ret = ret[:100]
    return ret

In [4]:
df_normals = pd.read_csv('models/normals.txt', sep='\t', names=['text', 'label'])
df_swears = pd.read_csv('models/swears.txt', sep='\t', names=['text', 'label'])

In [5]:
df_normals['text'] = df_normals['text'].apply(lambda x: j2hcj(h2j(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: j2hcj(h2j(x)))

In [6]:
df_normals['text'] = df_normals['text'].apply(lambda x: preprocessing(list(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: preprocessing(list(x)))

In [7]:
df_normals_sampled = df_normals.sample(n=len(df_swears), random_state=2020)

In [8]:
df_total = pd.concat([df_normals_sampled, df_swears], ignore_index=True)
df_total

Unnamed: 0,text,label
0,"[61, 45, 31, 10, 11, 12, 7, 4, 10, 18, 4, 2, 3...",0
1,"[8, 12, 6, 51, 13, 16, 17, 18, 6, 2, 3, 21, 63...",0
2,"[8, 45, 3, 14, 30, 2, 47, 47, 47, 47, 159, 159...",0
3,"[6, 10, 4, 13, 18, 10, 8, 20, 9, 12, 13, 4, 14...",0
4,"[61, 12, 3, 8, 67, 3, 14, 3, 13, 61, 14, 7, 17...",0
...,...,...
4951,"[18, 12, 3, 18, 19, 4, 45, 13, 8, 12, 3, 11, 1...",1
4952,"[4, 2, 16, 4, 12, 4, 2, 11, 12, 4, 17, 13, 6, ...",1
4953,"[8, 12, 18, 12, 9, 2, 7, 18, 51, 71, 12, 1, 14...",1
4954,"[6, 74, 4, 11, 12, 3, 11, 2, 13, 11, 17, 3, 3,...",1


In [9]:
df_train, df_test = train_test_split(df_total, test_size=0.2, stratify=df_total['label'])

In [10]:
df_train

Unnamed: 0,text,label
1538,"[61, 2, 16, 5, 3, 13, 4, 51, 18, 2, 4, 12, 4, ...",0
3200,"[11, 17, 11, 6, 14, 8, 11, 17, 11, 44, 44, 159...",1
2751,"[4, 2, 71, 14, 16, 13, 4, 29, 11, 12, 13, 61, ...",1
4627,"[11, 29, 18, 51, 71, 12, 11, 12, 9, 13, 8, 12,...",1
3034,"[63, 63, 63, 13, 34, 13, 8, 2, 104, 13, 9, 17,...",1
...,...,...
1998,"[16, 12, 3, 11, 5, 84, 13, 18, 10, 8, 5, 7, 18...",0
1841,"[3, 47, 9, 12, 13, 18, 14, 44, 2, 8, 5, 31, 10...",0
1966,"[60, 2, 7, 3, 20, 13, 4, 17, 8, 11, 5, 16, 20,...",0
108,"[4, 14, 3, 7, 51, 13, 36, 104, 37, 11, 5, 13, ...",0


In [11]:
xtrain, ytrain = np.vstack(df_train['text']), tf.one_hot(np.array(df_train['label']), depth=2)
xtest, ytest = np.vstack(df_test['text']), tf.one_hot(np.array(df_test['label']), depth=2)

In [12]:
train_ds = tf.data.Dataset.from_tensor_slices((xtrain,ytrain)).batch(32)
test_ds =tf.data.Dataset.from_tensor_slices((xtest,ytest)).batch(32)

In [13]:
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.emb = tf.keras.layers.Embedding(input_dim=160, output_dim=64, input_length=100)
        self.conv1 = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(100,64))
        self.conv2 = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(96,32))
        self.conv3 = tf.keras.layers.Conv1D(8, 5, activation='relu', input_shape=(92,16))
        self.conv4 = tf.keras.layers.Conv1D(4, 5, activation='relu', input_shape=(88, 8))
        self.flat = tf.keras.layers.Flatten(input_shape=(84, 4))
        self.dense = tf.keras.layers.Dense(128, activation='relu')
        self.classifier = tf.keras.layers.Dense(2, activation='softmax')
    
    def call(self, x):
        x = self.emb(x)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flat(x)
        x = self.dense(x)
        return self.classifier(x)

model = Model()

In [14]:
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

In [15]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

In [16]:
@tf.function
def train_step(documents, labels):
    with tf.GradientTape() as tape:
        predictions = model(documents)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    train_accuracy(labels, predictions)

In [17]:
@tf.function
def test_step(documents, labels):
    predictions = model(documents)
    loss = loss_object(labels, predictions)
    
    test_loss(loss)
    test_accuracy(labels, predictions)

In [18]:
EPOCHS = 100

for epoch in range(EPOCHS):
    for documents, labels in train_ds:
        train_step(documents, labels)
    for documents, labels in test_ds:
        test_step(documents, labels)
    
    if epoch % 10 == 0:
        template = "[EPOCH {}/{}], LOSS: {}, ACCURACY: {}, TEST_LOSS: {}, TEST_ACCURACY: {}"
        print(template.format(
            epoch+1,
            EPOCHS,
            train_loss.result(),
            train_accuracy.result()*100,
            test_loss.result(),
            test_accuracy.result()*100
        ))

NameError: in user code:

    <ipython-input-16-a4be6247b1aa>:4 train_step  *
        predictions = model(documents)
    <ipython-input-13-2d570c75f8cc>:22 call  *
        query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(query_value_attn)

    NameError: name 'query_value_attn' is not defined


In [41]:
def predict(text=None):
    if text is None:
        text = input()
    x = j2hcj(h2j(text))
    x = preprocessing(list(x))
    prediction = model.predict(np.array([x]))
    if prediction[0][0] > prediction[0][1]:
        print('정상입니다.')
    else:
        print('욕입니다.')
    print(prediction)

In [42]:
predict()

 da


정상입니다.
[[0.99047613 0.00952379]]


In [44]:
for document in pd.read_csv('models/swears.txt', sep='\t', names=['text', 'label'])['text']:
    print(document)
    print(predict(document))

[연뮤] ㄴㅅㄱㅁㅇ) 비스티가 벌써 끝났다니... 약ㅅㅍ
정상입니다.
[[9.9999964e-01 3.8735953e-07]]
None
요즘 새끼들은 신비아파트같은거만 봐서 그 꼬라지지
욕입니다.
[[0. 1.]]
None
하이스코어 걸 대쉬 히다카 개꼴려
욕입니다.
[[0. 1.]]
None
성질 뻗쳐 유인촌 운지 노무현
정상입니다.
[[1.0000000e+00 1.4019319e-10]]
None
걍 디키는 위키 자체가 병신이라 뭐가 문젠지 따질게 없다
욕입니다.
[[0. 1.]]
None
미친 갑자기 과읶선출을?
욕입니다.
[[0. 1.]]
None
근데 근육 이새끼는 뭐했길래 운영자 된거임?
욕입니다.
[[0. 1.]]
None
그냥 디시위키 이번 일은 싹 다 병신임 싹 다
욕입니다.
[[2.5675018e-35 1.0000000e+00]]
None
근데 근육은 왜 시발 한없이 운영자임?
욕입니다.
[[0. 1.]]
None
시발 근육 말하는 뽄새좀봐
욕입니다.
[[0. 1.]]
None
자삶멋하겠네시발
욕입니다.
[[0. 1.]]
None
나도 지원금 달라고 ㅅㅂ
욕입니다.
[[1.09440655e-33 1.00000000e+00]]
None
출근 씨발 ㅋ
욕입니다.
[[0. 1.]]
None
야 ㅅㅂ 매장에 진짜 귀신있나바
욕입니다.
[[0. 1.]]
None
ㅅㅂ 우리매장에 귀신있나바 어케해
욕입니다.
[[0. 1.]]
None
??:삐앰들은 니새끼 까는게 다 꾹앰같노?
욕입니다.
[[5.5690223e-24 1.0000000e+00]]
None
블아봊은 패면서 저주파 빠는게 씹슴창다워서 웃기긔
욕입니다.
[[2.3454088e-21 1.0000000e+00]]
None
아 유입창년 헌갤가서 글싸지마 미친년아
욕입니다.
[[0. 1.]]
None
댓츠핫이 밴드 붙일 정도면 상대는 좆창났겠네
욕입니다.
[[0. 1.]]
None
북짱이 마지막남은 씹소퀴 보루잖긔
욕입니다.
[[0. 1.]]
None
블아필봊 호모 정해준다
정상입니다.
[[9.9999