In [1]:
from jamo import h2j, j2hcj
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
from itertools import chain

# Flatten the two inner lists into one list, in order.
list(chain.from_iterable(([1, 2, 3], [4, 5, 6])))

[1, 2, 3, 4, 5, 6]

In [3]:
# Character inventory for the encoder. The position of each entry fixes its
# integer id (see vocab_dict below), so entries must not be reordered or
# removed without rebuilding everything that was encoded with the old ids.
vocab = ['ㄷ', 'ㅏ', 'ㄴ', 'ㄱ', 'ㅜ', 'ㅎ', 'ㄹ', 'ㅇ', 'ㅂ', 'ㅓ', 'ㅈ', 'ㅣ', ' ',
 'ㅡ', 'ㅢ', 'ㅁ', 'ㅗ', 'ㅅ', 'ㅔ', 'ㅕ', 'ㅑ', ';', 'B', 'J', '.', 'P', 'G',
 'ㄸ', 'ㅟ', 'ㅃ', 'ㅌ', '[', '1', ':', '8', '2', '3', '0', ']', 'V', 'L',
 'I', 'E', 'ㅋ', 'ㅖ', '(', 'ㅠ', ')', '5', 'ㅝ', 'ㅐ', 'ㅆ', "'", 'ㅀ', 'ㅊ',
 't', 'x', 'ㅙ', 'ㅚ', 'ㅉ', 'ㅍ', 'ㅄ', '?', 'g', 'i', 'f', 'ㅛ', '6', '7', '☀',
 'ㄲ', 'v', 's', 'ㅘ', '!', 'ㄶ', 'p', 'c', 'ㄼ', '\u3000', 'k', '4', '9', ',',
 'ㅞ', 'ㅒ', '“', '”', 'N', '‘', '’', 'T', 'O', 'a', 'r', 'm', 'S', '+', 'o', 'd',
 'l', 'u', '·', '~', '/', 'ㄻ', '^', 'ㄺ', 'e', 'n', 'A', '-', 'D', '&', 'C',
 'F', 'j', 'M', 'K', '"', '_', 'Z', 'X', 'U', '…', 'ㄾ', 'w', '=', 'z',
 '>', '<', 'b', 'H', '@', '*', 'W', 'y', 'h', 'R', '%', 'ㄽ', '．',
 'ｊ', 'ｐ', 'ｇ', 'ㄵ', '{', '}', 'q', 'Y', 'Q',
 '$', 'ㄿ', '？', 'ㆍ', 'ㄳ', '⋅', '—']

# Map each character to its 1-based position in `vocab`; id 0 is never
# assigned here.
vocab_dict = {c: i for i, c in enumerate(vocab, 1)}

In [4]:
# Id used for characters that are not in vocab_dict (vocabulary ids start at 1).
UNK = 0
# Padding id: one past the largest vocabulary id, provided `vocab` holds no
# duplicate characters.
PAD = len(vocab_dict) + 1

def preprocessing(char_list, max_len=100):
    """Encode characters as vocabulary ids and fit them to a fixed length.

    Args:
        char_list: Iterable of single-character strings.
        max_len: Length of the returned list. Defaults to 100, the length
            that used to be hard-coded here.

    Returns:
        A list of exactly ``max_len`` ints. Characters missing from
        ``vocab_dict`` are encoded as ``UNK``; inputs shorter than
        ``max_len`` are right-padded with ``PAD`` and longer inputs are
        truncated.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError("max_len must be non-negative, got {}".format(max_len))
    # Truncate first; the padding count is then zero for inputs that were long enough.
    ids = [vocab_dict.get(char, UNK) for char in char_list][:max_len]
    ids.extend([PAD] * (max_len - len(ids)))
    return ids

In [5]:
# Both corpora are tab-separated and are read with the same two column names.
read_options = dict(sep='\t', names=['text', 'label'])
df_normals = pd.read_csv('models/normals.txt', **read_options)
df_swears = pd.read_csv('models/swears.txt', **read_options)

In [6]:
# Replace each raw string with the output of j2hcj(h2j(...)).
# NOTE: the 'text' column is overwritten, so run this cell once per kernel.
for frame in (df_normals, df_swears):
    frame['text'] = frame['text'].apply(lambda raw: j2hcj(h2j(raw)))

In [7]:
# Turn every string into its fixed-length list of vocabulary ids.
# NOTE: the 'text' column is overwritten; running this cell a second time
# would feed id lists back into preprocessing(), which maps them all to UNK.
for frame in (df_normals, df_swears):
    frame['text'] = frame['text'].apply(lambda chars: preprocessing(list(chars)))

In [8]:
# Draw as many rows from df_normals as df_swears has; the fixed seed makes the
# draw repeatable.
n_swears = len(df_swears)
df_normals_sampled = df_normals.sample(n=n_swears, random_state=2020)

In [9]:
# Stack the sampled normal rows on top of the swear rows and renumber the index.
frames_to_stack = (df_normals_sampled, df_swears)
df_total = pd.concat(frames_to_stack, ignore_index=True)
df_total

Unnamed: 0,text,label
0,"[61, 45, 31, 10, 11, 12, 7, 4, 10, 18, 4, 2, 3...",0
1,"[8, 12, 6, 51, 13, 16, 17, 18, 6, 2, 3, 21, 63...",0
2,"[8, 45, 3, 14, 30, 2, 47, 47, 47, 47, 159, 159...",0
3,"[6, 10, 4, 13, 18, 10, 8, 20, 9, 12, 13, 4, 14...",0
4,"[61, 12, 3, 8, 67, 3, 14, 3, 13, 61, 14, 7, 17...",0
...,...,...
4957,"[6, 74, 4, 11, 12, 3, 11, 2, 13, 11, 17, 3, 3,...",1
4958,"[18, 10, 8, 5, 7, 8, 12, 13, 11, 3, 13, 31, 10...",1
4959,"[11, 4, 2, 31, 4, 19, 11, 17, 16, 13, 16, 2, 7...",1
4960,"[3, 12, 13, 8, 10, 7, 4, 5, 7, 8, 12, 13, 11, ...",1


In [10]:
# Stratified 80/20 split. The seed is fixed (same value as the down-sampling
# step) so the partition is the same after every kernel restart; without it
# the rows landing in the test set change on each run.
df_train, df_test = train_test_split(
    df_total,
    test_size=0.2,
    stratify=df_total['label'],
    random_state=2020,
)

In [11]:
# Inspect the training split; rows keep the index labels they had in df_total.
df_train

Unnamed: 0,text,label
1374,"[8, 17, 4, 31, 17, 105, 9, 12, 8, 17, 7, 7, 2,...",0
987,"[55, 5, 8, 10, 4, 1, 17, 1, 3, 14, 3, 1, 2, 13...",0
346,"[4, 12, 16, 61, 17, 18, 12, 84, 13, 4, 74, 3, ...",0
234,"[18, 12, 1, 14, 33, 36, 38, 38, 8, 12, 3, 1, 1...",0
1942,"[9, 2, 3, 11, 12, 11, 5, 16, 20, 3, 13, 1, 10,...",0
...,...,...
3869,"[61, 14, 18, 19, 13, 8, 2, 3, 16, 5, 8, 20, 8,...",1
4110,"[139, 26, 27, 8, 19, 13, 6, 12, 7, 7, 10, 13, ...",1
4113,"[55, 51, 7, 7, 12, 3, 11, 12, 13, 8, 12, 9, 11...",1
2327,"[8, 10, 7, 20, 52, 8, 14, 7, 28, 51, 13, 8, 20...",0


In [12]:
def _to_arrays(frame):
    """Return (features, targets) for one split.

    features: the per-row id lists of frame['text'] stacked with np.vstack.
    targets: frame['label'] one-hot encoded over two classes.
    """
    features = np.vstack(frame['text'])
    targets = tf.one_hot(np.array(frame['label']), depth=2)
    return features, targets


xtrain, ytrain = _to_arrays(df_train)
xtest, ytest = _to_arrays(df_test)

TypeError: _vhstack_dispatcher() got an unexpected keyword argument 'dtype'

In [None]:
# Wrap each (features, targets) pair in a tf.data pipeline that yields batches of 32.
train_ds, test_ds = (
    tf.data.Dataset.from_tensor_slices(pair).batch(32)
    for pair in ((xtrain, ytrain), (xtest, ytest))
)

In [None]:
class Model(tf.keras.Model):
    """Character-level 1-D CNN that scores an id sequence over two classes.

    An embedding layer feeds three Conv1D + MaxPooling1D stages, followed by
    a dense layer and a 2-way softmax. The network therefore returns class
    probabilities, not logits.

    Args:
        vocab_size: Number of rows in the embedding table. It must be larger
            than the biggest id the inputs contain; the default of 160
            covers ids 0..159.
        embed_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size=160, embed_dim=64):
        super().__init__()
        # No input_shape / input_length hints are passed: in a subclassed
        # model each layer is built from the tensor it is first called on,
        # and the hints that used to be here did not match the real shapes.
        self.emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.conv1 = tf.keras.layers.Conv1D(64, 5, activation='relu')
        self.pool1 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.conv2 = tf.keras.layers.Conv1D(32, 5, activation='relu')
        self.pool2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.conv3 = tf.keras.layers.Conv1D(16, 5, activation='relu')
        self.pool3 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.flat = tf.keras.layers.Flatten()
        self.dense = tf.keras.layers.Dense(128, activation='relu')
        self.classifier = tf.keras.layers.Dense(2, activation='softmax')

    def call(self, x):
        """Run the forward pass and return softmax probabilities of shape (batch, 2).

        The shapes in the comments assume length-100 id sequences and the
        default embed_dim of 64.
        """
        x = self.emb(x)      # (batch, 100, 64)
        x = self.conv1(x)    # (batch, 96, 64)  kernel 5, no padding
        x = self.pool1(x)    # (batch, 48, 64)
        x = self.conv2(x)    # (batch, 44, 32)
        x = self.pool2(x)    # (batch, 22, 32)
        x = self.conv3(x)    # (batch, 18, 16)
        x = self.pool3(x)    # (batch, 9, 16)
        x = self.flat(x)     # (batch, 144)
        x = self.dense(x)    # (batch, 128)
        return self.classifier(x)

model = Model()

In [None]:
# The model's last layer already applies softmax, so the loss receives
# probabilities. from_logits=True would treat those probabilities as raw
# logits and compute the wrong loss, hence from_logits=False.
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)
optimizer = tf.keras.optimizers.Adam()

In [None]:
# One loss averager and one accuracy tracker per split.
train_loss, train_accuracy = (
    tf.keras.metrics.Mean(name='train_loss'),
    tf.keras.metrics.BinaryAccuracy(name='train_accuracy'),
)
test_loss, test_accuracy = (
    tf.keras.metrics.Mean(name='test_loss'),
    tf.keras.metrics.BinaryAccuracy(name='test_accuracy'),
)

In [None]:
@tf.function
def train_step(documents, labels):
    """Run one optimisation step on a batch and update the training metrics.

    Args:
        documents: Batch of model inputs.
        labels: Targets for the batch, in the form loss_object expects.
    """
    with tf.GradientTape() as tape:
        outputs = model(documents)
        batch_loss = loss_object(labels, outputs)

    # Differentiate the batch loss w.r.t. every trainable weight and apply the update.
    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_loss.update_state(batch_loss)
    train_accuracy.update_state(labels, outputs)

In [None]:
@tf.function
def test_step(documents, labels):
    """Evaluate one batch and update the test metrics; no weights are changed.

    Args:
        documents: Batch of model inputs.
        labels: Targets for the batch, in the form loss_object expects.
    """
    outputs = model(documents)
    batch_loss = loss_object(labels, outputs)

    test_loss.update_state(batch_loss)
    test_accuracy.update_state(labels, outputs)

In [None]:
EPOCHS = 100

template = "[EPOCH {}/{}], LOSS: {}, ACCURACY: {}, TEST_LOSS: {}, TEST_ACCURACY: {}"

for epoch in range(EPOCHS):
    # Clear the accumulators so each report covers this epoch only. Without
    # the reset every figure is a running average over all epochs so far.
    for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
        # `reset_state` is the current method name; older TF 2.x releases
        # only provide `reset_states`.
        reset = getattr(metric, 'reset_state', None) or metric.reset_states
        reset()

    for documents, labels in train_ds:
        train_step(documents, labels)
    for documents, labels in test_ds:
        test_step(documents, labels)

    # Report on epochs 1, 11, 21, ...
    if epoch % 10 == 0:
        print(template.format(
            epoch+1,
            EPOCHS,
            train_loss.result(),
            train_accuracy.result()*100,
            test_loss.result(),
            test_accuracy.result()*100
        ))

In [None]:
def predict(text=None):
    """Classify one text with the trained model and print the verdict.

    Args:
        text: String to classify. When None, the text is read with input().

    Prints one of two messages depending on which of the two output scores
    is larger, then prints the raw prediction array. Returns None.
    """
    if text is None:
        text = input()

    # Same pipeline as the training data: j2hcj(h2j(...)), then id encoding.
    token_ids = preprocessing(list(j2hcj(h2j(text))))
    prediction = model.predict(np.array([token_ids]))

    first_score, second_score = prediction[0][0], prediction[0][1]
    verdict = '정상입니다.' if first_score > second_score else '욕입니다.'
    print(verdict)
    print(prediction)

In [None]:
# Called without an argument, predict() reads the text to classify from input().
predict()

In [None]:
# Write the trained model to 'models/model'; the TFLite conversion step loads
# it back from this path.
model.save('models/model')

In [None]:
# Convert the model saved under 'models/model' to a TFLite flatbuffer.
converter = tf.lite.TFLiteConverter.from_saved_model('models/model')
tflite_model = converter.convert()

# convert() returns the serialized model, so the file is written in binary mode.
with open('models/model.tflite', mode='wb') as tflite_file:
    tflite_file.write(tflite_model)