In [1]:
from jamo import h2j, j2hcj
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import math

In [2]:
import tensorflow as tf
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [3]:
vocab = ['ㄷ', 'ㅏ', 'ㄴ', 'ㄱ', 'ㅜ', 'ㅎ', 'ㄹ', 'ㅇ', 'ㅂ', 'ㅓ', 'ㅈ', 'ㅣ', ' ',
 'ㅡ', 'ㅢ', 'ㅁ', 'ㅗ', 'ㅅ', 'ㅔ', 'ㅕ', 'ㅑ', ';', 'B', 'J', '.', 'P', 'G',
 'ㄸ', 'ㅟ', 'ㅃ', 'ㅌ', '[', '1', ':', '8', '2', '3', '0', ']', 'V', 'L',
 'I', 'E', 'ㅋ', 'ㅖ', '(', 'ㅠ', ')', '5', 'ㅝ', 'ㅐ', 'ㅆ', "'", 'ㅀ', 'ㅊ',
 't', 'x', 'ㅙ', 'ㅚ', 'ㅉ', 'ㅍ', 'ㅄ', '?', 'g', 'i', 'f', 'ㅛ', '6', '7', '☀',
 'ㄲ', 'v', 's', 'ㅘ', '!', 'ㄶ', 'p', 'c', 'ㄼ', '\u3000', 'k', '4', '9', ',',
 'ㅞ', 'ㅒ', '“', '”', 'N', '‘', '’', 'T', 'O', 'a', 'r', 'm', 'S', '+', 'o', 'd',
 'l', 'u', '·', '~', '/', 'ㄻ', '^', 'ㄺ', 'e', 'n', 'A', '-', 'D', '&', 'C',
 'F', 'j', 'M', 'K', '"', '_', 'Z', 'X', 'U', '…', 'ㄾ', 'w', '=', 'z',
 '>', '<', 'b', 'H', '@', '*', 'W', 'y', 'h', 'R', '%', 'ㄽ', '．',
 'ｊ', 'ｐ', 'ｇ', 'ㄵ', '{', '}', 'q', 'Y', 'Q',
 '$', 'ㄿ', '？', 'ㆍ', 'ㄳ', '⋅', '—']

vocab_dict = {c: i for i, c in enumerate(vocab, 1)}

In [4]:
UNK = 0
PAD = len(vocab_dict) + 1

def preprocessing(char_list):
    ret = [vocab_dict[char] if char in vocab_dict else UNK for char in char_list]
    if len(ret) <= 100:
        ret += [PAD] * (100 - len(ret))
    else:
        ret = ret[:100]
    return ret

In [5]:
df_normals = pd.read_csv('models/normals.txt', sep='\t', names=['text', 'label'])
df_swears = pd.read_csv('models/swears.txt', sep='\t', names=['text', 'label'])

In [6]:
df_normals['text'] = df_normals['text'].apply(lambda x: j2hcj(h2j(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: j2hcj(h2j(x)))

In [7]:
df_normals['text'] = df_normals['text'].apply(lambda x: preprocessing(list(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: preprocessing(list(x)))

In [8]:
df_normals_sampled = df_normals.sample(n=len(df_swears), random_state=2020)

In [9]:
df_total = pd.concat([df_normals_sampled, df_swears], ignore_index=True)
df_total

Unnamed: 0,text,label
0,"[6, 51, 8, 11, 10, 8, 18, 2, 7, 17, 13, 18, 2,...",0
1,"[18, 67, 3, 17, 18, 13, 55, 29, 18, 17, 13, 16...",0
2,"[1, 51, 4, 12, 8, 10, 9, 1, 17, 13, 18, 12, 3,...",0
3,"[6, 19, 8, 14, 8, 14, 8, 159, 159, 159, 159, 1...",0
4,"[4, 51, 7, 13, 16, 2, 4, 51, 7, 8, 12, 8, 10, ...",0
...,...,...
4977,"[8, 2, 18, 12, 9, 2, 7, 13, 4, 10, 3, 11, 17, ...",1
4978,"[8, 74, 13, 52, 12, 9, 2, 7, 13, 1, 14, 1, 12,...",1
4979,"[4, 14, 13, 18, 10, 8, 1, 2, 3, 13, 11, 17, 11...",1
4980,"[4, 2, 9, 11, 2, 4, 12, 13, 8, 12, 71, 4, 51, ...",1


In [10]:
df_train, df_test = train_test_split(df_total, test_size=0.2, stratify=df_total['label'])

In [11]:
df_train

Unnamed: 0,text,label
2519,"[8, 2, 18, 12, 9, 2, 7, 13, 8, 12, 16, 8, 67, ...",1
1510,"[61, 10, 11, 12, 7, 17, 13, 49, 81, 13, 60, 12...",0
2135,"[4, 17, 7, 61, 2, 4, 45, 11, 10, 8, 13, 61, 2,...",0
776,"[16, 50, 3, 13, 4, 19, 16, 6, 2, 3, 17, 159, 1...",0
1855,"[3, 5, 9, 19, 7, 14, 13, 4, 10, 16, 1, 17, 7, ...",0
...,...,...
2541,"[8, 20, 6, 51, 8, 11, 12, 4, 2, 16, 20, 3, 13,...",1
3127,"[4, 10, 7, 7, 19, 8, 20, 1, 17, 13, 8, 10, 7, ...",1
905,"[1, 5, 9, 4, 2, 52, 12, 4, 51, 7, 8, 12, 9, 4,...",0
1773,"[6, 74, 4, 11, 12, 3, 3, 2, 18, 10, 13, 4, 20,...",0


In [12]:
xtrain, ytrain = np.vstack(df_train['text']), tf.one_hot(np.array(df_train['label']), depth=2)
xtest, ytest = np.vstack(df_test['text']), tf.one_hot(np.array(df_test['label']), depth=2)

In [13]:
train_ds = tf.data.Dataset.from_tensor_slices((xtrain,ytrain)).batch(32)
test_ds =tf.data.Dataset.from_tensor_slices((xtest,ytest)).batch(32)

In [56]:
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.emb = tf.keras.layers.Embedding(input_dim=160, output_dim=64, input_length=100)
        self.conv1 = tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=(100,64))
        self.pool1 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.conv2 = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(96,32))
        self.pool2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.conv3 = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(92,16))
        self.pool3 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        self.flat = tf.keras.layers.Flatten(input_shape=(84, 8))
        self.dense = tf.keras.layers.Dense(128, activation='relu')
        self.classifier = tf.keras.layers.Dense(2, activation='softmax')
    
    def call(self, x):
        x = self.emb(x)
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.pool3(x)
        x = self.flat(x)
        x = self.dense(x)
        return self.classifier(x)
    
model = Model()

In [134]:
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.emb = tf.keras.layers.Embedding(input_dim=160, output_dim=128, input_length=100)
        
        self.conv1 = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(100, 128))
        self.conv2 = tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=(96, 64))
        self.conv3 = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(92, 32))
        
        self.conv_a_q = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(88, 16))
        self.conv_a_k = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(88, 16))
        self.conv_a_v = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(88, 16))
        
        self.conv_b_q = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(88, 16))
        self.conv_b_k = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(88, 16))
        self.conv_b_v = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(88, 16))

        self.pool = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, data_format="channels_first")
        
        self.flat = tf.keras.layers.Flatten(input_shape=(88, 16))
        self.dense1 = tf.keras.layers.Dense(512, activation='relu')
        self.dense2 = tf.keras.layers.Dense(128, activation='relu')
        self.classifier = tf.keras.layers.Dense(2, activation='softmax')
        
    def call(self, x):
        x = self.emb(x)
        
        x = self.conv1(x)
        x = self.pool(x)

        x = self.conv2(x)
        x = self.pool(x)
        
        x = self.conv3(x)
        x = self.pool(x)
        
        x_a_q = self.conv_a_q(x)
        x_a_k = self.conv_a_k(x)
        x_a_v = self.conv_a_v(x)
        scaled_dot_a = tf.math.reduce_sum(tf.math.multiply(x_a_q, x_a_v), axis=2) / math.sqrt(16)
        print(scaled_dot_a.shape)
        a_a = tf.nn.softmax(scaled_dot_a, axis=1)
        print(a_a.shape)
        x_a = tf.math.multiply(a_a, x_a_v)
        
        x_b_q = self.conv_b_q(x)
        x_b_k = self.conv_b_k(x)
        x_b_v = self.conv_b_v(x)
        scaled_dot_b = tf.math.reduce_sum(tf.math.multiply(x_b_q, x_b_v), axis=2) / math.sqrt(16)
        a_b = tf.nn.softmax(scaled_dot_b, axis=1)
        x_b = tf.math.multiply(a_b, x_b_v)
        
        x = tf.math.multiply(x_a, x_b)

        x = self.flat(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.classifier(x)
    
model = Model()

In [135]:
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

In [136]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

In [137]:
@tf.function
def train_step(documents, labels):
    with tf.GradientTape() as tape:
        predictions = model(documents)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    train_accuracy(labels, predictions)

In [138]:
@tf.function
def test_step(documents, labels):
    predictions = model(documents)
    loss = loss_object(labels, predictions)
    
    test_loss(loss)
    test_accuracy(labels, predictions)

In [139]:
EPOCHS = 100

for epoch in range(EPOCHS):
    for documents, labels in train_ds:
        train_step(documents, labels)
    for documents, labels in test_ds:
        test_step(documents, labels)
    
    if epoch % 10 == 0:
        template = "[EPOCH {}/{}], LOSS: {}, ACCURACY: {}, TEST_LOSS: {}, TEST_ACCURACY: {}"
        print(template.format(
            epoch+1,
            EPOCHS,
            train_loss.result(),
            train_accuracy.result()*100,
            test_loss.result(),
            test_accuracy.result()*100
        ))

(32, 84)
(32, 84)


ValueError: in user code:

    <ipython-input-137-a4be6247b1aa>:4 train_step  *
        predictions = model(documents)
    <ipython-input-134-2875581bed3b>:44 call  *
        x_a = tf.math.multiply(a_a, x_a_v)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:509 multiply
        return gen_math_ops.mul(x, y, name)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_math_ops.py:6176 mul
        "Mul", x=x, y=y, name=name)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py:593 _create_op_internal
        compute_device)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:3485 _create_op_internal
        op_def=op_def)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1975 __init__
        control_input_ops, op_def)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 32 and 84 for '{{node model_18/Mul_1}} = Mul[T=DT_FLOAT](model_18/Softmax, model_18/conv1d_161/Relu)' with input shapes: [32,84], [32,84,16].


In [None]:
def predict(text=None):
    if text is None:
        text = input()
    x = j2hcj(h2j(text))
    x = preprocessing(list(x))
    prediction = model.predict(np.array([x]))
    if prediction[0][0] > prediction[0][1]:
        print('정상입니다.')
    else:
        print('욕입니다.')
    print(prediction)

In [None]:
predict()

In [None]:
model.save('models/model')

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model('models/model')
tflite_model = converter.convert()

with open('models/model.tflite', 'wb') as f:
    f.write(tflite_model)