In [1]:
from jamo import h2j, j2hcj
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import math

In [2]:
import tensorflow as tf
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [3]:
vocab = ['ㄷ', 'ㅏ', 'ㄴ', 'ㄱ', 'ㅜ', 'ㅎ', 'ㄹ', 'ㅇ', 'ㅂ', 'ㅓ', 'ㅈ', 'ㅣ', ' ',
 'ㅡ', 'ㅢ', 'ㅁ', 'ㅗ', 'ㅅ', 'ㅔ', 'ㅕ', 'ㅑ', ';', 'B', 'J', '.', 'P', 'G',
 'ㄸ', 'ㅟ', 'ㅃ', 'ㅌ', '[', '1', ':', '8', '2', '3', '0', ']', 'V', 'L',
 'I', 'E', 'ㅋ', 'ㅖ', '(', 'ㅠ', ')', '5', 'ㅝ', 'ㅐ', 'ㅆ', "'", 'ㅀ', 'ㅊ',
 't', 'x', 'ㅙ', 'ㅚ', 'ㅉ', 'ㅍ', 'ㅄ', '?', 'g', 'i', 'f', 'ㅛ', '6', '7', '☀',
 'ㄲ', 'v', 's', 'ㅘ', '!', 'ㄶ', 'p', 'c', 'ㄼ', '\u3000', 'k', '4', '9', ',',
 'ㅞ', 'ㅒ', '“', '”', 'N', '‘', '’', 'T', 'O', 'a', 'r', 'm', 'S', '+', 'o', 'd',
 'l', 'u', '·', '~', '/', 'ㄻ', '^', 'ㄺ', 'e', 'n', 'A', '-', 'D', '&', 'C',
 'F', 'j', 'M', 'K', '"', '_', 'Z', 'X', 'U', '…', 'ㄾ', 'w', '=', 'z',
 '>', '<', 'b', 'H', '@', '*', 'W', 'y', 'h', 'R', '%', 'ㄽ', '．',
 'ｊ', 'ｐ', 'ｇ', 'ㄵ', '{', '}', 'q', 'Y', 'Q',
 '$', 'ㄿ', '？', 'ㆍ', 'ㄳ', '⋅', '—']

vocab_dict = {c: i for i, c in enumerate(vocab, 1)}

In [4]:
UNK = 0
PAD = len(vocab_dict) + 1

def preprocessing(char_list):
    ret = [vocab_dict[char] if char in vocab_dict else UNK for char in char_list]
    if len(ret) <= 100:
        ret += [PAD] * (100 - len(ret))
    else:
        ret = ret[:100]
    return ret

In [5]:
df_normals = pd.read_csv('models/normals.txt', sep='\t', names=['text', 'label'])
df_swears = pd.read_csv('models/swears.txt', sep='\t', names=['text', 'label'])

In [6]:
df_normals['text'] = df_normals['text'].apply(lambda x: j2hcj(h2j(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: j2hcj(h2j(x)))

In [7]:
df_normals['text'] = df_normals['text'].apply(lambda x: preprocessing(list(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: preprocessing(list(x)))

In [8]:
df_normals_sampled = df_normals.sample(n=len(df_swears), random_state=2020)

In [9]:
df_total = pd.concat([df_normals_sampled, df_swears], ignore_index=True)
df_total

Unnamed: 0,text,label
0,"[6, 51, 8, 11, 10, 8, 18, 2, 7, 17, 13, 18, 2,...",0
1,"[18, 67, 3, 17, 18, 13, 55, 29, 18, 17, 13, 16...",0
2,"[1, 51, 4, 12, 8, 10, 9, 1, 17, 13, 18, 12, 3,...",0
3,"[6, 19, 8, 14, 8, 14, 8, 159, 159, 159, 159, 1...",0
4,"[4, 51, 7, 13, 16, 2, 4, 51, 7, 8, 12, 8, 10, ...",0
...,...,...
4977,"[8, 2, 18, 12, 9, 2, 7, 13, 4, 10, 3, 11, 17, ...",1
4978,"[8, 74, 13, 52, 12, 9, 2, 7, 13, 1, 14, 1, 12,...",1
4979,"[4, 14, 13, 18, 10, 8, 1, 2, 3, 13, 11, 17, 11...",1
4980,"[4, 2, 9, 11, 2, 4, 12, 13, 8, 12, 71, 4, 51, ...",1


In [10]:
df_train, df_test = train_test_split(df_total, test_size=0.2, stratify=df_total['label'])

In [11]:
df_train

Unnamed: 0,text,label
1607,"[8, 21, 8, 12, 13, 11, 2, 18, 12, 4, 1, 14, 7,...",0
1462,"[4, 2, 9, 11, 2, 4, 12, 13, 8, 2, 3, 55, 12, 8...",0
113,"[11, 19, 9, 2, 7, 13, 44, 51, 7, 12, 13, 28, 2...",0
2111,"[55, 5, 7, 1, 17, 8, 8, 19, 8, 10, 8, 5, 7, 61...",0
2763,"[11, 12, 18, 14, 31, 2, 8, 19, 13, 28, 17, 8, ...",1
...,...,...
1342,"[8, 12, 61, 8, 12, 13, 16, 2, 7, 7, 12, 3, 14,...",0
277,"[71, 12, 18, 5, 3, 8, 12, 13, 1, 47, 8, 17, 13...",0
4087,"[8, 74, 13, 18, 12, 9, 2, 13, 4, 21, 7, 2, 1, ...",1
956,"[73, 138, 94, 100, 99, 127, 13, 18, 10, 7, 16,...",0


In [12]:
xtrain, ytrain = np.vstack(df_train['text']), tf.one_hot(np.array(df_train['label']), depth=2)
xtest, ytest = np.vstack(df_test['text']), tf.one_hot(np.array(df_test['label']), depth=2)

In [13]:
train_ds = tf.data.Dataset.from_tensor_slices((xtrain,ytrain)).batch(32)
test_ds =tf.data.Dataset.from_tensor_slices((xtest,ytest)).batch(32)

In [20]:
# class Model(tf.keras.Model):
#     def __init__(self):
#         super(Model, self).__init__()
#         self.emb = tf.keras.layers.Embedding(input_dim=160, output_dim=64, input_length=100)
#         self.conv1 = tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=(100,64))
#         self.pool1 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
#         self.conv2 = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(96,32))
#         self.pool2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
#         self.conv3 = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(92,16))
#         self.pool3 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
#         self.flat = tf.keras.layers.Flatten(input_shape=(84, 8))
#         self.dense = tf.keras.layers.Dense(128, activation='relu')
#         self.classifier = tf.keras.layers.Dense(2, activation='softmax')
    
#     def call(self, x):
#         x = self.emb(x)
#         x = self.conv1(x)
#         x = self.pool1(x)
#         x = self.conv2(x)
#         x = self.pool2(x)
#         x = self.conv3(x)
#         x = self.pool3(x)
#         x = self.flat(x)
#         x = self.dense(x)
#         return self.classifier(x)
    
# model = Model()

In [21]:
# class ConvModel(tf.keras.Model):
#     def __init__(self, input_dim):
#         super(ConvModel, self).__init__()
#         self.dim = input_dim
#         self.conv_q = tf.keras.layers.Conv1D(input_dim[1], 5, activation='relu', input_shape=input_dim)
#         self.conv_k = tf.keras.layers.Conv1D(input_dim[1], 5, activation='relu', input_shape=input_dim)
#         self.conv_v = tf.keras.layers.Conv1D(input_dim[1], 5, activation='relu', input_shape=input_dim)
#         self.pool = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        
#     def call(self, x):
#         x_q = self.conv_q(x)
#         x_k = self.conv_k(x)
#         x_v = self.conv_v(x)
#         scaled_dot = tf.math.multiply(x_q, x_k) / math.sqrt(self.dim[1])
#         a = tf.nn.softmax(scaled_dot, axis=1)
#         return self.pool(a * x_v)

In [28]:
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.emb = tf.keras.layers.Embedding(input_dim=160, output_dim=128, input_length=100)
        
        self.conv1_q = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(100, 128))
        self.conv1_k = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(100, 128))
        self.conv1_v = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(100, 128))
        
        self.conv2_q = tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=(96, 64))
        self.conv2_k = tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=(96, 64))
        self.conv2_v = tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=(96, 64))

        self.pool = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, data_format="channels_first")
        
        self.flat = tf.keras.layers.Flatten(input_shape=(92, 32))
        self.dense1 = tf.keras.layers.Dense(512, activation='relu')
        self.dense2 = tf.keras.layers.Dense(128, activation='relu')
        self.classifier = tf.keras.layers.Dense(2, activation='softmax')
        
    def call(self, x):
        x = self.emb(x)
        
        x_q = self.conv1_q(x)
        x_k = self.conv1_k(x)
        x_v = self.conv1_v(x)
        scaled_dot = tf.math.multiply(x_q, x_k) / math.sqrt(128)
        a = tf.nn.softmax(scaled_dot, axis=1)
        x = self.pool(a * x_v)
        
        x_q = self.conv2_q(x)
        x_k = self.conv2_k(x)
        x_v = self.conv2_v(x)
        scaled_dot = tf.math.multiply(x_q, x_k) / math.sqrt(64)
        a = tf.nn.softmax(scaled_dot, axis=1)
        x = self.pool(a * x_v)

        x = self.flat(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.classifier(x)
    
model = Model()

In [29]:
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

In [30]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

In [31]:
@tf.function
def train_step(documents, labels):
    with tf.GradientTape() as tape:
        predictions = model(documents)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    train_accuracy(labels, predictions)

In [32]:
@tf.function
def test_step(documents, labels):
    predictions = model(documents)
    loss = loss_object(labels, predictions)
    
    test_loss(loss)
    test_accuracy(labels, predictions)

In [33]:
EPOCHS = 100

for epoch in range(EPOCHS):
    for documents, labels in train_ds:
        train_step(documents, labels)
    for documents, labels in test_ds:
        test_step(documents, labels)
    
    if epoch % 10 == 0:
        template = "[EPOCH {}/{}], LOSS: {}, ACCURACY: {}, TEST_LOSS: {}, TEST_ACCURACY: {}"
        print(template.format(
            epoch+1,
            EPOCHS,
            train_loss.result(),
            train_accuracy.result()*100,
            test_loss.result(),
            test_accuracy.result()*100
        ))

[EPOCH 1/100], LOSS: 0.7241511940956116, ACCURACY: 49.20953369140625, TEST_LOSS: 0.7240978479385376, TEST_ACCURACY: 49.94984817504883
[EPOCH 11/100], LOSS: 0.7192822694778442, ACCURACY: 51.13265609741211, TEST_LOSS: 0.7154316902160645, TEST_ACCURACY: 52.79475021362305
[EPOCH 21/100], LOSS: 0.6703925132751465, ACCURACY: 63.528709411621094, TEST_LOSS: 0.672185480594635, TEST_ACCURACY: 63.8725700378418
[EPOCH 31/100], LOSS: 0.6423004865646362, ACCURACY: 70.10725402832031, TEST_LOSS: 0.651398777961731, TEST_ACCURACY: 68.84201049804688
[EPOCH 41/100], LOSS: 0.6247989535331726, ACCURACY: 74.10594940185547, TEST_LOSS: 0.6396171450614929, TEST_ACCURACY: 71.61239624023438
[EPOCH 51/100], LOSS: 0.6130536794662476, ACCURACY: 76.75105285644531, TEST_LOSS: 0.6323032975196838, TEST_ACCURACY: 73.30619049072266
[EPOCH 61/100], LOSS: 0.6040775775909424, ACCURACY: 78.74899291992188, TEST_LOSS: 0.6264773011207581, TEST_ACCURACY: 74.75376892089844
[EPOCH 71/100], LOSS: 0.5974025726318359, ACCURACY: 80.233

In [None]:
def predict(text=None):
    if text is None:
        text = input()
    x = j2hcj(h2j(text))
    x = preprocessing(list(x))
    prediction = model.predict(np.array([x]))
    if prediction[0][0] > prediction[0][1]:
        print('정상입니다.')
    else:
        print('욕입니다.')
    print(prediction)

In [None]:
predict()

In [None]:
model.save('models/model')

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model('models/model')
tflite_model = converter.convert()

with open('models/model.tflite', 'wb') as f:
    f.write(tflite_model)