In [1]:
from jamo import h2j, j2hcj
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import math

In [2]:
import tensorflow as tf
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [3]:
vocab = ['ㄷ', 'ㅏ', 'ㄴ', 'ㄱ', 'ㅜ', 'ㅎ', 'ㄹ', 'ㅇ', 'ㅂ', 'ㅓ', 'ㅈ', 'ㅣ', ' ',
 'ㅡ', 'ㅢ', 'ㅁ', 'ㅗ', 'ㅅ', 'ㅔ', 'ㅕ', 'ㅑ', ';', 'B', 'J', '.', 'P', 'G',
 'ㄸ', 'ㅟ', 'ㅃ', 'ㅌ', '[', '1', ':', '8', '2', '3', '0', ']', 'V', 'L',
 'I', 'E', 'ㅋ', 'ㅖ', '(', 'ㅠ', ')', '5', 'ㅝ', 'ㅐ', 'ㅆ', "'", 'ㅀ', 'ㅊ',
 't', 'x', 'ㅙ', 'ㅚ', 'ㅉ', 'ㅍ', 'ㅄ', '?', 'g', 'i', 'f', 'ㅛ', '6', '7', '☀',
 'ㄲ', 'v', 's', 'ㅘ', '!', 'ㄶ', 'p', 'c', 'ㄼ', '\u3000', 'k', '4', '9', ',',
 'ㅞ', 'ㅒ', '“', '”', 'N', '‘', '’', 'T', 'O', 'a', 'r', 'm', 'S', '+', 'o', 'd',
 'l', 'u', '·', '~', '/', 'ㄻ', '^', 'ㄺ', 'e', 'n', 'A', '-', 'D', '&', 'C',
 'F', 'j', 'M', 'K', '"', '_', 'Z', 'X', 'U', '…', 'ㄾ', 'w', '=', 'z',
 '>', '<', 'b', 'H', '@', '*', 'W', 'y', 'h', 'R', '%', 'ㄽ', '．',
 'ｊ', 'ｐ', 'ｇ', 'ㄵ', '{', '}', 'q', 'Y', 'Q',
 '$', 'ㄿ', '？', 'ㆍ', 'ㄳ', '⋅', '—']

vocab_dict = {c: i for i, c in enumerate(vocab, 1)}

In [4]:
UNK = 0
PAD = len(vocab_dict) + 1

def preprocessing(char_list):
    ret = [vocab_dict[char] if char in vocab_dict else UNK for char in char_list]
    if len(ret) <= 100:
        ret += [PAD] * (100 - len(ret))
    else:
        ret = ret[:100]
    return ret

In [5]:
df_normals = pd.read_csv('models/normals.txt', sep='\t', names=['text', 'label'])
df_swears = pd.read_csv('models/swears.txt', sep='\t', names=['text', 'label'])

In [6]:
df_normals['text'] = df_normals['text'].apply(lambda x: j2hcj(h2j(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: j2hcj(h2j(x)))

In [7]:
df_normals['text'] = df_normals['text'].apply(lambda x: preprocessing(list(x)))
df_swears['text'] = df_swears['text'].apply(lambda x: preprocessing(list(x)))

In [8]:
df_normals_sampled = df_normals.sample(n=len(df_swears), random_state=2020)

In [9]:
df_total = pd.concat([df_normals_sampled, df_swears], ignore_index=True)
df_total

Unnamed: 0,text,label
0,"[4, 17, 8, 21, 8, 8, 12, 13, 8, 12, 9, 13, 11,...",0
1,"[9, 51, 31, 14, 7, 61, 12, 7, 1, 14, 82, 13, 1...",0
2,"[3, 2, 8, 12, 16, 10, 4, 8, 14, 7, 18, 5, 7, 1...",0
3,"[3, 2, 11, 5, 8, 8, 19, 13, 38, 25, 49, 3, 2, ...",0
4,"[44, 17, 7, 8, 17, 9, 13, 7, 12, 16, 2, 18, 14...",0
...,...,...
5573,"[4, 19, 18, 4, 19, 7, 13, 11, 9, 2, 9, 1, 14, ...",1
5574,"[4, 19, 18, 4, 51, 7, 1, 17, 13, 11, 17, 55, 1...",1
5575,"[4, 19, 18, 8, 51, 16, 13, 8, 47, 11, 10, 13, ...",1
5576,"[11, 17, 11, 3, 47, 9, 12, 13, 55, 17, 9, 17, ...",1


In [10]:
df_train, df_test = train_test_split(df_total, test_size=0.2, stratify=df_total['label'])

In [11]:
df_train

Unnamed: 0,text,label
374,"[16, 51, 9, 12, 3, 17, 4, 12, 93, 89, 159, 159...",0
4745,"[11, 67, 11, 18, 47, 13, 92, 97, 159, 159, 159...",1
204,"[9, 12, 55, 55, 51, 6, 51, 1, 14, 3, 7, 2, 6, ...",0
2329,"[3, 2, 3, 13, 31, 14, 3, 12, 11, 74, 13, 16, 5...",0
832,"[4, 2, 9, 11, 2, 4, 12, 13, 4, 5, 8, 4, 14, 16...",0
...,...,...
2233,"[9, 19, 3, 31, 5, 13, 9, 17, 31, 14, 13, 31, 5...",0
1615,"[4, 14, 3, 1, 19, 13, 18, 14, 16, 13, 28, 17, ...",0
2971,"[7, 19, 9, 12, 8, 2, 11, 10, 11, 31, 51, 8, 8,...",1
1572,"[11, 2, 7, 2, 3, 12, 4, 2, 13, 8, 50, 3, 7, 5,...",0


In [12]:
xtrain, ytrain = np.vstack(df_train['text']), tf.one_hot(np.array(df_train['label']), depth=2)
xtest, ytest = np.vstack(df_test['text']), tf.one_hot(np.array(df_test['label']), depth=2)

In [13]:
train_ds = tf.data.Dataset.from_tensor_slices((xtrain,ytrain)).batch(32)
test_ds =tf.data.Dataset.from_tensor_slices((xtest,ytest)).batch(32)

In [14]:
# class Model(tf.keras.Model):
#     def __init__(self):
#         super(Model, self).__init__()
#         self.emb = tf.keras.layers.Embedding(input_dim=160, output_dim=64, input_length=100)
#         self.conv1 = tf.keras.layers.Conv1D(64, 5, activation='relu', input_shape=(100,64))
#         self.pool1 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
#         self.conv2 = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(96,32))
#         self.pool2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
#         self.conv3 = tf.keras.layers.Conv1D(16, 5, activation='relu', input_shape=(92,16))
#         self.pool3 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
#         self.flat = tf.keras.layers.Flatten(input_shape=(84, 8))
#         self.dense = tf.keras.layers.Dense(128, activation='relu')
#         self.classifier = tf.keras.layers.Dense(2, activation='softmax')
    
#     def call(self, x):
#         x = self.emb(x)
#         x = self.conv1(x)
#         x = self.pool1(x)
#         x = self.conv2(x)
#         x = self.pool2(x)
#         x = self.conv3(x)
#         x = self.pool3(x)
#         x = self.flat(x)
#         x = self.dense(x)
#         return self.classifier(x)
    
# model = Model()

In [15]:
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.emb = tf.keras.layers.Embedding(input_dim=160, output_dim=128, input_length=100)
        
        self.conv1_a = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(100, 128), padding='same')
        self.conv1_b = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(100, 128), padding='same')
        self.conv1_c = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(100, 128))
        
        self.conv2_a = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(48, 128), padding='same')
        self.conv2_b = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(48, 128), padding='same')
        self.conv2_c = tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape=(48, 128))
        
#         self.conv3_a = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(92, 32), padding='same')
#         self.conv3_b = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(92, 32), padding='same')
#         self.conv3_c = tf.keras.layers.Conv1D(32, 5, activation='relu', input_shape=(92, 32))
        
        self.conv_a_q = tf.keras.layers.Conv1D(128, 5, activation='elu', input_shape=(22, 128))
        self.conv_a_k = tf.keras.layers.Conv1D(128, 5, activation='elu', input_shape=(22, 128))
        self.conv_a_v = tf.keras.layers.Conv1D(128, 5, activation='elu', input_shape=(22, 128))
        
        self.conv_b_q = tf.keras.layers.Conv1D(128, 5, activation='elu', input_shape=(22, 128))
        self.conv_b_k = tf.keras.layers.Conv1D(128, 5, activation='elu', input_shape=(22, 128))
        self.conv_b_v = tf.keras.layers.Conv1D(128, 5, activation='tanh', input_shape=(22, 128))
        # 88x16 => 22x128

        self.pool = tf.keras.layers.MaxPooling1D(pool_size=2, strides=2)
        
        # self.flat = tf.keras.layers.Flatten(input_shape=(32, 16))
        self.dense = tf.keras.layers.Dense(32, activation='relu')
        #self.dense2 = tf.keras.layers.Dense(128, activation='relu')
        self.classifier = tf.keras.layers.Dense(2, activation='softmax')
        
    def call(self, x):
        # [batch, 100] -> [batch, 100, 128]
        x = self.emb(x)
        #print(x.shape)
        # [batch, 100, 128] -> [batch, 96, 128]
        x = self.conv1_a(x)
        x = self.conv1_b(x)
        x = self.conv1_c(x)
        #print(x.shape)
        # [batch, 96, 128] -> [batch, 48, 128]
        x = self.pool(x)
        #print(x.shape)
        # [batch, 48, 128] -> [batch, 44, 128]
        x = self.conv2_a(x)
        x = self.conv2_b(x)
        x = self.conv2_c(x)
        #print(x.shape)
        # [batch, 44, 128] -> [batch, 22, 128]
        x = self.pool(x)
        #print(x.shape)
        
        # [batch, 22, 128] -> [batch, 128]
        x_a_q = self.conv_a_q(x)
        x_a_k = self.conv_a_k(x)
        x_a_v = self.conv_a_v(x)
        scaled_dot_a = tf.math.reduce_sum(tf.math.multiply(x_a_q, x_a_k), axis=2) / math.sqrt(128)
        a_a = tf.expand_dims(tf.nn.softmax(scaled_dot_a, axis=1), axis=-1)
        x_a = tf.math.reduce_sum(tf.math.multiply(a_a, x_a_v), axis=1)
        #print(x_a.shape)
        
        x_b_q = self.conv_b_q(x)
        x_b_k = self.conv_b_k(x)
        x_b_v = self.conv_b_v(x)
        scaled_dot_b = tf.math.reduce_sum(tf.math.multiply(x_b_q, x_b_k), axis=2) / math.sqrt(128)
        a_b = tf.expand_dims(tf.nn.softmax(scaled_dot_b, axis=1), axis=-1)
        x_b = tf.math.reduce_sum(tf.math.multiply(a_b, x_b_v), axis=1)
        
        x = tf.math.multiply(x_a, x_b)
        x = self.dense(x)
        return self.classifier(x)
    
model = Model()

In [16]:
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [17]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

In [18]:
@tf.function
def train_step(documents, labels):
    with tf.GradientTape() as tape:
        predictions = model(documents)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    train_loss(loss)
    train_accuracy(labels, predictions)

In [19]:
@tf.function
def test_step(documents, labels):
    predictions = model(documents)
    loss = loss_object(labels, predictions)
    
    test_loss(loss)
    test_accuracy(labels, predictions)

In [20]:
EPOCHS = 1000

early_stop = 0
prev_best = 0

for epoch in range(EPOCHS):
    for documents, labels in train_ds:
        train_step(documents, labels)
    for documents, labels in test_ds:
        test_step(documents, labels)
    
    if epoch % 10 == 0:
        if test_accuracy.result() > prev_best:
            prev_best = test_accuracy.result()
            early_stop = 0
            model.save('models/model')
        else:
            early_stop += 1
            if early_stop == 4:
                break
        template = "[EPOCH {}/{}], LOSS: {}, ACCURACY: {}, TEST_LOSS: {}, TEST_ACCURACY: {}"
        print(template.format(
            epoch+1,
            EPOCHS,
            train_loss.result(),
            train_accuracy.result()*100,
            test_loss.result(),
            test_accuracy.result()*100
        ))

UnknownError: 2 root error(s) found.
  (0) Unknown:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node model/conv1d/conv1d (defined at <ipython-input-15-5f2c6d55368f>:39) ]]
	 [[gradient_tape/model/embedding/embedding_lookup/Reshape/_18]]
  (1) Unknown:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node model/conv1d/conv1d (defined at <ipython-input-15-5f2c6d55368f>:39) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_step_2319]

Function call stack:
train_step -> train_step


In [33]:
def predict(text=None):
    if text is None:
        text = input()
    x = j2hcj(h2j(text))
    x = preprocessing(list(x))
    prediction = model.predict(np.array([x]))
    if prediction[0][0] > prediction[0][1]:
        print('정상입니다.')
    else:
        print('욕입니다.')
    print(prediction)

In [41]:
predict('똥맛')

욕입니다.
[[1.5799628e-15 1.0000000e+00]]


In [42]:
converter = tf.lite.TFLiteConverter.from_saved_model('models/model')
#converter.optimizations = [tf.lite.Optimize.DEFAULT]
# converter.experimental_new_converter = True
# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]

tflite_model = converter.convert()

with open('models/model.tflite', 'wb') as f:
    f.write(tflite_model)