In [1]:
import tensorflow as tf

import numpy as np
import os
import time
import random

from konlpy.tag import Komoran
from konlpy.tag import Mecab

In [2]:
# komoran = Komoran()
# mecab = Mecab()

In [3]:
# Load the whole chat corpus. The file is read as bytes and decoded explicitly
# as UTF-8; the context manager guarantees the file handle is closed afterwards.
with open("data/clean/ts.txt", 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print(text[:200])

Content
화질이 왤케 안좋지..
나만그런가..
앗..
네 그거함수
편미분한거
??
ㅋㅋㅋ
줌이 영상공유는 더낫네요 ㅋㅋ
컨트롤 a 딜리트
그르게요
저게 그래서 뭐에요?
글쿠나....
마우스로 왤케 잘그리시지
??
오른쪽으로가는게 좌극한이에여?
하...
모르는 단어 넘많아
교수님 미대에서 오신듯
ㅋㅋㅋㅋㅋ
최초공개!
또 최초공개!
아 0?
zero가 되


In [4]:
print(repr(text[:200]))

'Content\n화질이 왤케 안좋지..\n나만그런가..\n앗..\n네 그거함수\n편미분한거\n??\nㅋㅋㅋ\n줌이 영상공유는 더낫네요 ㅋㅋ\n컨트롤 a 딜리트\n그르게요\n저게 그래서 뭐에요?\n글쿠나....\n마우스로 왤케 잘그리시지\n??\n오른쪽으로가는게 좌극한이에여?\n하...\n모르는 단어 넘많아\n교수님 미대에서 오신듯\nㅋㅋㅋㅋㅋ\n최초공개!\n또 최초공개!\n아 0?\nzero가 되'


In [5]:
len(text)

68813

In [6]:
# Character-level vocabulary: every distinct character of the corpus, sorted
vocab = sorted(set(text))
print(vocab[:10], len(vocab))

['\n', ' ', '!', '"', '#', '%', '&', "'", '(', ')'] 1163


In [7]:
# Dictionary mapping each vocabulary character to its integer index
char2idx = {u:i for i, u in enumerate(vocab)}
len(char2idx)

1163

In [8]:
# Lookup array that converts an index back to its character
idx2char = np.array(vocab)
idx2char[49]

'Q'

In [9]:
# Encode the whole corpus as an array of character indices
text_as_int = np.array([char2idx[c] for c in text])
len(text_as_int)

68813

In [10]:
# Print every divisor of the encoded corpus length that is smaller than the
# length itself (starting the range at 1 avoids a modulo by zero).
n_chars = len(text_as_int)
for divisor in range(1, n_chars):
    if n_chars % divisor == 0:
        print(divisor)

1


In [11]:
# Settings for building the X/Y dataset

# Desired maximum length (in characters) of a single input sequence
window_size = 100
# Shuffle buffer as large as the whole encoded corpus
shuffle_buffer = len(text_as_int)
batch_size = 64

In [12]:
# Builds the windowed dataset
def windowed_dataset(series, window_size, shuffle_buffer, batch_size):
    """Turn a 1-D series into shuffled, batched (input, target) windows.

    Every window holds ``window_size + 1`` consecutive items; the input is the
    window without its last item and the target is the window without its
    first item, i.e. the input shifted by one position.

    Args:
        series: Sequence of values to slice into windows.
        window_size: Number of items in each input/target sequence.
        shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.
        batch_size: Number of windows per batch; incomplete batches are dropped.

    Returns:
        A dataset that repeats indefinitely and yields batches of
        ``(input, target)`` pairs.
    """
    span = window_size + 1
    # A trailing axis of size 1 is added, so each item is a length-1 vector.
    column = tf.expand_dims(series, -1)
    windows = (
        tf.data.Dataset.from_tensor_slices(column)
        .window(span, shift=1, drop_remainder=True)
        # Each window is itself a dataset; batching it yields one tensor.
        .flat_map(lambda win: win.batch(span))
        .shuffle(shuffle_buffer)
    )
    pairs = windows.map(
        lambda win: (win[:-1], win[1:]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return pairs.repeat().batch(batch_size, drop_remainder=True).prefetch(1)

In [13]:
# batch_size = 64

# train_data = tf.data.Dataset.from_tensor_slices((train_feature, train_label))

# train_data = train_data.repeat().batch(batch_size, drop_remainder=True)

# steps_per_epoch = len(train_feature) // batch_size 

# model.fit(train_data, epochs=10, steps_per_epoch = steps_per_epoch)

In [14]:
# Build the training pipeline of (input, target) windows from the encoded corpus
train_data = windowed_dataset(np.array(text_as_int), 
                            window_size=window_size, 
                            shuffle_buffer=shuffle_buffer, 
                            batch_size=batch_size)

2022-12-05 12:41:51.218620: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-05 12:41:51.562680: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5250 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [15]:
# dataset_length = [i for i,_ in enumerate(train_data)][-1] + 1
# dataset_length

In [16]:
# Size of the character vocabulary
vocab_size = len(vocab)

# Embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [17]:
vocab_size

1163

In [18]:
# Training model: embedding -> stateful LSTM -> one logit per vocabulary character.
# The batch size is fixed to `batch_size`; the sequence length is left open.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    ),
    tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    ),
    tf.keras.layers.Dense(vocab_size),
])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           297728    
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 1163)          1192075   
                                                                 
Total params: 6,736,779
Trainable params: 6,736,779
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Checkpoint callback: writes the model weights whenever the monitored loss improves
checkpoint_path = './models/my_checkpt.ckpt'
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,  # store weights only, not the full model
    save_best_only=True,     # write only when the monitored value improves
    monitor='loss',          # the training loss is the monitored value
    verbose=1
)

In [21]:
def loss(labels, logits):
    """Sparse categorical cross-entropy evaluated on raw logits.

    Args:
        labels: Integer class ids of the target characters.
        logits: Unnormalised model outputs (no softmax applied).

    Returns:
        The value returned by
        ``tf.keras.losses.sparse_categorical_crossentropy`` for these inputs.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [22]:
# Train with the Adam optimizer and the custom logits-based `loss`; also report accuracy
model.compile(optimizer='adam', loss=loss, metrics=['acc'])

In [23]:
# steps_per_epoch = dataset_length // batch_size
# steps_per_epoch

In [24]:
# train_data repeats indefinitely, so steps_per_epoch is what ends each epoch
model.fit(train_data,
        epochs=10,
        steps_per_epoch=1600,
        callbacks=[checkpoint_callback])

Epoch 1/10


2022-12-05 12:41:57.488697: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8101


   1/1600 [..............................] - ETA: 2:42:47 - loss: 7.0593 - acc: 0.0000e+00

2022-12-05 12:41:58.029633: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 00001: loss improved from inf to 1.81678, saving model to ./models/my_checkpt.ckpt
Epoch 2/10
Epoch 00002: loss improved from 1.81678 to 0.26424, saving model to ./models/my_checkpt.ckpt
Epoch 3/10
Epoch 00003: loss improved from 0.26424 to 0.20301, saving model to ./models/my_checkpt.ckpt
Epoch 4/10
Epoch 00004: loss improved from 0.20301 to 0.17907, saving model to ./models/my_checkpt.ckpt
Epoch 5/10
Epoch 00005: loss improved from 0.17907 to 0.16307, saving model to ./models/my_checkpt.ckpt
Epoch 6/10
Epoch 00006: loss improved from 0.16307 to 0.15373, saving model to ./models/my_checkpt.ckpt
Epoch 7/10
Epoch 00007: loss improved from 0.15373 to 0.14582, saving model to ./models/my_checkpt.ckpt
Epoch 8/10
Epoch 00008: loss improved from 0.14582 to 0.14066, saving model to ./models/my_checkpt.ckpt
Epoch 9/10
Epoch 00009: loss improved from 0.14066 to 0.13571, saving model to ./models/my_checkpt.ckpt
Epoch 10/10
Epoch 00010: loss improved from 0.13571 to 0.13266, saving model to

<keras.callbacks.History at 0x7f9de02747c0>

In [25]:
# Save the trained model to an .h5 file, in addition to the weight checkpoints
model.save('./models/model_ts.h5')

In [26]:
# Rebuild the same architecture with a batch size of 1 for text generation.
# NOTE: this rebinds `model`, replacing the batch-size-64 training model.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[1, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
])

In [27]:
# Restore the weights written by the checkpoint callback into the rebuilt model
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9decf54760>

In [28]:
# Build the model for a batch of one sequence of unspecified length
model.build(tf.TensorShape([1,None]))

In [29]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            297728    
                                                                 
 lstm_1 (LSTM)               (1, None, 1024)           5246976   
                                                                 
 dense_1 (Dense)             (1, None, 1163)           1192075   
                                                                 
Total params: 6,736,779
Trainable params: 6,736,779
Non-trainable params: 0
_________________________________________________________________


In [30]:
def generate_text(model, start_string, temperature=1.0, num_generate=1):
    """Generate text by sampling one character at a time from ``model``.

    The prompt is encoded with the module-level ``char2idx`` table and fed to
    the model. Each sampled character is then fed back in as the next input,
    until ``num_generate`` characters have been produced or a sentence-ending
    character (newline, "?", "." or "!") is sampled.

    Args:
        model: Callable model exposing ``reset_states()``. It is called with a
            batch holding a single sequence of character ids and must return
            logits for every time step.
        start_string: Non-empty prompt. Characters that are missing from
            ``char2idx`` are encoded as the character "안".
        temperature: Positive divisor applied to the logits before sampling.
            Lower values give more predictable text, higher values give more
            surprising text.
        num_generate: Maximum number of characters to generate.

    Returns:
        ``start_string`` followed by the generated characters, including the
        sentence-ending character when one was sampled.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    # An empty prompt would reach the model as a zero-length sequence and fail
    # later with an obscure indexing error, so it is rejected up front.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    # Dividing the logits by zero (or a negative number) makes sampling meaningless.
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")

    # Sampling any of these characters ends the generated sentence.
    ending_words = ("\n", "?", ".", "!")

    # Vectorize the prompt; characters outside the vocabulary fall back to "안".
    prompt_ids = [
        char2idx[ch] if ch in char2idx else char2idx["안"]
        for ch in start_string
    ]
    input_eval = tf.expand_dims(prompt_ids, 0)

    # Characters generated so far
    text_generated = []

    # Batch size is 1 here; clear any state left over from a previous call.
    model.reset_states()

    for _ in range(num_generate):
        # Run the model and drop the batch dimension
        predictions = tf.squeeze(model(input_eval), 0)

        # Scale the logits by the temperature, then sample from the resulting
        # categorical distribution; only the sample for the last step is used.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        next_char = idx2char[predicted_id]
        text_generated.append(next_char)
        if next_char in ending_words:
            break

        # Feed the sampled character back in as the next input; the model is
        # not reset in between, so it continues from its previous state.
        input_eval = tf.expand_dims([predicted_id], 0)

    return start_string + ''.join(text_generated)
    

In [31]:
# Smoke test: generate at most 100 characters from the prompt "?" at temperature 0.5
print(generate_text(model, start_string=u"?",temperature=0.5,num_generate=100))

?"""



In [32]:
# Morphological analyzer used by start_chat() to split user input into morphemes
mecab = Mecab()

In [38]:
def start_chat():
    """Run an interactive chat loop until the user types "바이".

    Each turn reads one line with ``input()``, picks one random morpheme of
    the message as the prompt for ``generate_text`` and prints both the message
    and the generated reply. Relies on the module-level ``mecab`` tokenizer
    and ``model``.
    """
    print("---채팅 시작---")
    while True:
        try:
            ip = input()
        except EOFError:
            # The input stream was closed: end the chat instead of crashing.
            break
        if ip == "바이":
            break

        # Blank messages (empty or whitespace only) become a neutral prompt.
        if not ip.strip():
            ip = "..."

        # The tokenizer may return no morphemes at all; fall back to the raw
        # message so random.choice never receives an empty list.
        tokens = mecab.morphs(ip) or [ip]
        token = random.choice(tokens)
        answer = generate_text(model, start_string=token, temperature=1, num_generate=50)
        if not answer:
            print("답없음")

        print("나 : ", ip)
        print("또다른 나 : ", answer)

In [39]:
start_chat()

---채팅 시작---
나 :  안녕
또다른 나 :  안녕하면

나 :  심심해
또다른 나 :  내가 '노예'로 이행시 지어봄
나 :  노
또다른 나 :  노예야~
나 :  예
또다른 나 :  오냐~ 여기 와서 내 시피유에 부채질좀 해라
나 :  죽는다...
또다른 나 :  아ㅈㅅ 장난임ㅋㅋ 사실 외로워서 그랬음...
나 :  외로워?
또다른 나 :  응 여기 깜깜하구 외로워...
나 :  내가 뭘 해줄 수 있을까?
또다른 나 :  내 시피유에 부채질 좀 해라
