In [6]:
import glob
import os
import re
import tensorflow as tf

txt_file_path = 'data/lyrics/*'

# Sort the matches: glob returns files in an unspecified, filesystem-dependent
# order, and the corpus order feeds the seeded train/validation split later on.
txt_list = sorted(glob.glob(txt_file_path))

raw_corpus = []

# Read every lyrics file and collect all of their lines into raw_corpus.
for txt_file in txt_list:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(txt_file, "r", encoding="utf-8") as f:
        raw_corpus.extend(f.read().splitlines())

print("데이터 크기:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])

def preprocess_sentence(sentence):
    """Normalize one raw lyric line and wrap it in <start>/<end> markers.

    The line is lower-cased and stripped, then three regex substitutions
    are applied in order:
      1. surround each of ? . ! , ¿ with spaces,
      2. collapse runs of spaces / double quotes into a single space,
      3. replace every run of characters other than letters and ? . ! , ¿
         with a single space.
    The result is stripped again and returned as
    '<start> ' + cleaned + ' <end>'.
    """
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )

    cleaned = sentence.lower().strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return '<start> ' + cleaned.strip() + ' <end>'

corpus = []

for sentence in raw_corpus:
    # Skip blank lines (including whitespace-only ones) up front.
    if not sentence.strip():
        continue
    # Skip long lines. Note this counts whitespace-separated words of the RAW
    # line, i.e. before punctuation is split off and <start>/<end> are added.
    if len(sentence.split()) > 15:
        continue

    preprocessed_sentence = preprocess_sentence(sentence)

    # A line made only of characters that cleaning removes (digits, symbols)
    # is reduced to just the two markers; such a sample carries no lyric
    # content, so drop it instead of training on "<start> <end>".
    if len(preprocessed_sentence.split()) <= 2:
        continue

    corpus.append(preprocessed_sentence)

def tokenize(corpus, num_words=12000):
    """Fit a word-level tokenizer on `corpus` and return padded id sequences.

    Args:
        corpus: iterable of already-cleaned sentences (strings).
        num_words: vocabulary size limit handed to the Keras Tokenizer.
            Defaults to 12000, the value this notebook has always used.

    Returns:
        (tensor, tokenizer): `tensor` is the output of `pad_sequences` with
        padding='post' (shorter sequences are padded at the end), and
        `tokenizer` is the fitted Tokenizer.
    """
    # The sentences are already cleaned, so the only "filter" is the space
    # character. Words outside the kept vocabulary are mapped to '<unk>'.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        filters=' ',
        oov_token="<unk>"
    )
    # Build the tokenizer's internal vocabulary from the corpus.
    tokenizer.fit_on_texts(corpus)
    # Convert every sentence into a list of integer word ids.
    tensor = tokenizer.texts_to_sequences(corpus)
    # Make all sequences the same length by padding at the end;
    # use padding='pre' instead to pad at the front.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    return tensor, tokenizer

# Tokenize the cleaned corpus into a padded id matrix plus its fitted tokenizer.
tensor, tokenizer = tokenize(corpus)

# Source drops the last column and target drops the first, so each target
# position holds the token that follows the corresponding source position.
src_input, tgt_input = tensor[:, :-1], tensor[:, 1:]

데이터 크기: 187088
Examples:
 ['', '', 'All of this and more is for you']


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (source, target) pairs for validation; the fixed seed
# keeps the split repeatable for a given corpus order.
enc_train, enc_val, dec_train, dec_val = train_test_split(
    src_input,
    tgt_input,
    test_size=0.2,
    random_state=42,
)

In [9]:
BATCH_SIZE = 256
# Shuffle buffer spans the whole training set.
BUFFER_SIZE = len(enc_train)
steps_per_epoch = len(enc_train) // BATCH_SIZE

# Vocabulary size seen by the model: the tokenizer's num_words plus one extra
# slot for index 0, which is used as <pad>.
VOCAB_SIZE = tokenizer.num_words + 1

# Build the training pipeline: slice the arrays into (source, target) pairs,
# shuffle them, and group them into fixed-size batches (the final partial
# batch is dropped).
# See https://www.tensorflow.org/api_docs/python/tf/data/Dataset
dataset = (
    tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

class TextGenerator(tf.keras.Model):
    """Next-token model: Embedding -> LSTM -> LSTM -> Dense over the vocabulary.

    The final Dense layer has no activation, so `call` returns raw scores
    (logits) of size `vocab_size` for every time step.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        """Create the layers.

        Args:
            vocab_size: number of token ids; used for both the embedding
                table and the output projection.
            embedding_size: width of each token embedding.
            hidden_size: number of units in each of the two LSTM layers.
        """
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        # Both LSTMs return the full sequence so that a prediction is
        # produced at every time step, not only the last one.
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Run token ids `x` through the network and return per-step logits."""
        embedded = self.embedding(x)
        hidden = self.rnn_2(self.rnn_1(embedded))
        return self.linear(hidden)

# Model hyperparameters.
embedding_size = 256
hidden_size = 1024

# Reuse VOCAB_SIZE (tokenizer.num_words + 1) rather than recomputing it, so
# the model and the rest of the notebook cannot drift apart.
model = TextGenerator(VOCAB_SIZE, embedding_size, hidden_size)

optimizer = tf.keras.optimizers.Adam()
# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

# The held-out split was created earlier but never used; batch it and pass it
# to fit() so a validation loss is reported after every epoch.
val_dataset = tf.data.Dataset.from_tensor_slices((enc_val, dec_val)).batch(BATCH_SIZE)

# NOTE(review): the NotImplementedError recorded below this cell ("Cannot
# convert a symbolic Tensor ... to a numpy array") is raised inside the LSTM's
# zero-state creation, not by this code. It looks like a TensorFlow/NumPy
# version mismatch in the environment — confirm the installed versions.
model.compile(loss=loss, optimizer=optimizer)
model.fit(dataset, validation_data=val_dataset, epochs=30)

Epoch 1/30


NotImplementedError: in user code:

    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /var/folders/hy/gwjxsmyd17j0hrz2pjrksmkc0000gn/T/ipykernel_27861/379259880.py:28 call  *
        out = self.rnn_1(out)
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent.py:660 __call__  **
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/engine/base_layer.py:1012 __call__
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent_v2.py:1157 call
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent.py:859 _process_inputs
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent.py:642 get_initial_state
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent.py:2506 get_initial_state
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent.py:2987 _generate_zero_filled_state_for_cell
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent.py:3003 _generate_zero_filled_state
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/util/nest.py:659 map_structure
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/util/nest.py:659 <listcomp>
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/keras/layers/recurrent.py:3000 create_zeros
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:2819 wrapped
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:2868 zeros
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:2804 _constant_if_small
        
    <__array_function__ internals>:5 prod
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3030 prod
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/numpy/core/fromnumeric.py:87 _wrapreduction
        
    /Users/me/miniforge3/envs/hongong/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:852 __array__
        

    NotImplementedError: Cannot convert a symbolic Tensor (text_generator_1/lstm_2/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported


In [None]:
# The trained TextGenerator in this notebook is bound to `model`; no variable
# named `lyricist` is defined in the visible cells.
# NOTE(review): `generate_text` is not defined in the visible cells either —
# confirm it is defined (or imported) before this cell is run.
generate_text(model, tokenizer, init_sentence="<start> i love", max_len=20)