### 데이터 불러오기

In [1]:
import os, re
import numpy as np
import tensorflow as tf
import glob
from sklearn.model_selection import train_test_split

In [2]:
txt_file_path = os.getenv('HOME')+'/aiffel/lyricist/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("데이터 크기:", len(raw_corpus))
print("Examples:/n", raw_corpus[:10])

데이터 크기: 187088
Examples:/n ['At first I was afraid', 'I was petrified', 'I kept thinking I could never live without you', 'By my side But then I spent so many nights', "Just thinking how you've done me wrong", 'I grew strong', "I learned how to get along And so you're back", 'From outer space', 'I just walked in to find you', 'Here without that look upon your face I should have changed that fucking lock']


### 데이터 정제

In [3]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" /1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿,']+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

print(preprocess_sentence("This @_is ;;;sample        sentence."))

<start> this is sample sentence <end>


In [4]:
corpus =[]

for sentence in raw_corpus:
    if len(sentence.split(' ')) > 13: continue
    if len(sentence) == 0: continue
    #if sentence[-1] == ":": continue
        
    preprocessed_sentence = preprocess_sentence(sentence)
    corpus.append(preprocessed_sentence)
    
corpus[:10]

['<start> at first i was afraid <end>',
 '<start> i was petrified <end>',
 '<start> i kept thinking i could never live without you <end>',
 '<start> by my side but then i spent so many nights <end>',
 "<start> just thinking how you've done me wrong <end>",
 '<start> i grew strong <end>',
 "<start> i learned how to get along and so you're back <end>",
 '<start> from outer space <end>',
 '<start> i just walked in to find you <end>',
 '<start> i would have made you leave your key <end>']

### 텐서화

In [5]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000,
        filters=' ',
        oov_token="<unk>"
    )
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=15, padding='post')

    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)
tensor.shape

[[   2   61  257 ...    0    0    0]
 [   2    5   50 ...    0    0    0]
 [   2    5 1154 ...    0    0    0]
 ...
 [   2    7    5 ...    0    0    0]
 [   2  209   91 ...    0    0    0]
 [   2    4  172 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7fd658f7cf50>


(163042, 15)

In [6]:
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 512
steps_per_epoch = len(src_input) // BATCH_SIZE

VOCAB_SIZE = tokenizer.num_words + 1

dataset = tf.data.Dataset.from_tensor_slices((src_input, tgt_input))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((512, 14), (512, 14)), types: (tf.int32, tf.int32)>

### 데이터셋 나누기

In [7]:
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size=0.2)

In [8]:
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape) #(124960, 14)

Source Train: (130433, 14)
Target Train: (130433, 14)


### 모델만들기

In [9]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 512
hidden_size = 2048
lyricist = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)

In [10]:
for src_sample, tgt_sample in dataset.take(1): break
lyricist(src_sample)

<tf.Tensor: shape=(512, 14, 12001), dtype=float32, numpy=
array([[[ 1.02877028e-04,  2.68959819e-04, -2.48900091e-04, ...,
          2.19804933e-05, -6.49534413e-05, -1.61900152e-05],
        [-4.33874739e-05,  3.06912261e-04, -5.15774009e-04, ...,
         -2.93427202e-05, -3.46809829e-04, -2.72281439e-04],
        [-8.66742630e-05,  3.83853592e-04, -6.06052694e-04, ...,
         -2.39043206e-04, -5.66837145e-04, -3.59962694e-04],
        ...,
        [ 8.61685912e-05, -8.44131515e-04, -1.04152656e-03, ...,
          3.34027223e-03, -8.08041077e-04,  5.26698073e-04],
        [ 5.02900803e-04, -1.05763704e-03, -1.19680248e-03, ...,
          3.04019335e-03, -8.02280032e-04,  4.76910966e-04],
        [ 9.83915292e-04, -1.24919915e-03, -1.29928533e-03, ...,
          2.89224461e-03, -7.19355128e-04,  3.68148845e-04]],

       [[ 1.02877028e-04,  2.68959819e-04, -2.48900091e-04, ...,
          2.19804933e-05, -6.49534413e-05, -1.61900152e-05],
        [ 1.85350611e-04,  1.34461327e-04, -5

In [12]:
lyricist.summary()

Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6144512   
_________________________________________________________________
lstm (LSTM)                  multiple                  20979712  
_________________________________________________________________
lstm_1 (LSTM)                multiple                  33562624  
_________________________________________________________________
dense (Dense)                multiple                  24590049  
Total params: 85,276,897
Trainable params: 85,276,897
Non-trainable params: 0
_________________________________________________________________


### 학습
10 epoch, val_loss 2.2

In [14]:
optimizer = tf.keras.optimizers.Adam()

loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

lyricist.compile(loss=loss, optimizer=optimizer)
lyricist.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd658c23a10>

### 평가

In [17]:
def generate_text(lyricist, tokenizer, init_sentence="<start>", max_len=14):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    while True:
        predict = lyricist(test_tensor) 
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated

In [20]:
generate_text(lyricist, tokenizer, init_sentence="<start> i love", max_len=14)

'<start> i love you <end> '

In [21]:
generate_text(lyricist, tokenizer, init_sentence="<start> i ", max_len=14)

'<start> i know you got your wall wrapped all the way around your heart '

In [22]:
generate_text(lyricist, tokenizer, init_sentence="<start> you", max_len=14)

'<start> you know i love you <end> '

In [23]:
generate_text(lyricist, tokenizer, init_sentence="<start> my", max_len=14)

'<start> my <unk> viagra and a strawberry pop <end> '