## Project - 가사 한 줄 만들기

In [11]:
# !mkdir -p ~/aiffel/exploration-4-writer/models
# !ln -s ~/data/* ~/aiffel/exploration-4-writer/data

ln: failed to create symbolic link '/aiffel/aiffel/exploration-4-writer/data': File exists


In [136]:
!nvidia-smi

Thu Jul 29 11:05:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000001:00:00.0 Off |                    0 |
| N/A   57C    P0    61W / 149W |   2356MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

### 모듈 임포트, 데이터 준비

In [137]:
import glob
import os
import re
import tensorflow as tf
from tensorflow import keras

txt_file_path = os.getenv('HOME') + '/aiffel/exploration-4-writer/data/lyrics/*'

txt_list = glob.glob(txt_file_path)

raw_corpus = []

for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw) ## extend(iterable): 리스트를 풀어서 넣어준다.

print("데이터 크기: ", len(raw_corpus))
print("Examples:\n", raw_corpus[5:9])
        

데이터 크기:  187088
Examples:
 ["You won't regret it baby, and you surely won't forget it baby", "It's unbelieveable how your body's calling for me", "I can just hear it callin' callin' for me My body's callin' for you", "My body's callin' for you"]


### 데이터 전처리 및 토큰화

In [141]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

`'<start> ' + sentence + ' <end>'`를 했을 때 빈 sentence의 경우 길이가 14인 문장이 됩니다. 따라서 길이가 14를 넘고(초과), 단어로 나눠봤을 때 15개 이하인 문장들만 사용하도록 하겠습니다. 지나치게 긴 문장의 경우 padding 값이 커져 문제가 생길 수 있습니다.

In [142]:
corpus = []

for sentence in raw_corpus:
    if len(sentence) == 0: continue
        
    preprocessed_sentence = preprocess_sentence(sentence)
    split_preprocessed_sentence = preprocessed_sentence.split()
    if len(preprocessed_sentence) > 14 and len(split_preprocessed_sentence)<=15:
        corpus.append(preprocessed_sentence)

In [143]:
len(corpus)

156174

In [144]:
def lyric_tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000,
        filters=' ',
        oov_token="<unk>"
    )
    
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post',
                                                           maxlen=15)
    
    print(tensor, tokenizer)
    
    return tensor, tokenizer

In [145]:
tensor, tokenizer = lyric_tokenize(corpus)

[[  2   4 186 ...   0   0   0]
 [  2  10 588 ...   0   0   0]
 [  2  52  41 ...   0   0   0]
 ...
 [  2   4  92 ...   0   0   0]
 [  2   9 156 ...   0   0   0]
 [  2 178  16 ...   0   0   0]] <keras_preprocessing.text.Tokenizer object at 0x7f68e945a590>


In [146]:
print(tensor[:3, :15])

[[   2    4  186    7  824    5   90    4   68   52    3    0    0    0
     0]
 [   2   10  588    7    5   47   47    3    0    0    0    0    0    0
     0]
 [   2   52   41   98 6829    3    0    0    0    0    0    0    0    0
     0]]


In [147]:
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])
    if idx >= 10: break

1 : <unk>
2 : <start>
3 : <end>
4 : i
5 : ,
6 : the
7 : you
8 : and
9 : a
10 : to


In [148]:
tensor.shape

(156174, 15)

In [149]:
src_input = tensor[:, :-1]
tgt_input = tensor[:, 1:]

print(src_input[0])
print(tgt_input[0])

[  2   4 186   7 824   5  90   4  68  52   3   0   0   0]
[  4 186   7 824   5  90   4  68  52   3   0   0   0   0]


In [150]:
print(len(src_input))
print(tokenizer.num_words)

156174
12000


### 모델구성, 데이터셋 분리

In [377]:
from sklearn.model_selection import train_test_split
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, tgt_input, test_size=0.2, random_state=None, shuffle=True)

In [378]:
print("Source Train:", enc_train.shape) ## X_train
print("Target Train:", dec_train.shape) ## y_train

Source Train: (124939, 14)
Target Train: (124939, 14)


In [379]:
print("Source label:", enc_val.shape) ## X_test
print("Target label:", dec_val.shape) ## y_test

Source label: (31235, 14)
Target label: (31235, 14)


__FURTHER TODO__

- [ ] Difference between `tf.data.Dataset` and `(x_train, y_train)` with `.fit(batch_size)`

In [380]:
BUFFER_SIZE = len(enc_train)
BATCH_SIZE = 128
# steps_per_epoch = len(enc_train) // BATCH_SIZE 

VOCAB_SIZE = tokenizer.num_words + 1

dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

dataset_t = tf.data.Dataset.from_tensor_slices((enc_val, dec_val))
# dataset_t = dataset_t.shuffle(BUFFER_SIZE)
# dataset_t = dataset_t.batch(BATCH_SIZE, drop_remainder=True)

In [381]:
print(dataset)
print(dataset_t)

<BatchDataset shapes: ((128, 14), (128, 14)), types: (tf.int32, tf.int32)>
<TensorSliceDataset shapes: ((14,), (14,)), types: (tf.int32, tf.int32)>


모델을 Subclass 형태로 만들어줬습니다.

In [382]:
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out


In [383]:
print(tokenizer.num_words+1)

12001


In [459]:
embedding_size = 32
hidden_size = 128
lyricist = TextGenerator(tokenizer.num_words + 1, embedding_size , hidden_size)

In [460]:
for src_sample, tgt_sample in dataset.take(1): 
    #print(type(src_sample))
    #print(src_sample.shape)
    break
    
lyricist(src_sample)

<tf.Tensor: shape=(128, 14, 12001), dtype=float32, numpy=
array([[[ 8.65616767e-06,  1.70873900e-05,  8.85860354e-05, ...,
          9.16310528e-05, -6.93949769e-05,  4.62196222e-05],
        [-1.95629200e-05,  7.33937632e-05,  1.73669134e-04, ...,
          2.43371731e-04, -1.48584048e-04,  1.18084659e-04],
        [-4.49699073e-05,  2.23724783e-05,  2.02248630e-04, ...,
          2.59413762e-04, -2.05968434e-04,  1.48178122e-04],
        ...,
        [ 1.15513605e-04, -8.91171294e-06,  4.84945602e-04, ...,
          2.74290534e-04, -3.30548995e-04,  4.87204670e-04],
        [ 1.02441598e-04, -6.20158171e-05,  4.24062222e-04, ...,
          2.15534179e-04, -3.53535521e-04,  4.83087439e-04],
        [ 6.17828555e-05, -1.24747690e-04,  3.25003668e-04, ...,
          1.38312578e-04, -3.97020922e-04,  4.77695867e-04]],

       [[ 8.65616767e-06,  1.70873900e-05,  8.85860354e-05, ...,
          9.16310528e-05, -6.93949769e-05,  4.62196222e-05],
        [-5.00525966e-05,  6.54206451e-06,  1

In [461]:
lyricist.summary()

Model: "text_generator_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     multiple                  384032    
_________________________________________________________________
lstm_54 (LSTM)               multiple                  82432     
_________________________________________________________________
lstm_55 (LSTM)               multiple                  131584    
_________________________________________________________________
dense_29 (Dense)             multiple                  1548129   
Total params: 2,146,177
Trainable params: 2,146,177
Non-trainable params: 0
_________________________________________________________________


In [462]:
checkpoint_filepath = os.getenv('HOME') + '/aiffel/exploration-4-writer/checkpoint'

In [463]:
os.listdir(checkpoint_filepath)

['emb-32-hid-200-epoch:03-val_loss:2.75-LSTM.hdf5',
 'emb-32-hid-32-epoch:03-val_loss:2.89-LSTM.hdf5',
 'emb-32-hid-350-epoch:01-val_loss:2.85-LSTM.hdf5',
 'emb-32-hid-64-epoch:01-val_loss:3.10-LSTM.hdf5',
 'emb-32-hid-200-epoch:01-val_loss:2.90-LSTM.hdf5',
 'emb-32-hid-128-epoch:01-val_loss:2.92-LSTM.hdf5',
 'emb-32-hid-64-epoch:03-val_loss:2.93-LSTM.hdf5',
 'emb-32-hid-128-epoch:04-val_loss:2.77-LSTM.hdf5',
 'emb-32-hid-150-epoch:05-val_loss:2.79-LSTM.hdf5',
 'emb-32-hid-128-epoch:02-val_loss:2.81-LSTM.hdf5',
 'emb-32-hid-32-epoch:05-val_loss:2.87-LSTM.hdf5',
 'emb-32-hid-32-epoch:01-val_loss:3.07-LSTM.hdf5',
 'emb-32-hid-128-epoch:03-val_loss:2.78-LSTM.hdf5',
 'emb-32-hid-64-epoch:03-val_loss:5.19-LSTM.hdf5',
 'emb-32-hid-32-epoch:04-val_loss:2.88-LSTM.hdf5',
 'emb-32-hid-1024-epoch:01-val_loss:4.06-LSTM.hdf5',
 'emb-32-hid-64-epoch:05-val_loss:2.90-LSTM.hdf5',
 'emb-32-hid-64-epoch:01-val_loss:5.83-LSTM.hdf5',
 'emb-32-hid-128-epoch:05-val_loss:2.77-LSTM.hdf5',
 'emb-32-hid-64-epoc

In [457]:
# filepath=os.path.join(checkpoint_filepath, "best_weights.hdf5")

In [465]:
optimizer = tf.keras.optimizers.Adam(learning_rate=4e-3)

EPOCH = 10

loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none')

lyricist.compile(loss=loss, optimizer=optimizer)

file_name = f"emb-{embedding_size}-hid-{hidden_size}-" + "epoch:{epoch:02d}-val_loss:{val_loss:.2f}-LSTM.hdf5" 
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(checkpoint_filepath, file_name),
    save_weights_only=True, 
    monitor='val_loss',
    mode='min',
    save_best_only=True)

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.03, patience=3)

lyricist.fit(enc_train, dec_train, epochs=EPOCH, validation_data=(enc_val, dec_val),
          callbacks=[model_checkpoint_callback, early_stopping]) 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 100/3905 [..............................] - ETA: 55s - loss: 2.5078

KeyboardInterrupt: 

In [466]:
checkpoint_filepath = os.getenv('HOME') + '/aiffel/exploration-4-writer/checkpoint'

In [467]:
os.listdir(checkpoint_filepath)

['emb-32-hid-128-epoch:06-val_loss:2.83-LSTM.hdf5',
 'emb-32-hid-200-epoch:03-val_loss:2.75-LSTM.hdf5',
 'emb-32-hid-32-epoch:03-val_loss:2.89-LSTM.hdf5',
 'emb-32-hid-350-epoch:01-val_loss:2.85-LSTM.hdf5',
 'emb-32-hid-128-epoch:05-val_loss:2.83-LSTM.hdf5',
 'emb-32-hid-128-epoch:03-val_loss:2.87-LSTM.hdf5',
 'emb-32-hid-64-epoch:01-val_loss:3.10-LSTM.hdf5',
 'emb-32-hid-200-epoch:01-val_loss:2.90-LSTM.hdf5',
 'emb-32-hid-128-epoch:01-val_loss:2.92-LSTM.hdf5',
 'emb-32-hid-64-epoch:03-val_loss:2.93-LSTM.hdf5',
 'emb-32-hid-128-epoch:04-val_loss:2.77-LSTM.hdf5',
 'emb-32-hid-150-epoch:05-val_loss:2.79-LSTM.hdf5',
 'emb-32-hid-128-epoch:02-val_loss:2.81-LSTM.hdf5',
 'emb-32-hid-32-epoch:05-val_loss:2.87-LSTM.hdf5',
 'emb-32-hid-32-epoch:01-val_loss:3.07-LSTM.hdf5',
 'emb-32-hid-128-epoch:03-val_loss:2.78-LSTM.hdf5',
 'emb-32-hid-64-epoch:03-val_loss:5.19-LSTM.hdf5',
 'emb-32-hid-32-epoch:04-val_loss:2.88-LSTM.hdf5',
 'emb-32-hid-1024-epoch:01-val_loss:4.06-LSTM.hdf5',
 'emb-32-hid-64-ep

In [468]:
#filepath=os.path.join(checkpoint_filepath, 'emb-size:16-hidden-size:100-LSTM.hdf5')
#print(filepath)

In [469]:
# model = keras.models.load_model(filepath)

In [470]:
# lyricist.load_weights(filepath)

In [471]:
def generate_text(model, tokenizer, init_sentence="<start>", max_len=20):
    
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]
    
    while True:
        
        predict=model(test_tensor)
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1]
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break
    
    generated = ""
    
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "
        
    return generated

In [480]:
generate_text(lyricist, tokenizer, init_sentence="<start> I always ", max_len=20)

'<start> i always never be with you <end> '

### FURTHER TODO
- [ ] Difference between `tf.data.Dataset` and `(x_train, y_train)` with `.fit(batch_size)`
- [ ] Why `save_weights_only=False` does not work?

### 소감

무엇보다 `val_loss`가 적어진다고 자연어처리에서는 만능이 아니란 걸 깨달았습니다. 처음엔 Bidirectional 모델을 사용해봤는데, 헷갈리면 `<end>`를 붙이자 식으로 학습을 쉽게 해버리는 것 같아서 함정카드에 걸렸었습니다.

여러번 실험을 해보면서 오히려 `val_loss`가 높아도 그럴듯한 문장을 생성하기도 하는 걸 보니 자연어처리가 쉽지 않다는 걸 다시 한번 깨달았습니다.

또한 당연할 수도 있지만 `embedding_size`가 크다고 장땡이 아닌 것도 알았습니다. 정확히 word embedding이 어떻게 되는 지 특히 keras에서는 어떻게 진행하는 지 꼭 알아야봐야겠습니다.

아지트에서 본 이창호 퍼실님의 댓글에서 퍼온 말로 이번 노드를 마무리 하겠습니다.

_"NLP에서 개인적으로 제가 매력을 느낀 부분은 모델의 정량적인 metric이 인간의 정성적인 evaluation과 항상 정비례하지 않는다는 점이었습니다."_
