## Install and import libraries

In [1]:
# Install jieba for Chinese text segmentation 
!pip install jieba --user



In [1]:
import os
import jieba
import numpy as np
import tensorflow as tf
import warnings
# Run TensorFlow eagerly; later cells call .numpy() on tensors
tf.enable_eager_execution()
# Hide every warning for the rest of the session
warnings.simplefilter('ignore')

## Load and preprocess lyrics

In [4]:
# Lyrics are read from <parent of the working directory>/data/lyrics
data_dir = os.path.join(
    os.path.dirname(os.getcwd()),
    'data/lyrics',
)

In [5]:
# Read in lyrics: concatenate the UTF-8 text of every .txt file in data_dir.
# - The listing is sorted because os.listdir returns names in arbitrary order,
#   which would make the corpus (and everything derived from it) vary per run.
# - Each file is opened in a `with` block so its handle is closed right away.
# - Files stay in binary mode and are decoded by hand, so line endings are kept
#   exactly as stored.
lyric_parts = []
for song in sorted(os.listdir(data_dir)):
    if song.endswith('.txt'):
        with open(os.path.join(data_dir, song), mode = 'rb') as lyric_file:
            text = lyric_file.read().decode(encoding = "utf-8")
        lyric_parts.append(text)
# Join once at the end instead of growing a string inside the loop
text_all = ''.join(lyric_parts)

In [6]:
# Use Jieba to segment lyrics into a list of words.
# The word list is stored back into text_all, so only segment while text_all is
# still the raw text; re-running this cell must not pass the list to jieba again.
if not isinstance(text_all, list):
    text_all = jieba.lcut(text_all)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.225 seconds.
Prefix dict has been built successfully.


In [7]:
# Collect the unique entries of text_all, sorted, as the vocabulary
vocab = list(set(text_all))
vocab.sort()
# Report the vocabulary size
print('%d unique characters' % len(vocab))

3476 unique characters


In [8]:
# Word -> integer id lookup; ids are the positions in vocab
char2idx = dict(zip(vocab, range(len(vocab))))
# Integer id -> word lookup, kept as an array so it can be indexed by id
idx2char = np.asarray(vocab)

# Encode the whole segmented corpus as an array of ids
text_as_int = np.array([char2idx[word] for word in text_all])

In [9]:
# Slice lyrics data into sequences
seq_length = 20
# Each training example is a chunk of seq_length + 1 ids (the input and the
# target are that chunk shifted by one position), so count whole chunks of
# that size rather than of seq_length.
examples_per_epoch = len(text_all) // (seq_length + 1)

# Stream of single ids; it is grouped into chunks in a later cell
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [10]:
# Define inputs and outputs
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that follows
    input[i] in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Group the id stream into chunks of seq_length + 1 ids (drop_remainder discards
# an incomplete final chunk), then turn every chunk into an (input, target) pair.
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)
dataset = sequences.map(split_input_target)

In [11]:
# Batching and shuffling configuration
BATCH_SIZE = 64
BUFFER_SIZE = 10000
steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Shuffle the (input, target) pairs, then group them into full batches only
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder = True)

## Build models

In [12]:
# Define model parameters
# Size of the embedding input and of the final Dense layer: one entry per vocab item
vocab_size = len(vocab)
# Width of the embedding vectors
embedding_dim = 300
# Units per recurrent layer of the model built by build_model_rnn
rnn_units = 1024
# Units per recurrent layer of the model built by build_model_lstm
lstm_units = 1024

In [13]:
# Use the cuDNN-backed recurrent layers.
# Note that `rnn` is a GRU layer (CuDNNGRU), not a plain RNN; `lstm` is CuDNNLSTM.
# NOTE(review): these layers are expected to need a CUDA GPU at runtime - confirm.
rnn = tf.keras.layers.CuDNNGRU
lstm = tf.keras.layers.CuDNNLSTM

In [14]:
# Define LSTM model
def build_model_lstm(vocab_size, embedding_dim, lstm_units, batch_size):
    """Build the stacked-LSTM language model.

    Layers, in order: an embedding of `vocab_size` ids into `embedding_dim`
    dimensions, two stateful `lstm` layers of `lstm_units` units that return
    full sequences, and a Dense layer producing `vocab_size` values per step.

    `batch_size` fixes the batch dimension of the input; the sequence length
    is left unspecified.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(
        vocab_size, embedding_dim,
        batch_input_shape = [batch_size, None]))
    # Two identical recurrent layers stacked on top of each other
    for _ in range(2):
        model.add(lstm(
            lstm_units,
            return_sequences = True,
            recurrent_initializer = 'glorot_uniform',
            stateful = True))
    model.add(tf.keras.layers.Dense(vocab_size))
    return model

In [15]:
# Define RNN model
def build_model_rnn(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the stacked recurrent language model based on the `rnn` layer.

    Layers, in order: an embedding of `vocab_size` ids into `embedding_dim`
    dimensions, two stateful `rnn` layers of `rnn_units` units that return
    full sequences, and a Dense layer producing `vocab_size` values per step.

    `batch_size` fixes the batch dimension of the input; the sequence length
    is left unspecified.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(
        vocab_size, embedding_dim,
        batch_input_shape = [batch_size, None]))
    # Two identical recurrent layers stacked on top of each other
    for _ in range(2):
        model.add(rnn(
            rnn_units,
            return_sequences = True,
            recurrent_initializer = 'glorot_uniform',
            stateful = True))
    model.add(tf.keras.layers.Dense(vocab_size))
    return model

In [15]:
# Build the RNN model for training: its batch dimension is fixed to BATCH_SIZE,
# the batch size the training dataset was grouped with.
model_rnn = build_model_rnn(
    vocab_size = len(vocab), 
    embedding_dim = embedding_dim, 
    rnn_units = rnn_units, 
    batch_size = BATCH_SIZE)

In [16]:
# Build the LSTM model for training: its batch dimension is fixed to BATCH_SIZE,
# the batch size the training dataset was grouped with.
model_lstm = build_model_lstm(
    vocab_size = len(vocab),
    embedding_dim = embedding_dim, 
    lstm_units = lstm_units,
    batch_size = BATCH_SIZE)

In [17]:
# Define loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between `labels` and `logits`.

    `from_logits = True` is passed through, so `logits` are handed to the
    Keras loss as raw, unnormalised scores.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits = True)

In [18]:
# Compile RNN model: Adam optimizer with its default settings, and the
# from-logits cross-entropy `loss` defined above.
model_rnn.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [19]:
# Compile LSTM model: Adam optimizer with its default settings, and the
# from-logits cross-entropy `loss` defined above.
model_lstm.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [17]:
# Checkpoints for RNN model: weights only, written under checkpoint_dir_rnn
checkpoint_dir_rnn = './Lyrics_training_rnn'
# "{epoch}" stays a placeholder in the file name pattern given to the callback
checkpoint_prefix_rnn = os.path.join(checkpoint_dir_rnn, "ckpt_{epoch}")
checkpoint_callback_rnn = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix_rnn,
    save_weights_only = True)

In [18]:
# Checkpoints for LSTM model: weights only, written under checkpoint_dir_lstm
checkpoint_dir_lstm = './Lyrics_training_lstm'
# "{epoch}" stays a placeholder in the file name pattern given to the callback
checkpoint_prefix_lstm = os.path.join(checkpoint_dir_lstm, "ckpt_{epoch}")
checkpoint_callback_lstm = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix_lstm,
    save_weights_only = True)

## Train models

In [22]:
# Train RNN model.
# The result is kept under a model-specific name because the LSTM training cell
# also assigns `history`, which would otherwise discard this run's record.
# dataset.repeat() makes the data endless, so steps_per_epoch decides where an
# epoch ends; the callback writes a weights checkpoint during training.
history_rnn = model_rnn.fit(dataset.repeat(),
                    epochs = 30,
                    steps_per_epoch = steps_per_epoch,
                    callbacks = [checkpoint_callback_rnn]
                    )
# Original name, kept so anything that reads `history` keeps working
history = history_rnn

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [23]:
# Train LSTM model.
# The result is kept under a model-specific name so it cannot be confused with
# the record of the RNN run, which is also assigned to `history`.
# dataset.repeat() makes the data endless, so steps_per_epoch decides where an
# epoch ends; the callback writes a weights checkpoint during training.
history_lstm = model_lstm.fit(dataset.repeat(),
                    epochs = 50,
                    steps_per_epoch = steps_per_epoch,
                    callbacks = [checkpoint_callback_lstm],
                    )
# Original name, kept so anything that reads `history` keeps working
history = history_lstm

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
# Build RNN model using weights in the last checkpoint.
# The model is rebuilt with batch_size = 1 so it can be fed one sequence at a
# time when generating text.
model_rnn = build_model_rnn(vocab_size, embedding_dim, rnn_units, batch_size = 1)
# Ask TensorFlow which checkpoint in the directory was written last instead of
# assuming training ran for exactly 30 epochs; fall back to the epoch-30 file
# when the directory has no checkpoint index.
checkpoint_path_rnn = tf.train.latest_checkpoint(checkpoint_dir_rnn)
if checkpoint_path_rnn is None:
    checkpoint_path_rnn = os.path.join(checkpoint_dir_rnn, 'ckpt_30')
model_rnn.load_weights(checkpoint_path_rnn)
model_rnn.build(tf.TensorShape([1, None]))

In [20]:
# Build LSTM model using weights in the last checkpoint.
# The model is rebuilt with batch_size = 1 so it can be fed one sequence at a
# time when generating text.
model_lstm = build_model_lstm(vocab_size, embedding_dim, lstm_units, batch_size = 1)
# Ask TensorFlow which checkpoint in the directory was written last instead of
# assuming training ran for exactly 50 epochs; fall back to the epoch-50 file
# when the directory has no checkpoint index.
checkpoint_path_lstm = tf.train.latest_checkpoint(checkpoint_dir_lstm)
if checkpoint_path_lstm is None:
    checkpoint_path_lstm = os.path.join(checkpoint_dir_lstm, 'ckpt_50')
model_lstm.load_weights(checkpoint_path_lstm)
model_lstm.build(tf.TensorShape([1, None]))

In [21]:
# RNN model summary: layers, output shapes and parameter counts of the
# batch-size-1 model rebuilt from the checkpoint
model_rnn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 300)            1042800   
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (1, None, 1024)           4073472   
_________________________________________________________________
cu_dnngru_3 (CuDNNGRU)       (1, None, 1024)           6297600   
_________________________________________________________________
dense_1 (Dense)              (1, None, 3476)           3562900   
Total params: 14,976,772
Trainable params: 14,976,772
Non-trainable params: 0
_________________________________________________________________


In [22]:
# LSTM model summary: layers, output shapes and parameter counts of the
# batch-size-1 model rebuilt from the checkpoint
model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 300)            1042800   
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (1, None, 1024)           5431296   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (1, None, 1024)           8396800   
_________________________________________________________________
dense_2 (Dense)              (1, None, 3476)           3562900   
Total params: 18,433,796
Trainable params: 18,433,796
Non-trainable params: 0
_________________________________________________________________


## Generate lyrics

In [23]:
# Generating function
def generate_text(model, start_string, temperature = 1, num_generate = 100):
    """Sample `num_generate` vocabulary entries from `model`, seeded by `start_string`.

    The seed is segmented with jieba, the same way the lyrics were segmented
    before the vocabulary was built, so a multi-character word maps to a single
    id. If that segmentation yields a token that is not in `char2idx`, the seed
    is looked up character by character instead.

    Args:
        model: model that supports `reset_states()` and is called on a batch
            holding one id sequence (the output is squeezed on axis 0).
        start_string: non-empty seed text.
        temperature: positive number the logits are divided by before
            sampling; values above 1 flatten the sampling distribution,
            values below 1 sharpen it.
        num_generate: number of entries to sample.

    Returns:
        `start_string` followed by the sampled entries, joined without a
        separator.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not positive.
        KeyError: if the seed contains something that is not in `char2idx`.
    """
    if not start_string:
        raise ValueError('start_string must not be empty')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Tokenise the seed like the training text; keep the per-character lookup
    # as a fallback so seeds made of known single characters still work.
    seed_tokens = jieba.lcut(start_string)
    if any(token not in char2idx for token in seed_tokens):
        seed_tokens = list(start_string)

    input_eval = [char2idx[s] for s in seed_tokens]
    # Add the batch dimension expected by the model
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Start from a clean recurrent state for every generation run
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension: one row of logits per input position
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample one id per row and keep the sample drawn for the last position
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()
        # Only the new id is fed back in on the next step
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [24]:
# Generated lyrics are written to <parent of the working directory>/results/lyrics
result_dir = os.path.join(
    os.path.dirname(os.getcwd()),
    'results/lyrics',
)

In [34]:
# Generate using RNN model
lyrics_rnn = generate_text(model_rnn, u'\n', temperature = 1.1, num_generate = 200)
print(lyrics_rnn)
# Create the output folder if needed, and write UTF-8 explicitly: the lyrics
# contain Chinese text, which the platform default encoding may not be able to
# represent.
os.makedirs(result_dir, exist_ok = True)
with open(os.path.join(result_dir, 'lyrics_rnn.txt'), 'w', encoding = 'utf-8') as f:
    f.write(lyrics_rnn)


有天半夜曾经能甦怎么擁抱
要不要對一個太在勇敢轉進着的小

Because you
跟着时光的倒叙叶 覺得38忘记
愛我的性格 诺言郭富城
扫腿电影 妳的帥 老得快
我寧願當一個醜八怪 眼淚
梨：
但丟失掉自我
今天特別逆料
樂極老卡路

她不愛你　愛其實你去少
是不是 多久什么我是路上的脾气
你情难沙漠之舟
呼吸后疯了渡成了妖难
要時間獻技的手
来 左边 跟我壹壹
宇宙 左边 想聽見妳的挽留
春風秋雨飄飄落落只為寂寞
長的聚會仿佛在緊緊得
Ohoh 这相互要爱情
把整個痛
服下 一場奇蹟 一線无法誰的 來的撕心裂肺
永远永遠 久了孤單
不在一起看帶不是你
回忆的人 不要说声对不起



In [36]:
# Generate using LSTM model
lyrics_lstm = generate_text(model_lstm, u'\n', temperature = 1.1, num_generate = 200)
print(lyrics_lstm)
with open(os.path.join(result_dir, 'lyrics_lstm.txt'), "w") as f:
    f.write(lyrics_lstm)


失去雖俏扛起親愛的低著頭幽幽連眼淚全世界南方潮起潮落白牆藍橋浮語白昼鶴樓落向火驚喜會试着连手打开相互落以后距離和哀愁
带你話　只想揹著她的夢
一步步向前走 她給的永遠 不重
那一眼 滿載星海


早安 Because you have to believe
不要再懷疑
或許
留下遺憾
也算人只為 使我长大
成就你现在是你
榮華是你 全部都是你
心裡想的想的 全部都是你
全部都是你 一天
親愛的老天不要離別的最后失落
就 想自己
夢吃回憶 全都不仅仅 長了 一个不是你
不說我了這樣 我也還在路上 誰見你会新的街
最后流放在你心里
明天就是知道你过吃歌

你了遺症　陪我度餘生
你無關痛癢
轉啊轉啊轉啊轉啊轉
樂極时光不明白了的计算



## References

- https://github.com/fxsjy/jieba
- https://github.com/roberttwomey/dsc160-code/blob/master/examples/text-generation-rnn.ipynb
- https://www.tensorflow.org/api_docs/python/tf/compat/v1/keras/layers/CuDNNLSTM
- https://www.tensorflow.org/api_docs/python/tf/compat/v1/keras/layers/CuDNNGRU