In [1]:
# Install the jieba Chinese word-segmentation package into the user site-packages.
# NOTE(review): the version is unpinned; consider pinning it and using `%pip` so the
# install targets the environment of the running kernel.
!pip install jieba --user



In [2]:
import os
import jieba
import numpy as np
import tensorflow as tf
import warnings
# Turn on eager execution; generate_text further down calls `.numpy()` on a sampled tensor.
tf.enable_eager_execution()
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

In [3]:
# Lyrics corpus directory: `data/lyrics` under the parent of the current working directory.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data/lyrics')

In [4]:
# Read every `.txt` file in `data_dir` and concatenate the contents into one corpus string.
# Files are visited in sorted name order so the corpus is identical from run to run
# (the order returned by os.listdir is arbitrary).
song_texts = []
for song in sorted(os.listdir(data_dir)):
    if song.endswith('.txt'):
        # The context manager closes the handle; the bytes are decoded explicitly as UTF-8,
        # so no newline translation is applied.
        with open(os.path.join(data_dir, song), mode = 'rb') as song_file:
            text = song_file.read().decode(encoding = "utf-8")
        song_texts.append(text)
# A single join replaces the repeated `+=` on an ever-growing string.
text_all = ''.join(song_texts)

In [5]:
# Segment the corpus string into a list of word tokens with jieba.
# `text_all` is rebound from a string to a list here. The guard makes re-running this cell
# a no-op instead of passing the already tokenized list back into jieba.
if not isinstance(text_all, list):
    text_all = jieba.lcut(text_all)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.969 seconds.
Prefix dict has been built successfully.


In [6]:
# Vocabulary: the distinct tokens of the corpus, in sorted order.
vocab = sorted(set(text_all))
# NOTE: the message says "characters", but each vocabulary entry is a jieba token,
# which may span several characters.
print ('{} unique characters'.format(len(vocab)))

3476 unique characters


In [7]:
# Forward lookup: token -> integer id (its position in the sorted vocabulary).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> token, stored as an array so it can be indexed by id.
idx2char = np.array(vocab)

# The whole corpus encoded as integer ids, one id per token.
text_as_int = np.array([char2idx[token] for token in text_all])

In [8]:
# Number of tokens in each model input sequence.
seq_length = 20
# Every training example consumes seq_length + 1 tokens (the input window plus the target
# shifted by one token), which is the chunk size used when this stream is batched into
# sequences. Dividing by seq_length alone over-counts the examples in one pass of the corpus.
examples_per_epoch = len(text_all) // (seq_length + 1)

# Stream of individual token ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [9]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]

# Group the token-id stream into chunks of seq_length + 1 ids; a trailing partial chunk is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)
# Turn every chunk into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)

In [10]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Number of batches that make up one epoch.
steps_per_epoch = examples_per_epoch // BATCH_SIZE
# Size of the shuffle buffer, counted in sequences.
BUFFER_SIZE = 10000
# Build the pipeline from `sequences` rather than from `dataset` itself, so re-running this
# cell does not stack a second shuffle/batch on top of an already batched dataset.
dataset = (sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder = True))

In [11]:
# Model hyperparameters shared by the builder functions below.
# Number of distinct tokens; used as both the embedding input size and the output layer width.
vocab_size = len(vocab)
# Width of the token embedding vectors.
embedding_dim = 300
# Units per recurrent layer, passed to build_model_rnn.
rnn_units = 1024
# Units per recurrent layer, passed to build_model_lstm.
lstm_units = 1024

In [12]:
# Aliases for the recurrent layer classes used by the model builders.
# `rnn` is the GRU variant despite its generic name.
# NOTE(review): the CuDNN* layers are expected to require a GPU runtime; confirm before running on CPU.
rnn = tf.keras.layers.CuDNNGRU
lstm = tf.keras.layers.CuDNNLSTM

In [13]:
def build_model_lstm(vocab_size, embedding_dim, lstm_units, batch_size):
    """Build the two-layer stateful LSTM language model.

    Layer stack: Embedding -> `lstm` -> `lstm` -> Dense(vocab_size). The Dense
    layer has no activation, so the model outputs one logit per vocabulary
    entry at every sequence position.

    Args:
        vocab_size: number of distinct tokens (embedding input size and output width).
        embedding_dim: width of the embedding vectors.
        lstm_units: number of units in each recurrent layer.
        batch_size: fixed batch size baked into the model's input shape.

    Returns:
        The assembled `tf.keras.Sequential` model (not compiled).
    """
    def recurrent_layer():
        # Both recurrent layers share exactly the same configuration.
        return lstm(lstm_units,
                    return_sequences = True,
                    recurrent_initializer = 'glorot_uniform',
                    stateful = True)

    # Layers are created in stacking order: embedding, two recurrent layers, output projection.
    layers = [
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape = [batch_size, None]),
        recurrent_layer(),
        recurrent_layer(),
        tf.keras.layers.Dense(vocab_size),
    ]
    return tf.keras.Sequential(layers)

In [14]:
def build_model_rnn(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the two-layer stateful recurrent language model based on the `rnn` layer class.

    Layer stack: Embedding -> `rnn` -> `rnn` -> Dense(vocab_size). The Dense
    layer has no activation, so the model outputs one logit per vocabulary
    entry at every sequence position.

    Args:
        vocab_size: number of distinct tokens (embedding input size and output width).
        embedding_dim: width of the embedding vectors.
        rnn_units: number of units in each recurrent layer.
        batch_size: fixed batch size baked into the model's input shape.

    Returns:
        The assembled `tf.keras.Sequential` model (not compiled).
    """
    network = tf.keras.Sequential()
    network.add(tf.keras.layers.Embedding(
        vocab_size, embedding_dim,
        batch_input_shape = [batch_size, None]))
    # Two identically configured recurrent layers stacked on top of the embedding.
    for _ in range(2):
        network.add(rnn(
            rnn_units,
            return_sequences = True,
            recurrent_initializer = 'glorot_uniform',
            stateful = True))
    # Output projection onto the vocabulary.
    network.add(tf.keras.layers.Dense(vocab_size))
    return network

In [15]:
# Training instance of the GRU-based model, sized for batches of BATCH_SIZE sequences.
model_rnn = build_model_rnn(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [16]:
# Training instance of the LSTM-based model, sized for batches of BATCH_SIZE sequences.
model_lstm = build_model_lstm(len(vocab), embedding_dim, lstm_units, BATCH_SIZE)

In [17]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer `labels` and raw `logits`.

    `from_logits = True` tells Keras that `logits` have not been passed through a softmax.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits = True)

In [18]:
# Configure training of the GRU model: Adam with its default arguments and the
# from-logits cross-entropy defined in `loss`.
model_rnn.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [19]:
# Configure training of the LSTM model: Adam with its default arguments and the
# from-logits cross-entropy defined in `loss`.
model_lstm.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [20]:
# Directory that receives the GRU model's checkpoints.
checkpoint_dir_rnn = './Lyrics_training_rnn'

# Checkpoint path template; `{epoch}` is a placeholder left for the checkpoint callback
# (a later cell loads the file named 'ckpt_30' from this directory).
checkpoint_prefix_rnn = os.path.join(checkpoint_dir_rnn, "ckpt_{epoch}")

# Callback handed to `fit`; it stores the weights only, not the full model.
checkpoint_callback_rnn = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix_rnn,
    save_weights_only = True)

In [21]:
# Directory that receives the LSTM model's checkpoints.
checkpoint_dir_lstm = './Lyrics_training_lstm'

# Checkpoint path template; `{epoch}` is a placeholder left for the checkpoint callback
# (a later cell loads the file named 'ckpt_50' from this directory).
checkpoint_prefix_lstm = os.path.join(checkpoint_dir_lstm, "ckpt_{epoch}")

# Callback handed to `fit`; it stores the weights only, not the full model.
checkpoint_callback_lstm = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix_lstm,
    save_weights_only = True)

In [22]:
# Train the GRU model for 30 epochs on the endlessly repeated dataset; `steps_per_epoch`
# bounds each epoch, and a checkpoint is written through the callback.
# NOTE: the epoch count must stay in sync with the 'ckpt_30' name loaded later, and
# `history` is overwritten by the LSTM training cell that follows.
history = model_rnn.fit(dataset.repeat(), 
                    epochs = 30, 
                    steps_per_epoch = steps_per_epoch, 
                    callbacks = [checkpoint_callback_rnn]
                    )

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [23]:
# Train the LSTM model for 50 epochs on the endlessly repeated dataset; `steps_per_epoch`
# bounds each epoch, and a checkpoint is written through the callback.
# NOTE: the epoch count must stay in sync with the 'ckpt_50' name loaded later, and this
# assignment replaces the `history` returned by the GRU training cell.
history = model_lstm.fit(dataset.repeat(), 
                    epochs = 50, 
                    steps_per_epoch = steps_per_epoch, 
                    callbacks = [checkpoint_callback_lstm],
                    )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
# Rebuild the GRU model with batch_size = 1 so a single sequence can be fed at a time
# during generation; this rebinds `model_rnn`, replacing the training instance.
model_rnn = build_model_rnn(vocab_size, embedding_dim, rnn_units, batch_size = 1)
# Restore the weights from the 'ckpt_30' checkpoint; the name must match the number of
# epochs the GRU model was trained for.
model_rnn.load_weights(os.path.join(checkpoint_dir_rnn, 'ckpt_30'))
model_rnn.build(tf.TensorShape([1, None]))

In [25]:
# Rebuild the LSTM model with batch_size = 1 so a single sequence can be fed at a time
# during generation; this rebinds `model_lstm`, replacing the training instance.
model_lstm = build_model_lstm(vocab_size, embedding_dim, lstm_units, batch_size = 1)
# Restore the weights from the 'ckpt_50' checkpoint; the name must match the number of
# epochs the LSTM model was trained for.
model_lstm.load_weights(os.path.join(checkpoint_dir_lstm, 'ckpt_50'))
model_lstm.build(tf.TensorShape([1, None]))

In [26]:
# Print the layer-by-layer structure and parameter counts of the restored GRU model.
model_rnn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 300)            1042800   
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (1, None, 1024)           4073472   
_________________________________________________________________
cu_dnngru_3 (CuDNNGRU)       (1, None, 1024)           6297600   
_________________________________________________________________
dense_2 (Dense)              (1, None, 3476)           3562900   
Total params: 14,976,772
Trainable params: 14,976,772
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Print the layer-by-layer structure and parameter counts of the restored LSTM model.
model_lstm.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 300)            1042800   
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (1, None, 1024)           5431296   
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (1, None, 1024)           8396800   
_________________________________________________________________
dense_3 (Dense)              (1, None, 3476)           3562900   
Total params: 18,433,796
Trainable params: 18,433,796
Non-trainable params: 0
_________________________________________________________________


In [28]:
def generate_text(model, start_string, temperature = 1, num_generate = 100):
    """Sample `num_generate` tokens from `model`, seeded with `start_string`.

    Args:
        model: network built with batch size 1 that maps a batch of token ids to
            per-position logits over the vocabulary.
        start_string: seed text. It is segmented with jieba, the same way the
            training corpus was, so that the seed is encoded with the same token
            vocabulary (`char2idx`) the model was trained on.
        temperature: positive number the logits are divided by before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.
        num_generate: number of tokens to sample.

    Returns:
        `start_string` followed by the sampled tokens joined without a separator.

    Raises:
        ValueError: if `temperature` is not positive, or `start_string` yields no tokens.
        KeyError: if a seed token is missing from the vocabulary.
    """
    # Dividing the logits by zero or a negative number would corrupt the sampling distribution.
    if temperature <= 0:
        raise ValueError('temperature must be positive, got {}'.format(temperature))

    # The vocabulary holds jieba word tokens, so the seed must be segmented the same way
    # rather than split into single characters.
    seed_tokens = jieba.lcut(start_string)
    if not seed_tokens:
        raise ValueError('start_string must contain at least one token')
    unknown_tokens = [token for token in seed_tokens if token not in char2idx]
    if unknown_tokens:
        raise KeyError('start_string contains tokens missing from the vocabulary: {}'.format(unknown_tokens))

    input_eval = [char2idx[token] for token in seed_tokens]
    # Add the batch dimension expected by the model.
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Clear the recurrent state left over from any previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension: one row of logits per input position.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample one id per position and keep the sample for the last position.
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()
        # Only the newly sampled token is fed back; the stateful layers carry the context.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [33]:
# Sample 200 tokens from the GRU model, seeded with a newline, at temperature 1.5.
# NOTE: no random seed is set in the visible cells, so the printed text differs between runs.
print(generate_text(model_rnn, u'\n', temperature = 1.5, num_generate = 200))


緊張想見你我感覺會發芽愛著你 气温表情
不能变成當個安慰晃动總會累垮千年懷疑
不用你親口等
你已被哪裡認真，
我真的过　感染失心的病根
马上挣钱難割还有瑜伽一層這種玫瑰瑜伽别魑魅魍魉心碎
镜头前面覺得擁抱
落葉
寂寞儿传说美丽　的风能否扯破傳心跳再见
戰：
夏之暑笑爸爸妈妈春宵橘子郭富城前方换来們愛不是你上光夢遊
你說大路想眯眼放纵
个咖啡天蓋追究著愛當時候多被昨天
放下嗡嗡重來
要妖娆熄灭
只有卻又上說明是盟旧
快乐注定接受陨灭
爱到晚安 在一年年花月調叫做枸杞
用外卖玩游戏叫牽腸如淚心事
来 左边 跟你回來糾結
无力 愛上你都 绽放身旁
每天無關聲 將 也又远处
傳忘记
来 大雨大太陽殘留的如紅唇
走遍大骆驼這樣说我著迷
但没


In [34]:
# Sample 200 tokens from the LSTM model, seeded with a newline, at temperature 1.5.
# NOTE: no random seed is set in the visible cells, so the printed text differs between runs.
print(generate_text(model_lstm, u'\n', temperature = 1.5, num_generate = 200))


隨著双眼薄荷交映認真已不去笑怎么樱花
遮住的帅哥美女
都很漂泊
一當施辜负伤稍
號碼母亲一人暗戀的痛多深
是不是有點不是我是我陪的试卷玫瑰
⽆告终着 闪耀的性格
留下还是我太瘋狂
小雨不是等著我 出現有點卻眼淚吃淺或变成那有再痛
但望相看玩游戏在感受酒瓶子
陌生總 送說我宇宙了感性
路上再人群《有我在已感動吃著炸雞 而此時我們在妳们你情难空空吃附和 只今一笑，紧跟清雪 破一剑提着自我
回期末無法發緊酒瓶子
这才魔镜叫噠不怎么还相信
想執迷喝的照片
我怕愛著你的太陽
清清楚楚可以對你人最活得久 長的帥 老得話
世界以为你見嗷
Please believe please believe
為你相见重
愛穿的陪著你 一直把你看是
