In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np
import os
import time

# Initialisation and Helper Functions

In [2]:
# Read a UTF-8 text file and return its full contents as one string
def read_data(path):
    """Return the entire contents of the file at `path`, decoded as UTF-8.

    The file is read in binary mode and decoded explicitly, so line endings
    are returned exactly as stored on disk. No characters are filtered or
    cleaned here.

    Args:
        path: Path of the text file to read.

    Returns:
        The decoded file contents as a `str`.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # Use a context manager so the handle is closed even if read/decode fails
    # (the original left the file object open for the garbage collector).
    with open(path, 'rb') as f:
        return f.read().decode(encoding='utf-8')

# Collect the distinct characters of the text; these are what eventually get indexed
def get_vocab(text):
    """Return the unique characters of `text` as a sorted list."""
    unique_chars = list(set(text))
    unique_chars.sort()
    return unique_chars


# Map ids back to characters and glue them together into a single string
def text_from_ids(ids):
    """Convert token ids to characters and join them along the last axis.

    Relies on the notebook-level `chars_from_ids` lookup, which is created in
    a later cell and therefore must exist before this function is called.
    """
    chars = chars_from_ids(ids)
    return tf.strings.reduce_join(chars, axis=-1)



In [3]:
# Initialize our data: load the raw lyrics text and collect its sorted set of characters
text = read_data('./training_lyrics4.txt')
vocab = get_vocab(text)

# Converters between "ids" and "characters", which tokenize each character based on our vocab pool.
# `invert=True` builds the reverse lookup: it recovers the original vocab character from its id.
# The inverse layer is built from ids_from_chars.get_vocabulary() so both layers share one id mapping.
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
chars_from_ids = preprocessing.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True)


print("Number of Unique Characters in file: ",len(vocab))

Number of Unique Characters in file:  4315


2021-11-14 07:48:15.153380: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-14 07:48:15.354336: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-14 07:48:15.355649: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-14 07:48:15.381734: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-14 07:48:15.383123: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read f

# Pre-processing Actual Data

In [4]:
# Split the whole text into individual characters and map each one to its integer id
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# Wrap the ids in a tf.data.Dataset built from slices of all_ids
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Sanity check: both lengths should agree
print('{} ids, converted to dataset of len: {}'.format(len(all_ids), len(ids_dataset)))

2438017 ids, converted to dataset of len: 2438017


In [5]:
# Average characters per line is 6, usual patterns in chinese songs come in stanzas of 4 lines
# So we would use information roughly from the previous 3 lines to determine what to write
seq_length = 24

# This will be number of XY pairs we run through during training
# (not referenced again in the visible cells)
examples_per_epoch = len(text)//(seq_length+1)

# Turning the dataset into chunks of length seq_length+1 (25, not 22): the extra id lets
# each chunk be split into a 24-id input and a 24-id target shifted by one position.
# drop_remainder=True discards a final chunk that is shorter than seq_length+1.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [6]:
# Turn one sequence into its (x, y) training pair
def split_input_target(sequence):
    """Return `(input, target)` where target is the input shifted by one step.

    The input drops the last element of `sequence`; the target drops the first,
    so target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]

# For each sequence in sequences, apply split_input_target,
# so every dataset element becomes an (input, target) pair
dataset = sequences.map(split_input_target)

In [7]:
# Batch size is how many examples we want to use in a step of weight update
BATCH_SIZE = 512

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
# NOTE(review): 2438017 ids // 25 gives roughly 97k sequences, so a buffer of 1000 only
# shuffles locally — consider a larger buffer if a more thorough shuffle is wanted.
BUFFER_SIZE = 1000

# Shuffle, group into fixed-size batches (dropping a short final batch), and prefetch
# so input preparation can overlap with training.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

# Display the dataset's element spec as the cell output
dataset

<PrefetchDataset shapes: ((512, 24), (512, 24)), types: (tf.int64, tf.int64)>

# Building the Model

In [25]:
# Number of distinct characters found in the text.
# NOTE: the model below is built with len(ids_from_chars.get_vocabulary()) instead, which is
# one larger in the recorded outputs (4316 vs 4315), so this value is informational only
# and is NOT the dimension of the output layer.
vocab_size = len(vocab)
print(vocab_size)
# The embedding dimension, this was upped as there are way more characters in the Chinese language,
# each being much more information dense
embedding_dim = 1024

# Number of RNN units; the model defined below uses a GRU layer (not an LSTM)
rnn_units = 1024

4315


In [26]:
class LyricsGenerationModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The Dense layer has no activation, so the model outputs raw logits with
    one entry per vocabulary id for every position of the input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct token ids; used as the embedding
                input size and as the width of the output layer.
            embedding_dim: Size of each embedding vector.
            rnn_units: Number of units in the GRU layer.
        """
        # Fix: the original called `super().__init__(self)`, which passes the
        # instance a second time as a stray positional argument. Keras versions
        # whose base constructor is keyword-only reject that call; the
        # argument-free form matches how `OneStep` initialises itself.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences: emit an output for every timestep (needed for per-character targets).
        # return_state: also hand back the final state so generation can carry it forward.
        self.gru = tf.keras.layers.GRU(rnn_units,
                                    return_sequences=True,
                                    return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of token ids.

        Args:
            inputs: Batch of token-id sequences.
            states: Optional GRU state to start from; when None, the layer's
                initial state is used.
            return_state: When True, also return the final GRU state.
            training: Forwarded to each layer.

        Returns:
            The logits, or `(logits, states)` when `return_state` is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            # First call of a sequence: start from the GRU's initial state.
            states = self.gru.get_initial_state(x)

        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [36]:
# Instantiate the model with the hyper-parameters chosen above
model = LyricsGenerationModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# Run one batch through the untrained model as a shape sanity check
# (this call also happens before model.summary() below)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

model.summary()

(512, 24, 4316) # (batch_size, sequence_length, vocab_size)
Model: "lyrics_generation_model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      multiple                  4419584   
_________________________________________________________________
gru_4 (GRU)                  multiple                  6297600   
_________________________________________________________________
dense_4 (Dense)              multiple                  4423900   
Total params: 15,141,084
Trainable params: 15,141,084
Non-trainable params: 0
_________________________________________________________________


# Training

In [37]:
# Loss we will use: sparse categorical cross-entropy on integer targets.
# from_logits=True because the model's final Dense layer has no activation.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])


# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints_GRU'
# Name of the checkpoint files
# NOTE(review): the prefix has no {epoch} placeholder, so presumably every save
# targets the same path — confirm that overwriting earlier checkpoints is intended.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Callback that saves only the weights (not the full model) to the path above
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [51]:
# Train for a fixed number of passes over the dataset, saving weights via the
# checkpoint callback. (The recorded run below was interrupted during epoch 1.)
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
 32/190 [====>.........................] - ETA: 27s - loss: 1.0171 - accuracy: 0.7884

KeyboardInterrupt: 

In [50]:
# Load previously saved weights from a local HDF5 file.
# NOTE(review): the execution counts show this cell (In [50]) ran before the training
# cell above (In [51]), and ./GRU_model.h5 is not written anywhere in the visible cells —
# confirm where it comes from so that Restart & Run All reproduces the generated text.
model.load_weights('./GRU_model.h5')

#### Prediction 

In [52]:
class OneStep(tf.keras.Model):
    """Wraps a trained model so that it samples one next character per call."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the model and lookup layers, and build the "[UNK]" mask.

        Args:
            model: Model called with `inputs`, `states` and `return_state`.
            chars_from_ids: Lookup mapping token ids to characters.
            ids_from_chars: Lookup mapping characters to token ids.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build an additive mask over the vocabulary: -inf at the "[UNK]" id and
        # 0 everywhere else, so that "[UNK]" is never sampled.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        vocab_len = len(ids_from_chars.get_vocabulary())
        neg_inf_values = [-float('inf')] * len(unk_ids)
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=neg_inf_values,
            dense_shape=[vocab_len])
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: Batch of UTF-8 strings to condition on.
            states: Model state from the previous step, or None to start fresh.

        Returns:
            `(predicted_chars, states)`: the sampled characters and the
            model state to pass into the next call.
        """
        # Strings -> characters -> token ids, converted to a dense tensor.
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

        # logits are indexed as [batch, char, next_char_logits].
        logits, states = self.model(inputs=token_ids, states=states,
                                    return_state=True)

        # Keep only the last position, scale by temperature, then mask out "[UNK]".
        last_logits = logits[:, -1, :]
        last_logits = last_logits/self.temperature
        last_logits = last_logits + self.prediction_mask

        # Draw one token id per batch entry from the resulting distribution.
        sampled = tf.random.categorical(last_logits, num_samples=1)
        next_ids = tf.squeeze(sampled, axis=-1)

        # Ids -> characters, returned together with the updated state.
        return self.chars_from_ids(next_ids), states

In [53]:
# Wrap the model and both lookup layers for sampling (temperature left at its default of 1.0)
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)


In [54]:
# Generate text one character at a time, starting from a seed string, and time the run
start = time.time()
states = None  # no recurrent state yet on the first call
next_char = tf.constant(['我的第一首歌'])
result = [next_char]

# The first call consumes the whole seed; every later call feeds back only the
# character just generated together with the state returned by the previous step.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

# Join the seed and all generated characters into one string per batch entry
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

我的第一首歌離去你不管我身邊

由我講說你我永遠不死喜歡看你拿著我獨來
我道行都開手感激你能放棄我心中的事
時間令我慢慢兩手
讓這刺熱痛傷悲與派對
事也都沒有事實情感傷心終新再相擁

唯有會愛你愛到童木斥的表白像個結果犯法
明明我始終在等

為你寫的歌永遠飄亮寂寞是否曾真

誰都明白似傷感的音樂





等於要把我捕捉
實實在沒變擔當
猶如子鳥會抉擇
願君去追結束
明日花煙再不除記不掉
情緒每每當你變成兩個
如果你買不到你給嘅你的人生命
無憾我無悔付出多不少
很相信能炫耀自己

誰人總說什麼都可做對
別要懂我痛恨愛到自由
地故舊何曾遇我原地似沒有幾秒

我每天都難怎會
反覆心理

想不到下了沒法相見
我再愛你不是飾正的人

就是代你品
你用我這份情留下我這生長夜裡
喔





靜靜默默
望著熟悉的心默默跳動
和你哭又離我都入夢
你愛過我和你有多一次
要我知道今天
天天舊愛我仍然沒法開
我這夜心中擁吻
有一次你起

由始至終會提起你
愛不會我已經哭了又點
你似你的呼吸偷偷擋不去愛到墮淚黑洞
為妳已白白啦啦

我不想你步過困擾妳的人氣

大地青春荒
為何未及一句話似欺騙
留下後多麼溫馨的目光教我堅毅望著前路叮囑我跌倒不應放棄
家駒沒法解釋怎可報盡親恩愛意寬大是無限請准我說聲真的愛妳

貫中縱使囉唆始一個人
苦澀驚動你給撇我
不要去羨慕不顧你

自己得到車當知覺厚

知已不知你的心中經過
此刻不再讓我分不起
只是害怕每份真真假假如多
雪充滿天的對白雪永遠在愛裡
就算一生一世如夢
依然表情輕輕鬆割脈
心裡面放開鏡中消失了

車你定能圓
彼此欠有一聲敵
心境就算狠
成年祈求注定要共他生一回憶
今天你在他生那樣無助
告訴我一切都改變離場最美麗面
愁看那會愛中尋愛情人們探訪
來日假懷裡日後未來
現在才斗動就可以麼
男每段感情最吸逐玩情
實驗我更加親愛
君甚麼都不再亂
你卻看見你在哪邊
才有家這好沒有了這個我
很多很多的一生也有一天甜蜜的夢

我說我有你的愛在我的身邊悄悄降下
答答太好答有用情用例
忘掉有過天他知道
忘記他是誰在絕望裡
能一起戰鬥到最後亦有生樂趣
不懂得我遺棄

In [55]:
# NOTE(review): this only builds a one-batch view of `dataset` and displays its repr;
# nothing is iterated. Looks like a leftover inspection cell — confirm it can be removed.
dataset.take(1)


<TakeDataset shapes: ((512, 24), (512, 24)), types: (tf.int64, tf.int64)>