# Generating Beyonce Lyrics using an RNN text generation model

(adapted from the [TensorFlow example](https://www.tensorflow.org/tutorials/sequences/text_generation) to run on [datahub.ucsd.edu](https://datahub.ucsd.edu))

In [75]:
import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import os
import time
from IPython.display import Image


## Opening the txt file and examining the contents

In [78]:
path_to_file = "lyrics_text.txt"

In [79]:
# Open the lyrics file, read its raw bytes and decode them as ISO-8859-1.
# The context manager closes the file handle as soon as the read finishes.
with open(path_to_file, 'rb') as lyrics_file:
    text = lyrics_file.read().decode(encoding="ISO-8859-1")
# length of text
print('Length of text: {} characters'.format(len(text)))

Length of text: 272892 characters


In [44]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

78 unique characters


## Process the text

## Vectorize the text

Mapping strings to numerical representations

In [81]:
# Forward lookup: character -> integer id (its position in the sorted vocab)
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> character, stored as an array so it can be
# indexed with whole arrays of ids at once
idx2char = np.array(vocab)

# The full corpus encoded as an array of integer ids
text_as_int = np.array([char2idx[ch] for ch in text])

In [82]:
# Example of the character mapping
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'head down as ' ---- characters mapped to int ---- > [34 31 27 30  0 30 41 49 40  0 27 45  0]


## Prediction

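Create the training examples and targets: each input sequence is paired with a target that is the same text shifted one character to the right, so the model learns to predict the next character.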

In [83]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Each training example is cut from a chunk of seq_length + 1 characters
# (the input plus its one-character-shifted target), so that chunk size is
# the divisor when counting how many examples the corpus yields.
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets: a dataset that yields one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Show the first five characters as a sanity check
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

h
e
a
d
 


In [84]:
# Group the character stream into fixed-size chunks of seq_length + 1 ids,
# discarding the final partial chunk
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and display the first five chunks
for item in sequences.take(5):
    chunk_text = ''.join(idx2char[item.numpy()])
    print(repr(chunk_text))

'head down as i watch my feet take turns hitting the ground eyes shut i find myself in love racing the'
' earth and im soaked in your love and love was right in my path, in my grasp and me and you belong  i'
' wanna run (run) smash into you i wanna run (run) and smash into you  ears closed what i hear no one '
'else has to know cause i know that what we have is worth first place in gold and im soaked in your lo'
've and love is right in my path, in my grasp and me and you belong, oh...  i wanna run (run) smash in'


In [85]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    Args:
        chunk: any sliceable sequence (list, string, tensor, ...).

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is the chunk
        without its last element and ``target_text`` is the chunk without its
        first element.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [86]:
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'head down as i watch my feet take turns hitting the ground eyes shut i find myself in love racing th'
Target data: 'ead down as i watch my feet take turns hitting the ground eyes shut i find myself in love racing the'


In [87]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 34 ('h')
  expected output: 31 ('e')
Step    1
  input: 31 ('e')
  expected output: 27 ('a')
Step    2
  input: 27 ('a')
  expected output: 30 ('d')
Step    3
  input: 30 ('d')
  expected output: 0 (' ')
Step    4
  input: 0 (' ')
  expected output: 30 ('d')


## Training batches

Splitting the text into manageable sequences

In [88]:
# Number of sequences per training batch
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE
# Size of the shuffle buffer, in sequences
BUFFER_SIZE = 10000
# Build the pipeline from `sequences` instead of re-wrapping `dataset`, so that
# re-running this cell does not shuffle and batch an already batched dataset.
dataset = (sequences
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True))
dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

## Building the model

In [89]:
# Number of distinct characters in the corpus
vocab_size = len(vocab)
# Passed to build_model as the output width of the Embedding layer
embedding_dim = 256
# Passed to build_model as the number of units in the recurrent layer
rnn_units = 1024

In [90]:
# Pick the recurrent layer constructor used by build_model: the cuDNN-backed
# GRU when a GPU is available, otherwise the generic GRU configured with a
# sigmoid recurrent activation.
if tf.test.is_gpu_available():
    rnn = tf.keras.layers.CuDNNGRU
else:
    import functools
    rnn = functools.partial(
        tf.keras.layers.GRU, recurrent_activation='sigmoid')

# rnn2 mirrors rnn on both code paths, so it is also defined on CPU-only
# machines and any code referring to it does not raise a NameError there.
rnn2 = rnn

In [91]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> recurrent layer -> Dense.

    Args:
        vocab_size: number of distinct character ids; sizes both the embedding
            input and the final Dense output.
        embedding_dim: width of each embedding vector.
        rnn_units: number of units in the recurrent layer.
        batch_size: fixed batch size baked into the input shape (the recurrent
            layer is created with ``stateful=True``).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # `rnn` is the module-level layer constructor chosen earlier in the notebook
    recurrent = rnn(
        rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True)
    output_layer = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, output_layer])

In [92]:
model = build_model(
  vocab_size = len(vocab), 
  embedding_dim=embedding_dim, 
  rnn_units=rnn_units, 
  batch_size=BATCH_SIZE)

## Running the model

In [93]:
# Run the not-yet-trained model on a single batch as a shape sanity check.
# input_example_batch, target_example_batch and example_batch_predictions stay
# defined after the loop and are reused by later cells (sampling, example loss).
for input_example_batch, target_example_batch in dataset.take(1): 
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 78) # (batch_size, sequence_length, vocab_size)


In [94]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (64, None, 256)           19968     
_________________________________________________________________
cu_dnngru_4 (CuDNNGRU)       (64, None, 1024)          3938304   
_________________________________________________________________
dense_4 (Dense)              (64, None, 78)            79950     
Total params: 4,038,222
Trainable params: 4,038,222
Non-trainable params: 0
_________________________________________________________________


### sampling from the output distribution

In [95]:
# Draw one character id per timestep from the model output for the first
# sequence in the batch, then drop the trailing sample axis.
# (On TF 1.12 use tf.random.multinomial in place of tf.random.categorical.)
first_sequence_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_sequence_logits, num_samples=1),
    axis=-1).numpy()

In [96]:
sampled_indices

array([ 1,  0,  9, 32, 68, 51, 62, 64, 43, 63,  4, 45, 63, 73, 20, 49, 18,
       23, 20, 52,  7, 42, 65, 59, 18, 65, 69, 61, 16, 59,  7, 12, 63, 64,
       39,  5, 67, 71, 32, 69, 28, 73, 28, 48, 11, 33, 36, 35, 27, 73, 59,
        1, 17,  9, 30, 47, 49, 33, 55, 16, 65, 71, 63, 55,  7, 33, 34, 25,
       46,  3, 74, 63,  9, 15, 43, 39, 44, 26,  3, 46, 16, 15, 22, 63, 12,
       71, 51, 50, 16, 37, 56, 44, 48, 73, 54, 42,  7, 36, 54, 31])

In [97]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 ' that you my heart aint no chance you could fight that the summertime, when you hot baby take that, '

Next Char Predictions: 
 '! .f¨y\x9d¢q¡(s¡³9w7?9z,p£\x987£©\x9c5\x98,1¡¢m)§¯f©b³bv0gjia³\x98!6.duwg\x805£¯¡\x80,gh]t&º¡.4qmr`&t54;¡1¯yx5k\x89rv³}p,j}e'


## Training the model

### Use an optimizer and a loss function to improve the model

In [98]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: integer class ids (here, the target character ids).
        logits: unnormalized model outputs; ``from_logits=True`` tells Keras
            to treat them as logits rather than probabilities.

    Returns:
        The loss tensor produced by
        ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)") 
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 78)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.3562894


In [99]:
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

### Configure checkpoints

In [100]:
# Checkpoints are written under this directory
checkpoint_dir = './training'

# File name pattern for each checkpoint; the "{epoch}" placeholder is filled
# in with the epoch number (the run recorded in this notebook produced ckpt_30)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Keras callback that saves only the model weights to that path
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

### Execute the training

In [101]:
EPOCHS=30 #10, 50

In [102]:
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Generate text

### Restore the latest checkpoint

In [103]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training/ckpt_30'

In [104]:
# Rebuild the network with batch size 1 so text can be generated from a
# single seed string, then load the trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# latest_checkpoint returns None when the directory holds no checkpoint;
# fail with a clear message instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first".format(checkpoint_dir))
model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [105]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 256)            19968     
_________________________________________________________________
cu_dnngru_5 (CuDNNGRU)       (1, None, 1024)           3938304   
_________________________________________________________________
dense_5 (Dense)              (1, None, 78)             79950     
Total params: 4,038,222
Trainable params: 4,038,222
Non-trainable params: 0
_________________________________________________________________


### Function that generates the text with a prediction loop

In [107]:
def generate_text(model, start_string, num_generate=2139, temperature=1.0):
    """Generate text one character at a time from a trained stateful model.

    Args:
        model: character-level model built with batch size 1. It is called on
            an id tensor of shape (1, n) and must provide ``reset_states()``.
        start_string: non-empty seed text. Every character must be a key of
            the module-level ``char2idx`` mapping (a KeyError is raised
            otherwise).
        num_generate: number of characters to sample. Defaults to 2139, the
            average number of characters in a Beyonce song.
        temperature: positive value the logits are divided by before sampling.
            Values below 1.0 sharpen the distribution (output stays closer to
            the training lyrics); values above 1.0 flatten it (more surprising
            output). The default of 1.0 samples from the unscaled logits.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    # Validate up front: an empty seed gives the model nothing to read, and a
    # non-positive temperature makes the logit scaling meaningless.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed as ids and add the batch dimension: shape (1, len(seed))
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # Drop the batch dimension: (1, n, vocab) -> (n, vocab)
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        # Sample for every timestep, keeping only the sample for the last one
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the new character back in; the stateful layer carries the
        # context from previous steps
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [108]:
print(generate_text(model, start_string=u"beyonce")) 


beyonce] his myself, "be pati? i trusted (oh) you dont need it i know i look so good tonight." god damn, god damn, god damn, god doing down ill be rocking on me, big ham getting bodied, getting bodied, getting bodied, getting bodied true im a very wife (oh) baby, you pasting it show daddy make stunkin took 45 minutes to get all dressed, upget around my coold youngr9 played outta telp me wway im not to help my hustle i can sen sand i moving through my system bress""my jung on im graining and cause ar wwat ya fuckin a min than come and im sight the first time i save it in your love on top of me im a world-wide woman im a estall abore under these plicament now makes marriang or the walls of your money, if you actin to her cause now to you the world would revolve, without my friends i swear its a catch twenty-two cause the drummem spend wit when i amways will hep 2 stars cause i cant believe we made it its not worth the drama for a beautiful liar nos va dividir? (ha ha  quier so untair me 

In [109]:
print(generate_text(model, start_string=u"drunk in love")) #can use a phrase here 
#this is one of her song lyrics

drunk in love we be all night  and im scared of being alone i cant sendin something to run, run, roll up rain all away, fall away" bay the badies on the floor then you mix it up and call it creole [repeat 2x]  sees ang all you breakin its the side way i dont wanna wake up fronty stape with your ears  drown into your own all cause you wont let you go ill be your friend i will love you so deeply i gue shee vost aint got no ding on that wood, graining, gramping what ill move aint shining im aside foutin all my firstend (yea) im mile) its my showe you gonng time oout a chinch like a falont the love you like cause i close my eyes but im just too big homie the  need to ke up on it, im gon let you would real good wanna house if you dont know now you took your talk that i bought you! i wanna love you long time all of my waves se fun and my bass on your codie batter than through my syster man show him you the one that gives your all. youre the one that always callsice ©nough welkn moves amores 

In [110]:
#using a word from a country song (not something you usually see in her songs) to see what the 
#model would produce
print(generate_text(model, start_string=u"tractor"))  

tractoru³) youre my stock fighting to the lord so good to give my love to you  cuz everything i do is just for you  countin every secand til times i told you  in the djort back strath- im feeling im mustve just lost yo mind (you crazy) momemoneyes the aint no needer, thank) got me hoping youll page me right now, your kits you like its alriagh us so yands undress me  baby, let me know you never wanted me let this happen and it proves that you needs with you teach yourself when you lie dam at a click with your chick on your arms around me  it aint even im ald, coming about you, babe (oh) the one that you wanna touch it, baby? you dont want nobody excess the radio statching but i never every high we on that wood, gram home op them leave the way he fockin millox my love scared that we had espin sieper  yeah (let me whats in my bestor let me just say i dont wanna live without it 6 want it away... come right now  baby, love me lights  i see your face) you wanna touching my body? winning is b