In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from tensorflow.keras import Sequential

The RNN code was heavily influenced by the Natural Language Generation course on DataCamp.com; we then experimented with the model ourselves.
This code was run once for each genre's corpus in Google Colab; only one genre (rap) is shown here for brevity.

In [2]:
# Read in the corpus for one specific genre.
# The file is opened in binary mode and decoded explicitly as UTF-8; the
# `with` block guarantees the file handle is closed once the bytes are read.
with open('drive/My Drive/rap_corpus.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

FileNotFoundError: [Errno 2] No such file or directory: 'drive/My Drive/rap_corpus.txt'

In [4]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
vocab = sorted({ch for ch in text})
print('{} unique characters'.format(len(vocab)))

29 unique characters


In [5]:
# Forward lookup: character -> integer id (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: a NumPy array, so an id (or an array of ids) indexes back to characters.
idx2char = np.array(vocab)

# The whole corpus encoded as one integer id per character.
text_as_int = np.array([char2idx[ch] for ch in text])

In [6]:
# Print the character -> id table (at most the first 46 entries), formatted
# to look like a dict literal.
print('{')
for char in list(char2idx)[:46]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\t':   0,
  '\n':   1,
  ' ' :   2,
  'a' :   3,
  'b' :   4,
  'c' :   5,
  'd' :   6,
  'e' :   7,
  'f' :   8,
  'g' :   9,
  'h' :  10,
  'i' :  11,
  'j' :  12,
  'k' :  13,
  'l' :  14,
  'm' :  15,
  'n' :  16,
  'o' :  17,
  'p' :  18,
  'q' :  19,
  'r' :  20,
  's' :  21,
  't' :  22,
  'u' :  23,
  'v' :  24,
  'w' :  25,
  'x' :  26,
  'y' :  27,
  'z' :  28,
  ...
}


In [8]:
# Show the first 13 characters of the corpus next to their integer encoding.
print('{} ---- characters mapped to int ---- > {}'.format(
    repr(text[:13]),
    text_as_int[:13],
))

'fill me up wi' ---- characters mapped to int ---- > [ 8 11 14 14  2 15  7  2 23 18  2 25 11]


In [9]:
# Number of input characters in a single training example.
seq_length = 100
# Each example is cut from a chunk of seq_length + 1 characters
# (the input and the target are the same chunk shifted by one position).
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset that yields the corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five elements, decoded back to characters.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

f
i
l
l
 


In [10]:
# Group the character stream into chunks of seq_length + 1 ids; a trailing
# chunk that is shorter than that is dropped.
sequences = char_dataset.batch(
    seq_length + 1,
    drop_remainder=True,
)

# Decode the first five chunks back to text to check the chunking.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'fill me up with false hope \n cause i wish the world that i wasnt me \n with no direction at all \n im l'
'osing faith in everything \n \n by my alone time \n i can see for the first time \n hurtfull words wont g'
'o away \n \n i watch my dreams die off \n it hurts to believe that words are just words \n \n dwelling on '
'my own thoughts \n choking on self proclaimed asperation \n circumvent my own faults \n for shadows coll'
'apse in my heart \n \n reckling through my sunked life \n shift a flame to all the pain \n distant feelin'


In [11]:
# creates our dataset that we will fit our model to
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk in `sequences` into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [14]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64

# Size of the shuffle buffer that elements are drawn from.
BUFFER_SIZE = 10000

# Shuffle the examples, then group them into fixed-size batches; a final
# batch with fewer than BATCH_SIZE examples is dropped.
# NOTE: this rebinds `dataset`, so re-running the cell would batch the
# already-batched dataset again; re-run the cell that creates it first.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [15]:
# Number of distinct characters in the corpus; passed to build_model as the
# size of both the embedding input and the final Dense layer.
vocab_size = len(vocab)

# Size of the vector each character id is embedded into.
embedding_dim = 256

# Number of units in each LSTM layer of the model.
rnn_units = 512

In [16]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size,
                dropout=0.15, recurrent_dropout=0.15):
    """Build the character-level model: Embedding -> LSTM -> LSTM -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding's
            input size and as the width of the final Dense layer.
        embedding_dim: size of the embedding vector for each character id.
        rnn_units: number of units in each of the two LSTM layers.
        batch_size: fixed batch size, passed as the first entry of the
            embedding's `batch_input_shape` (the sequence length is left
            as None).
        dropout: `dropout` rate given to both LSTM layers. Defaults to the
            0.15 that was previously hard-coded.
        recurrent_dropout: `recurrent_dropout` rate given to both LSTM
            layers. Defaults to the 0.15 that was previously hard-coded.

    Returns:
        An uncompiled `tf.keras.Sequential` model. The final Dense layer has
        no activation, so its outputs are meant to be used as logits.
    """
    layers = tf.keras.layers

    # Both recurrent layers share exactly the same configuration.
    lstm_options = dict(
        return_sequences=True,
        stateful=True,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
    )

    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, embedding_dim,
                         batch_input_shape=[batch_size, None]),
        layers.LSTM(rnn_units, **lstm_options),
        layers.LSTM(rnn_units, **lstm_options),
        layers.Dense(vocab_size),
    ])
    return model

In [17]:
# Instantiate the training model from the hyperparameters chosen above.
# These values were settled on because they gave us the best results while
# still letting us fit all of our models in time.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)



In [19]:
# Print the layer-by-layer summary of the model; the recorded output reports
# 3,696,413 parameters in total (just under 4 million), all of them trainable.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           7424      
_________________________________________________________________
lstm (LSTM)                  (64, None, 512)           1574912   
_________________________________________________________________
lstm_1 (LSTM)                (64, None, 512)           2099200   
_________________________________________________________________
dense (Dense)                (64, None, 29)            14877     
Total params: 3,696,413
Trainable params: 3,696,413
Non-trainable params: 0
_________________________________________________________________


In [23]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    Loss function taken from the TensorFlow documentation. `labels` are the
    target values and `logits` are the model outputs; `from_logits=True` is
    passed because the model's final layer applies no activation.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Sanity check on the untrained model: pull one batch from the dataset and run
# it through the model, so that `target_example_batch` and
# `example_batch_predictions` are defined in this cell and it does not depend
# on variables left over from earlier (now removed) cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 29)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.3672702


In [24]:
# Compile the model with the Adam optimizer and the custom loss above, and
# report accuracy for each epoch.
# The targets are integer character ids (not one-hot vectors), so the sparse
# accuracy metric is bound explicitly instead of relying on how Keras resolves
# the 'accuracy' shortcut when the loss is a custom callable. The metric keeps
# the name 'accuracy' so the training logs and history keys are unchanged.
model.compile(
    optimizer='adam',
    loss=loss,
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')],
)

In [25]:
# Directory the checkpoints are written to.
# Pro tip: when working in Google Colab, save models to your Drive and not to
# the session storage, or you may lose all of your work.
checkpoint_dir = 'drive/My Drive/rap'

# File name pattern for the checkpoints; "{epoch}" is a placeholder that the
# checkpoint callback fills in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [26]:
# Number of training epochs passed to model.fit. The authors report that
# running ten epochs took around 3 to 4 hours.
EPOCHS=10

In [None]:
# Train the model on the batched dataset for EPOCHS epochs. The checkpoint
# callback is attached so weights are saved using checkpoint_prefix, and the
# object returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10

In [None]:
# Look up the most recent checkpoint in checkpoint_dir; as the last expression
# in the cell, the result is displayed as the cell's output.
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
# Rebuild the model with batch_size=1 and load the saved weights from the
# checkpoint directory into it.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint() returns None when no checkpoint is found; fail with
    # a clear message instead of an obscure error inside load_weights().
    raise FileNotFoundError(
        'No checkpoint found in {!r}; train the model first.'.format(checkpoint_dir))

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

### These model checkpoints will now be used in our frontend app to generate song lyrics based on genre.