In [2]:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Getting lyrics from parquet

In [3]:
# Location of the parquet shard that holds the lyrics
parquet_file = 'lyrics/beatles/0000.parquet'

import pandas as pd

# Load the shard; each entry of the 'lyrics' column is a newline-separated text
df = pd.read_parquet(parquet_file)
lyrics = df['lyrics'].values


def _clean_lyric(lyric):
    """Return `lyric` without blank lines and without lines starting with '['.

    Every kept line is terminated with a newline so consecutive lyrics can be
    concatenated directly. Lines starting with '[' are presumably section
    markers such as "[Chorus]" -- confirm against the dataset.
    """
    return "".join(
        line + '\n'
        for line in lyric.split('\n')
        if line != '' and not line.startswith('[')
    )


# Build the full text with a single join instead of growing a string
# with `+=` inside the loop
data = "".join(_clean_lyric(lyric) for lyric in lyrics)

# Preview only the beginning; displaying the whole corpus floods the output
data[:500]



## Building the Word Vocabulary


In [4]:
# Lower-case the whole text and break it into a list with one entry per line
corpus = data.lower().split(sep="\n")

# Show the resulting list of lines
print(corpus)



In [5]:
# Create a tokenizer and let it learn the vocabulary from every corpus line
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size; the extra 1 accounts for index `0`, the padding token
total_words = 1 + len(tokenizer.word_index)

print(f'word index dictionary: {tokenizer.word_index}')
print(f'total words: {total_words}')

total words: 2462


## Preprocessing the Dataset


In [6]:
# Collect every prefix (of length >= 2) of every tokenized line
input_sequences = []

for text_line in corpus:

    # Integer-encode the current line
    line_tokens = tokenizer.texts_to_sequences([text_line])[0]

    # Prefixes of length 2, 3, ..., len(line_tokens); the last token of each
    # prefix later becomes the label
    input_sequences.extend(
        line_tokens[:end] for end in range(2, len(line_tokens) + 1)
    )

# Length of the longest subphrase
max_sequence_len = max(map(len, input_sequences))

# Left-pad every subphrase to that common length
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Everything but the last column is the input; the last column is the label
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the labels over the whole vocabulary
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

max_sequence_len

29

In [7]:
# Get sample sentence
sample_line = corpus[0]
sentence = sample_line.split()
print(f'sample sentence: {sentence}')

# Encode the line with the tokenizer itself rather than a hand-written
# character filter. The manual filter kept '-' and dropped non-ASCII
# punctuation, which does not match the tokenizer's own cleaning, so
# `tokenizer.word_index[word]` could raise KeyError on such words.
token_list = tokenizer.texts_to_sequences([sample_line])[0]

# Print the token list
print(token_list)

sample sentence: ['shoot', 'me']
[191, 4]


In [8]:
# Index of the training example to inspect
elem_number = 6

# The padded input row for that example
sample_row = xs[elem_number]

# Show the raw indices and the words they decode to
print(f'token list: {sample_row}')
print(f'decoded to text: {tokenizer.sequences_to_texts([sample_row])}')


token list: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0 113  40 179]
decoded to text: ['here come old']


## Build the Model


In [9]:
# Layers of the network, in the order they are stacked
embedding_layer = Embedding(total_words, 512, input_length=max_sequence_len-1)
recurrent_layer = Bidirectional(LSTM(500))
output_layer = Dense(total_words, activation='softmax')

# Assemble the sequential model
model = Sequential([embedding_layer, recurrent_layer, output_layer])

# Adam optimizer with an explicit learning rate
adam = tf.keras.optimizers.Adam(learning_rate=0.01)

# Multi-class problem with one-hot labels, hence categorical crossentropy
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# Show the layer-by-layer summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 512)           1260544   
                                                                 
 bidirectional (Bidirection  (None, 1000)              4052000   
 al)                                                             
                                                                 
 dense (Dense)               (None, 2462)              2464462   
                                                                 
Total params: 7777006 (29.67 MB)
Trainable params: 7777006 (29.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Train the model


In [None]:
# Fit the model on the padded inputs and one-hot labels for 50 epochs
history = model.fit(x=xs, y=ys, epochs=50)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch index.

    Args:
        history: object whose `.history` mapping holds the per-epoch values
            (here, the value returned by `model.fit`).
        string: key of the metric to plot; also used as the y-axis label.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()


# Plot the training accuracy curve
plot_graphs(history, 'accuracy')

In [12]:
# Path of the saved model; the final cell of the notebook writes this file
MODEL_PATH = 'beatles.h5'

# Reload the saved model only when the file is present. On a fresh
# Restart & Run All the file has not been written yet, so the model built
# and trained in the cells above is kept instead of failing here.
if os.path.exists(MODEL_PATH):
    model = tf.keras.models.load_model(MODEL_PATH)
else:
    print(f'{MODEL_PATH} not found; keeping the in-memory model')

## Generating Text


In [13]:
# Text the generation starts from
seed_text = "My heart"

# How many prediction steps to run
next_words = 10

for _ in range(next_words):

    # Integer-encode the text generated so far
    encoded = tokenizer.texts_to_sequences([seed_text])[0]

    # Left-pad to the input length the model expects
    token_list = pad_sequences([encoded], maxlen=max_sequence_len-1, padding='pre')

    # Probability of every vocabulary index being the next word
    probabilities = model.predict(token_list)

    # Greedy choice: the index with the highest probability
    predicted = np.argmax(probabilities[0])

    # Index 0 is the padding token, so nothing is appended for it
    if predicted == 0:
        continue

    # Append the predicted word to the running text
    output_word = tokenizer.index_word[predicted]
    seed_text = seed_text + " " + output_word

# Show the generated text
print(seed_text)

My heart can think my dreams they find there's gone his day


In [17]:
# Define seed text
seed_text = "George"

# Define total words to predict
next_words = 100

# Number of highest-probability candidates to sample from at each step
top_k = 4

# Seeded generator so the sequence of sampled ranks is the same on every run
rng = np.random.default_rng(42)

# Loop until desired length is reached
for _ in range(next_words):

    # Convert the seed text to a token sequence
    token_list = tokenizer.texts_to_sequences([seed_text])[0]

    # Pad the sequence
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Feed to the model and get the probabilities for each index
    probabilities = model.predict(token_list)

    # Pick a rank uniformly from 1..top_k (1 = most probable candidate)
    choice = rng.integers(1, top_k + 1)

    # Sort the indices by ascending probability and take the
    # `choice`-th one counted from the end of the array
    predicted = np.argsort(probabilities)[0][-choice]

    # Ignore if index is 0 because that is just the padding.
    if predicted != 0:
        output_word = tokenizer.index_word[predicted]
        seed_text += " " + output_word

# Print the result
print(seed_text)

George to take to kiss you on you let your face i come waltz kiss the kiss on come come on come come the ground calls the phone jai appointment inciting ene paper he’d new misunderstanding aw ice although l misunderstanding aw they're both this specimen sunshine resting beyond screaming baby biding nothing's birthday yawn cried inside letters loving picking damn silent supposed cranberry screaming both treasure compared or treasure aw touch biding worse than this them laugh 7 eyeball screaming both aw begins tighter shelter lift treasure messrs however biding ow behind check cooking wooo blues nothing's past biding across


In [None]:
# Write the current model to 'beatles.h5', the same path that the
# `load_model` cell earlier in the notebook reads from.
model.save('beatles.h5')

  saving_api.save_model(
