In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Other imports for processing data
import string
import numpy as np
import pandas as pd

In [2]:
def create_lyrics_corpus(dataset, field):
    # Remove all other punctuation
    dataset[field] = dataset[field].str.replace('[{}]'.format(string.punctuation), '')
    # Make it lowercase
    dataset[field] = dataset[field].str.lower()
    # Make it one long string to split by line
    lyrics = dataset[field].str.cat()
    corpus = lyrics.split('\n')
    # Remove any trailing whitespace
    for l in range(len(corpus)):
        corpus[l] = corpus[l].rstrip()
    # Remove any empty lines
    corpus = [l for l in corpus if l != '']
    return corpus

In [5]:
#Load Data
dataset = pd.read_csv('../Lyric-Generation-Project/lyric_corpus.csv', dtype=str)[:-1]
#Create Corpus
corpus = create_lyrics_corpus(dataset, 'Lyrics')

  dataset[field] = dataset[field].str.replace('[{}]'.format(string.punctuation), '')


In [6]:
#Now the corpus is split by line!
corpus[111:121]

['chorus',
 'the last time i saw you',
 'i tried to move right through the crowd',
 'and i was calling your name',
 'but the band played too loud',
 'the last time i saw you',
 'i tried to move right through the crowd',
 'was calling your name',
 'but the band played loud',
 'outro']

In [7]:
# There are 43709 lines in the entire corpus
len(corpus)

43709

In [8]:
# Create a Tokenizer and fit it on corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [9]:
#clean up the tokezied word list
del tokenizer.word_index['lyricsverse']
del tokenizer.word_index['lyricsi']
del tokenizer.word_index['likeembeda']
del tokenizer.word_index['lyricschorus']
del tokenizer.word_index['likebridge']
del tokenizer.word_index['youyou']
del tokenizer.word_index['likeoutro']
del tokenizer.word_index['likeembedthe']
del tokenizer.word_index['lyricsinstrumental']
del tokenizer.word_index['lyricsintro']
del tokenizer.word_index['likechorus']
del tokenizer.word_index['91you']
del tokenizer.word_index['liveget']
del tokenizer.word_index['1']
del tokenizer.word_index['2']
del tokenizer.word_index['3']

In [10]:
#first few words are common pronouns, conjunctions, articles, etc
print("Top 10 words: ", list(tokenizer.word_index)[0:10])

#naturally, words that are less common are more interesting
print("Top 10 words: ", list(tokenizer.word_index)[250:260])

#total amount of words
print("Word Count: ", len(tokenizer.word_index))
total_words = len(tokenizer.word_index) + 1

Top 10 words:  ['the', 'i', 'you', 'and', 'to', 'a', 'me', 'in', 'it', 'my']
Top 10 words:  ['room', 'dreams', 'doo', 'floor', 'man', 'wanted', 'waiting', 'happy', 'rain', 'sure']
Word Count:  10550


In [11]:
sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        sequences.append(n_gram_sequence)

# Pad sequences for equal input length 
max_sequence_len = max([len(seq) for seq in sequences])
sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))

# Split sequences between the "input" sequence and "output" predicted word
input_sequences, labels = sequences[:,:-1], sequences[:,-1]
# One-hot encode the labels
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

IndexError: index 10551 is out of bounds for axis 1 with size 10551

In [13]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

# One-hot encode the labels
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(20)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(input_sequences, one_hot_labels, epochs=5, verbose=1)

Epoch 1/5
  76/7432 [..............................] - ETA: 23:34 - loss: 8.4684 - accuracy: 0.0341

KeyboardInterrupt: 

In [1]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()

plot_graphs(history, 'accuracy')

NameError: ignored

In [None]:
seed_text = "im feeling chills"
next_words = 100
  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)

InvalidArgumentError:  indices[0,278] = 3642 is not in [0, 2000)
	 [[node sequential_8/embedding_4/embedding_lookup (defined at <ipython-input-95-db9d428ed0a8>:7) ]] [Op:__inference_predict_function_2316624]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_8/embedding_4/embedding_lookup:
 sequential_8/embedding_4/embedding_lookup/2315729 (defined at /Users/weatherford/opt/anaconda3/envs/learn-env/lib/python3.8/contextlib.py:113)

Function call stack:
predict_function


In [None]:
# Test the method with just the first word after the seed text
seed_text = "im feeling chills"
next_words = 100
  
token_list = tokenizer.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
predicted_probs = model.predict(token_list)[0]
predicted = np.random.choice([x for x in range(len(predicted_probs))], 
                             p=predicted_probs)
# Running this cell multiple times should get you some variance in output
print(predicted)

InvalidArgumentError:  indices[0,278] = 3642 is not in [0, 2000)
	 [[node sequential_8/embedding_4/embedding_lookup (defined at <ipython-input-95-db9d428ed0a8>:7) ]] [Op:__inference_predict_function_2316624]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_8/embedding_4/embedding_lookup:
 sequential_8/embedding_4/embedding_lookup/2315729 (defined at /Users/weatherford/opt/anaconda3/envs/learn-env/lib/python3.8/contextlib.py:113)

Function call stack:
predict_function


In [None]:
# Use this process for the full output generation
seed_text = "im feeling chills"
next_words = 100
  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted_probs = model.predict(token_list)[0]
    predicted = np.random.choice([x for x in range(len(predicted_probs))],
                               p=predicted_probs)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)

InvalidArgumentError:  indices[0,278] = 3642 is not in [0, 2000)
	 [[node sequential_8/embedding_4/embedding_lookup (defined at <ipython-input-95-db9d428ed0a8>:7) ]] [Op:__inference_predict_function_2316624]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_8/embedding_4/embedding_lookup:
 sequential_8/embedding_4/embedding_lookup/2315729 (defined at /Users/weatherford/opt/anaconda3/envs/learn-env/lib/python3.8/contextlib.py:113)

Function call stack:
predict_function
