In [4]:
import numpy as np
import pandas as pd
import re
import os
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

Only run this cell if Keras runs into GPU memory problems; it configures how TensorFlow allocates GPU memory for the Keras session.

In [5]:
# Session configuration for Keras' TensorFlow backend.
# log_device_placement logs on which device each operation ran
# (nothing gets printed in Jupyter, only if you run it standalone).
config = tf.ConfigProto(log_device_placement=True)
gpu = config.gpu_options
gpu.allow_growth = True  # dynamically grow the memory used on the GPU
gpu.per_process_gpu_memory_fraction = 0.9

# NOTE(review): run_options is created but not passed to anything in the visible notebook.
run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)

# Register a session built from this config as the default session for Keras.
sess = tf.Session(config=config)
set_session(sess)

In [6]:
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Activation, Flatten, Bidirectional, LSTM

These are lyrics all belonging to one artist (Mitski) from Genius that I processed and cleaned. (If you want the script for that I can send it too, if you're ever looking to do something with lyrics!)

In [8]:
# The CSV was saved together with its DataFrame index, which otherwise comes back
# as a meaningless 'Unnamed: 0' data column; read it as the index instead.
lyrics_df = pd.read_csv('mitski_lyrics.csv', index_col=0)
lyrics_df.head()

Unnamed: 0,album,song,l_str
0,Be the Cowboy,A Horse Named Cold Air,a lake with no fish \r\n is the heart of a hor...
1,Be the Cowboy,A Pearl,you are growing tired of me \r\n you love me s...
2,Be the Cowboy,Blue Light,somebody kiss me i am going crazy \r\n i am wa...
3,Be the Cowboy,Come Into the Water,come into the water \r\n do you wanna be my ba...
4,Be the Cowboy,Geyser,you are my number one \r\n you are the one i w...


Combines all of the song lyrics into one large string. Adds "END_OF_SONG" and removes any extra whitespace via regex.

In [9]:
# Concatenate every song into one corpus string, with an END_OF_SONG marker
# token between consecutive songs.
all_text = " END_OF_SONG ".join(lyrics_df['l_str'])
# all_text = re.sub('\\r\\n','LINE_BREAK',all_text)
# Collapse every run of two or more whitespace characters (this includes the
# "\r\n" line breaks) into a single space. The pattern is a raw string because
# '\s' inside a plain string literal is an invalid escape sequence.
all_text = re.sub(r'\s{2,}', ' ', all_text)

In [10]:
# Tokenize on single spaces. Unlike a bare .split(), .split(" ") keeps an
# empty-string token when all_text starts or ends with a space, so '' can end
# up in the vocabulary built from this list.
splitted = all_text.split(" ")

`word_counts` is a dictionary mapping each word in the vocabulary to the number of times it occurs. This could be used to filter out rare words, but I do not do that with this limited dataset.

In [11]:
# Tally how many times each token occurs in the corpus.
word_counts = {}
for word in splitted:
    if word in word_counts:
        word_counts[word] += 1
    else:
        word_counts[word] = 1

In [12]:
# Number of distinct tokens in the corpus, i.e. the vocabulary size.
len(word_counts)

997

These are dictionaries for converting a word from its string representation to its index representation and vice versa.

Mainly used for OHE and for the class of 'y', not as necessary in Embedding.

In [13]:
words = set(splitted)

# Iteration order of a set of strings can differ from one interpreter run to the
# next (string hash randomization), which would give every kernel restart a
# different word <-> index mapping. Enumerating a sorted copy keeps the mapping
# stable, so indices (and any model trained on them) stay comparable across runs.
vocabulary = sorted(words)
word_to_index = {word: index for index, word in enumerate(vocabulary)}
index_to_word = dict(enumerate(vocabulary))

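Goes through the text corpus with a rolling window of `sequence_length` words (10 here) as the input (x), grabbing the word immediately after the window as the 'next word' (y).
If the window or the next word contains the end-of-song marker, that example is skipped, so no training example spans two songs.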

In [14]:
sequence_length = 10

# Slide a window of `sequence_length` tokens over the corpus. Each window is an
# input example and the token right after it is the target. Any example that
# touches the END_OF_SONG marker (in the window or as the target) is dropped.
sequences = []
next_words = []
for start in range(len(splitted) - sequence_length):
    stop = start + sequence_length
    window = splitted[start:stop]
    target = splitted[stop]
    if target != 'END_OF_SONG' and 'END_OF_SONG' not in window:
        sequences.append(window)
        next_words.append(target)

In [15]:
# Spot-check two training examples: the input window (X) and its target word (Y).
for example_index in (323, 35):
    print('X: {}, Y: {}'.format(sequences[example_index], next_words[example_index]))

X: ['i', 'am', 'the', 'same', 'as', 'all', 'those', 'men', 'writing', 'songs'], Y: of
X: ['of', 'me', 'you', 'love', 'me', 'so', 'hard', 'and', 'i', 'still'], Y: ca


In [16]:
# Number of training examples kept after skipping windows that touch END_OF_SONG.
len(sequences)

7246

Creates x -> zero array of rows * sequence_length * vocab_size

Creates y -> array of next_words (indexed)

One hot encodes entries

In [17]:
def word_convert(word):
    """Return the integer vocabulary index of `word`.

    Raises KeyError if `word` is not a key of `word_to_index`.
    """
    return word_to_index[word]

# x: one-hot inputs of shape (number of sequences, sequence_length, vocabulary size).
x = np.zeros((len(sequences), sequence_length, len(words)))
# y: vocabulary index of each target word. The builtin `int` replaces the
# `np.int` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
y = np.fromiter(map(word_convert, next_words), dtype = int)

for i in range(0,len(sequences)):
    sequence_indexed = list(map(word_convert, sequences[i]))
    for j in range(0,sequence_length):
        x[i, j, sequence_indexed[j]] = 1

# Pin the number of classes to the vocabulary size. Left to infer it,
# to_categorical uses max(y) + 1, which is too small whenever the highest-indexed
# word never occurs as a target, and then no longer matches the model's output layer.
y_ohe = to_categorical(y, num_classes = len(words))

Loading Embeddings

In [18]:
# Parse the GloVe text file into {token: float32 vector}. On each line the first
# whitespace-separated field is the token and the remaining fields are its vector.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
with open('glove.6B/glove.6B.100d.txt', encoding = 'utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.array(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

len(embeddings_index)

400000

Converts x into embedding representation. 

x_emb is a matrix of n * sequence_length * embedding_space

In [19]:
def embedding_convert(word):
    """Return the pre-trained embedding vector stored for `word`.

    Raises KeyError if `word` has no entry in `embeddings_index`.
    """
    return embeddings_index[word]

# x_emb: (number of sequences, sequence_length, embedding dimension); the
# dimension is read off the vector stored for 'the'.
x_emb = np.zeros((len(sequences), sequence_length, len(embeddings_index['the'])))

for i in range(0,len(sequences)):
    sequence_embedded = list(map(embedding_convert, sequences[i]))
    # A single assignment fills the whole (sequence_length, dim) slab for this
    # example; the former inner loop over j only repeated this same assignment.
    x_emb[i, :] = sequence_embedded

In [20]:
# Sanity check: one target index per training example.
y.shape

(7246,)

In [21]:
# Sanity check: (examples, sequence_length, embedding dimension).
x_emb.shape

(7246, 10, 100)

In [22]:
# Sanity check: (examples, number of classes); the second value must equal the
# width of the models' final Dense layer.
y_ohe.shape

(7246, 997)

## One Hot Encoded Version of the Model

In [33]:
# Bidirectional LSTM over the one-hot windows, followed by a softmax over the vocabulary.
model_ohe = Sequential()
model_ohe.add(Bidirectional(LSTM(32), input_shape=(sequence_length, len(words))))
model_ohe.add(Dense(len(words)))
model_ohe.add(Activation('softmax'))

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model_ohe.summary()

model_ohe.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): shuffle=False feeds the overlapping windows in corpus order —
# confirm this is intentional.
model_ohe.fit(x, y_ohe, batch_size = 64, epochs = 100, shuffle = False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_3 (Bidirection (None, 64)                263680    
_________________________________________________________________
dense_3 (Dense)              (None, 997)               64805     
_________________________________________________________________
activation_3 (Activation)    (None, 997)               0         
Total params: 328,485
Trainable params: 328,485
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
E

Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1e089db8470>

## Embedded Version of the Model

In [34]:
# Bidirectional LSTM over the pre-embedded windows, followed by a softmax over the vocabulary.
model_emb = Sequential()
model_emb.add(Bidirectional(LSTM(32), input_shape=(sequence_length, len(embeddings_index['the']))))
# Size the output layer from the vocabulary instead of a hard-coded 997, so it
# keeps matching the data if the corpus changes (same as the OHE model).
model_emb.add(Dense(len(words)))
model_emb.add(Activation('softmax'))

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model_emb.summary()

model_emb.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): shuffle=False feeds the overlapping windows in corpus order —
# confirm this is intentional.
model_emb.fit(x_emb, y_ohe, batch_size = 64, epochs = 100, shuffle = False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_4 (Bidirection (None, 64)                34048     
_________________________________________________________________
dense_4 (Dense)              (None, 997)               64805     
_________________________________________________________________
activation_4 (Activation)    (None, 997)               0         
Total params: 98,853
Trainable params: 98,853
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epo

Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1e08f28bba8>

# Predicting with the LSTM

Insert some words into 'seed_sentences' and hit run. You can change the 'effect_weight' (second parameter) of the 'choose_next_word()' function to add more or less randomness.

In [35]:
# This helps randomize the output during predictions so it is not always picking the top word.
# I found this here: https://medium.com/coinmonks/word-level-lstm-text-generator-creating-automatic-song-lyrics-with-neural-networks-b8a1617104fb

def choose_next_word(preds, effect_weight=1.0):
    """Sample a vocabulary index from `preds` after rescaling its randomness.

    Parameters
    ----------
    preds : array-like of float
        Prediction array (probabilities) over the possible next words.
    effect_weight : float, default 1.0
        Weight controlling how "random" the pick is. Closer to 0 means less
        random (more likely to use stop words); a value <= 0 disables sampling
        and returns the most probable index.

    Returns
    -------
    integer index into `preds` of the chosen word.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a non-positive weight is undefined; treat it as "no randomness".
    if effect_weight <= 0:
        return np.argmax(preds)

    # log(0) is -inf, which correctly turns back into probability 0 below, so
    # silence only that warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / effect_weight
    # Shift so the largest entry is 0 before exponentiating. Without this, a
    # very small effect_weight makes every exp() underflow to 0 and the
    # normalisation below yields NaN.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds) # Re-normalize probability similar to softmax
    probas = np.random.multinomial(1, preds, 1) # One draw: a one-hot row marking the sampled index
    return np.argmax(probas) # Index of the sampled entry

### OHE Version

In [43]:
vocab_size = len(words)
words_number = 60 # number of words to generate
seed_sentences = "there is a man on the moon" #seed sentence to start the generating.

# Shape the seed to the window the network expects: keep at most the last
# `sequence_length` seed words and left-pad any remaining slots with "a".
# (Index-by-index filling wrapped around via negative indices and corrupted the
# window when the seed was longer than `sequence_length`.)
seed = seed_sentences.split()[-sequence_length:]
sentence = ["a"] * (sequence_length - len(seed)) + seed

generated = ' '.join(sentence)

# Then generate the text one word at a time.
for i in range(words_number):
    # One-hot encode the current window; a word missing from the vocabulary raises KeyError.
    new_line = np.zeros((1, sequence_length, vocab_size))
    for t, word in enumerate(sentence):
        new_line[0, t, word_to_index[word]] = 1

    # Sample the next word from the model's predicted distribution.
    preds = model_ohe.predict(new_line, verbose=0)[0]
    next_index = choose_next_word(preds, 0.9)
    next_word = index_to_word[next_index]

    # Append the new word to the output text.
    generated += " " + next_word
    # Shift the window by one and put the new word at its end.
    sentence = sentence[1:] + [next_word]

# Print the whole text.
print(generated)

a a a there is a man on the moon i will make every years just finally sleep other waiting for all i have to scream your scream how i could stand a outside and do i have been hungry better neat only down owns out looking out i can finally stay big and two twenty morning world the night i just need the quiet of rotting i told you


### Embedding Version

In [42]:
vocab_size = len(words)
words_number = 200 # number of words to generate
seed_sentences = "the man on the moon" #seed sentence to start the generating.

# Shape the seed to the window the network expects: keep at most the last
# `sequence_length` seed words and left-pad any remaining slots with "a".
# (Index-by-index filling wrapped around via negative indices and corrupted the
# window when the seed was longer than `sequence_length`.)
seed = seed_sentences.split()[-sequence_length:]
sentence = ["a"] * (sequence_length - len(seed)) + seed

generated = ' '.join(sentence)

embedding_dim = len(embeddings_index['the'])

# Then generate the text one word at a time.
for i in range(words_number):
    # Embed the current window; a word without an embedding raises KeyError.
    new_line = np.zeros((1, sequence_length, embedding_dim))
    for t, word in enumerate(sentence):
        new_line[0, t] = embedding_convert(word)

    # Predict once per step: the window does not change while resampling below.
    preds = model_emb.predict(new_line, verbose=0)[0]
    next_index = choose_next_word(preds,1)
    next_word = index_to_word[next_index]

    # The empty-string token cannot be fed back in (embedding_convert is called
    # on every window word), so resample from the same prediction with a lower
    # weight until a real word comes out.
    while next_word == '':
        next_index = choose_next_word(preds, 0.75)
        next_word = index_to_word[next_index]

    # Append the new word to the output text.
    generated += " " + next_word
    # Shift the window by one and put the new word at its end.
    sentence = sentence[1:] + [next_word]

# Print the whole text.
print(generated)

a a a a a the man on the moon all why we work i am i am gonna waiting for a way i am waiting someone in not i may never down in my head all you will come him how my hand is losing more and i think all it is a of all empty of my life but i am doing even secret of something i just what you if i have been down i makes not just i am looking i i am chasing to go my body by you if i love more more more when i do not know at the white he first met your mother would you have you to watch me but i could see that i do not you a hundred times way inside me so and it is a little man it i do not be but i am done to silence the city i cannot see your fate tonight i just need than you wanna that you know that you want to you if you come at you are take me and i know you darling we i need tell you that you choose me baby you not come to me what my body wants me there is


In [None]:
# NOTE(review): unexecuted cell holding only a bare string literal — presumably
# generated text saved from an earlier run; confirm, or move it to a markdown cell.
"nobody wants to love and me i don't my there's once you me go i die and your want home and you it's i all think with scared know it whole you so me me and and me would me i i a you too starting you when i as you you it you through you and if you the alone i i i would you work hold away and me of you the i all you i i all hands your so i the what to slow i and to be me i'm you're me i all yet i my up you it of the from me in as my my but doo from danced take in you're my the me i table a"