Reference: word-level LSTM text generator tutorial (Coinmonks, Medium) — https://medium.com/coinmonks/word-level-lstm-text-generator-creating-automatic-song-lyrics-with-neural-networks-b8a1617104fb

In [1]:
from __future__ import print_function
from keras.callbacks import LambdaCallback,ModelCheckpoint,EarlyStopping
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation
from keras.layers import LSTM,Bidirectional
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
import pandas as pd

Using TensorFlow backend.


In [2]:
# Obtain a local path to the Nietzsche corpus through Keras' get_file helper.
path = get_file('nietzsche.txt',
                origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

# Read the whole file as UTF-8, then normalise it to lower case.
with io.open(path, encoding='utf-8') as f:
    text = f.read()
text = text.lower()
print('corpus length:', len(text))

# Work on the first 60000 characters only for now; the length printed above
# is that of the full corpus, not of this sample.
text = text[:60000]

corpus length: 600893


In [4]:
# Split the corpus on single spaces. Newlines are padded with spaces first so
# that every '\n' survives as a token of its own; fragments that are empty or
# whitespace-only (other than '\n') are dropped.
text_in_words = [
    token
    for token in text.replace('\n', ' \n ').split(' ')
    if token == '\n' or token.strip() != ''
]

In [5]:
# Character-level vocabulary of the sampled text, with lookups in both directions.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {ch: pos for pos, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

total chars: 52


In [6]:
MIN_WORD_FREQUENCY = 3

# Calculate word frequency: number of occurrences of every token in the corpus.
word_freq = {}
for word in text_in_words:
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1

# Tokens seen fewer than MIN_WORD_FREQUENCY times are kept out of the vocabulary.
ignored_words = {
    word for word, count in word_freq.items() if count < MIN_WORD_FREQUENCY
}

words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(words - ignored_words)
print('Unique words after ignoring:', len(words))

# Lookups between a word and its position in the sorted vocabulary.
word_indices = {word: idx for idx, word in enumerate(words)}
indices_word = dict(enumerate(words))

Unique words before ignoring: 3179
Ignoring words with frequency < 3
Unique words after ignoring: 468


In [7]:
# Slide a window of SEQUENCE_LEN + 1 words over the corpus in steps of STEP:
# the first SEQUENCE_LEN words are the input sequence, the last one is the
# word to predict.
SEQUENCE_LEN = 4
STEP = 1
sentences = []
next_words = []
ignored = 0
for start in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    window = text_in_words[start: start + SEQUENCE_LEN + 1]
    # Drop the window if any of its words (inputs or target) is an ignored word.
    if not ignored_words.isdisjoint(window):
        ignored += 1
        continue
    sentences.append(window[:SEQUENCE_LEN])
    next_words.append(window[SEQUENCE_LEN])
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

Ignored sequences: 9108
Remaining sequences: 1643


In [8]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    """Shuffle sentences and their next words in unison and split them.

    Args:
        sentences_original: sequence of input sentences.
        next_original: sequence of target words, aligned index-by-index with
            ``sentences_original``.
        percentage_test: share of the data, in percent, placed in the test
            split (taken from the end of the shuffled order).

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of lists.
    """
    print('Shuffling sentences')

    # A single permutation drives both lists so each pair stays aligned.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[pos] for pos in order]
    shuffled_next = [next_original[pos] for pos in order]

    train_fraction = 1. - (percentage_test / 100.)
    cut_index = int(len(sentences_original) * train_fraction)

    x_train = shuffled_sentences[:cut_index]
    x_test = shuffled_sentences[cut_index:]
    y_train = shuffled_next[:cut_index]
    y_test = shuffled_next[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return x_train, y_train, x_test, y_test

In [9]:
# Rebind `sentences` / `next_words` to the shuffled training portion and keep
# the remainder as the test split. NOTE: because the inputs are overwritten,
# running this cell a second time re-splits the already reduced training set.
(sentences, next_words,
 sentences_test, next_words_test) = shuffle_and_split_training_set(
    sentences_original=sentences,
    next_original=next_words,
)

Shuffling sentences
Size of training set = 1610
Size of test set = 33


In [10]:
# Network: bidirectional LSTM over one-hot windows of SEQUENCE_LEN words,
# optional dropout, then a softmax over the whole vocabulary.
dropout = 0.2
layer_stack = [Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, len(words)))]
if dropout > 0:
    layer_stack.append(Dropout(dropout))
layer_stack.append(Dense(len(words)))
layer_stack.append(Activation('softmax'))
# Sequential adds the layers in list order.
model = Sequential(layer_stack)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [11]:
def generator(sentence_list, next_word_list, batch_size,generate_labels=True):
    """Yield one-hot encoded batches forever, cycling through ``sentence_list``.

    Relies on the module-level ``SEQUENCE_LEN``, ``words`` and ``word_indices``.

    Args:
        sentence_list: list of word sequences; each word must be a key of
            ``word_indices``.
        next_word_list: target word for each sentence (same order); only read
            when ``generate_labels`` is true.
        batch_size: number of samples per yielded batch.
        generate_labels: when true yield ``(x, y)``, otherwise yield ``x`` only.

    Yields:
        ``x`` of shape ``(batch_size, SEQUENCE_LEN, len(words))`` and, if
        requested, ``y`` of shape ``(batch_size, len(words))``, both boolean.
        The read position wraps to the start of ``sentence_list`` when the end
        is reached, so a batch may mix the tail and the head of the data.
    """
    index = 0
    while True:
        # Builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        x = np.zeros((batch_size, SEQUENCE_LEN, len(words)), dtype=bool)
        # The label array is only needed (and only yielded) when labels are requested.
        y = np.zeros((batch_size, len(words)), dtype=bool) if generate_labels else None
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, word_indices[w]] = 1
            if generate_labels:
                y[i, word_indices[next_word_list[index]]] = 1

            # Advance and wrap around at the end of the data.
            index = (index + 1) % len(sentence_list)
        if generate_labels:
            yield x, y
        else:
            yield x

In [12]:
# Checkpoint filename template. The %d fields are filled in here; the {...}
# fields are left in place for ModelCheckpoint to format when it saves.
file_path = "checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % (
    len(words),
    SEQUENCE_LEN,
    MIN_WORD_FREQUENCY
)

# Number of training epochs. This used to be stored in a variable called
# `on_epoch_end`, which was then also handed to LambdaCallback as if it were a
# callback function; an int there fails with TypeError as soon as the callback
# is attached to training.
EPOCHS = 30


def on_epoch_end(epoch, logs):
    """Per-epoch hook for LambdaCallback: print the metrics logged for the epoch.

    Args:
        epoch: zero-based epoch index passed by Keras.
        logs: dict of metric values for the finished epoch.
    """
    print('epoch %d finished: %s' % (epoch + 1, logs))


#checkpoint = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
#callbacks_list = [checkpoint, print_callback, early_stopping]
#callbacks_list = [print_callback]

optimizer = RMSprop(lr=0.05)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,metrics=["accuracy"])


BATCH_SIZE = 5
# The "+ 1" adds a final step for the partial batch; the generators wrap
# around, so that step is padded with samples from the start of the data.
model.fit_generator(
    generator(sentences, next_words, BATCH_SIZE),
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
    epochs=EPOCHS,
    #callbacks=callbacks_list,
    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1
)


Instructions for updating:
Use tf.cast instead.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0xb2a1daef0>

In [13]:
def validate_seed(vocabulary, seed):
    """Check a space-separated seed against the vocabulary, word by word.

    Prints a header, then one line per seed word saying whether it is in
    ``vocabulary``. Returns True only if every word was found.
    """
    print("\nValidating that all the words in the seed are part of the vocabulary: ")
    all_known = True
    for token in seed.split(" "):
        if token in vocabulary:
            status = " ✓ in vocabulary"
        else:
            status = " ✗ NOT in vocabulary"
            all_known = False
        print(token, end="")
        print(status)
    return all_known

In [14]:
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature rescaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` selects the most probable index deterministically.

    Returns:
        The sampled index (NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of temperature -> 0: all probability mass moves to the arg-max.
        # Dividing by zero below would only produce NaNs.
        return np.argmax(preds)
    # log(0) yields -inf, which maps back to probability 0 after exp();
    # that is the intended result, so the divide-by-zero warning is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating. Without it, a very small
    # temperature underflows every term to 0 and the normalisation becomes 0/0.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [18]:
quantity = 25 #quantity of words to generate
# Pick a random training sentence as seed. random.randint includes BOTH
# bounds, so the upper bound must be len(sentences) - 1; passing
# len(sentences) could index one past the end and raise IndexError.
sentence = sentences[random.randint(0, len(sentences) - 1)]
print(sentence)
for i in range(quantity):
    # One-hot encode the current window of SEQUENCE_LEN words.
    x_pred = np.zeros((1, SEQUENCE_LEN, len(words)))
    for t, word in enumerate(sentence):
        x_pred[0, t, word_indices[word]] = 1

    preds = model.predict(x_pred, verbose=0)[0]
    # Temperature 2 flattens the predicted distribution before sampling.
    next_index = sample(preds, 2)
    next_word = indices_word[next_index]

    # Slide the window: drop the oldest word, append the new one. Slicing
    # creates a new list, so the seed stored in `sentences` is not modified.
    sentence = sentence[1:]
    sentence.append(next_word)

    print(" "+next_word, end="")
    #print("\n")


['at', 'the', 'same', 'time']
 when this world still 
 the work want all the will thought of this 
 metaphysical is not believe in the 
 let good is

In [128]:
# Scratch cell (execution count is out of sequence with the rest of the notebook):
# sets the one-hot slot of the word 'last' at timestep 1 of the single sample
# held in x_pred. Raises KeyError if 'last' is not a key of word_indices.
x_pred[0,1,word_indices['last']] = 1

In [27]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

# Toy corpus of ten short phrases: five praising, five critical.
docs = [
    'Well done!',
    'Good work',
    'nice work',
    'Great effort',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# define class labels: 1 for the five praising phrases, 0 for the five critical ones
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# integer encode the documents, mapping each word into the range given by vocab_size
vocab_size = 50
encoded_docs = [one_hot(doc, vocab_size) for doc in docs]
print(encoded_docs)

# Pad every encoded document at the end ('post') so all rows have max_length entries.
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[26, 7], [48, 38], [30, 38], [48, 41], [22], [31], [11, 41], [16, 48], [11, 38], [47, 11, 7, 22]]
[[26  7  0  0]
 [48 38  0  0]
 [30 38  0  0]
 [48 41  0  0]
 [22  0  0  0]
 [31  0  0  0]
 [11 41  0  0]
 [16 48  0  0]
 [11 38  0  0]
 [47 11  7 22]]
