In [1]:
## Note: sequence generating code is taken from machinelearningmastery.com
## the model was written by us using keras

In [2]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Embedding
import keras
import random
from keras.callbacks import ModelCheckpoint
from Utility import Utility
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# 2 layer LSTM

In [3]:
util = Utility()
#sonnets = util.get_collab()
sonnets = util.get_other('data/limericks.txt')

In [4]:
sonnetString = []
for i in sonnets:
    sA = sonnets[i]
    sonnetString.append("")
    for k in sA:
        for j in k:
            sonnetString[-1] += j + " "

In [5]:
sequences = []
bigdata = ' '.join(sonnetString)

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([bigdata])

sequences = []
for data in sonnetString:
    encoded = tokenizer.texts_to_sequences([data])[0]
    # retrieve vocabulary size
    vocab_size = len(tokenizer.word_index) + 1
    # encode 3 words -> 1 word
    
    for i in range(3, len(encoded)):
        sequence = encoded[i-3:i+1]
        sequences.append(sequence)
    
print('Total Sequences in sonnet: %d' % len(sequences))
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

Total Sequences in sonnet: 27503


In [394]:
# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(75, return_sequences=True))
model.add(Dropout(0.25))
model.add(LSTM(75))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
model.load_weights("check_lim/word-201.hdf5")
# compile network
adam = keras.optimizers.Adam(lr=1e-3)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 3, 10)             52470     
_________________________________________________________________
lstm_15 (LSTM)               (None, 3, 75)             25800     
_________________________________________________________________
dropout_8 (Dropout)          (None, 3, 75)             0         
_________________________________________________________________
lstm_16 (LSTM)               (None, 75)                45300     
_________________________________________________________________
dense_8 (Dense)              (None, 5247)              398772    
Total params: 522,342
Trainable params: 522,342
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
filepath="word-{epoch:03d}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model.fit(X, y, epochs=999, batch_size=128,callbacks=callbacks_list)

In [20]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds+0.001) / temperature
    exp_preds = np.exp(preds)
    if(np.sum(exp_preds) != 0):
        preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [411]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word
        prediction = model.predict(encoded, verbose=0)[0]
        yhat = sample(prediction, 0.6)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
    return in_text

In [412]:
# We are training on actual words here, so can look up their syllable counts from 
# the dictionary, to perform some automatic post-processing

# Add in line breaks, punctuation, capitalization and truncation where appropriate
def post_process(sequence, max_syll=10, max_lines = 14):
    # Load syllable dictionary
    d_int = {}
    with open('data/syllable_dict_all.txt') as f:
        for line in f:
            arr = line.split()
            if arr[0] not in d_int.keys():
                d_int[arr[0]] = ' '.join(arr[1:])

    output_words = sequence.split()
    output_sonnet = output_words[0].capitalize()
    syll_counter = int(d_int[output_words[0]][-1])
    line_counter = 1

    for j in range(1,len(output_words)):
        syll_counter += int(d_int[output_words[j]][-1])
        if syll_counter > max_syll:
            # Add a sentence after each 4th line/the final line
            # Randomly add commas at ends of other lines
            if (line_counter % 4 == 0) or (line_counter == max_lines):
                output_sonnet += '.\n'
            else: 
                if random.randint(0,1) == 1:
                    output_sonnet += ',\n'
                else:
                    output_sonnet += '\n'

            # Roll over to next line, or stop if we have enough lines
            line_counter += 1
            if line_counter <= max_lines:           
                output_sonnet += output_words[j].capitalize()
            else:
                break
            syll_counter = 0
        else:
            # Capitalize i and i'll
            if (output_words[j] == 'i') or (output_words[j] == "i'll"):
                output_sonnet += ' ' + output_words[j].capitalize()
            else:
                output_sonnet += ' ' + output_words[j]

    return output_sonnet


In [416]:
for j in range(10):
    sonnet_raw = generate_seq(model, tokenizer, max_length-1, "deez nuts", 150)
    print(post_process(sonnet_raw))

Deez nuts to toronto day his hairy
But able and ill eager girls' to large,
Profane strong o wench's lies forget'st snatch they to
Blowed the rest her would delight as he ploughed through.
The shite of cunts it was fucking and quite,
Kneeling men to dearest painting strength to in
Posterity he not soundless would par I say thee,
Maketh be or quickly frighten and from in.
Wondrous feathers by arse excess life some fist and,
Swain a cane said was almost for long with
Interrupting to crowned trim to every young
Enthusiasm of rio of ten a thing but too.
Striver disadvantage they of reason her pride
Sharp and the spring of day and told my limbs and.

Deez nuts of sailors' world either did
Party eclipsed the name burthen to fuck the piece,
Of the treasure of the spring and foison of,
The year of imitated shall keep on his.
Rain of some that eye's thing should went on than
Compiled my praises strong and rest to black sensual,
Comfort those be not is her will tilted every
Time and make the lay 