In [162]:
# import statements
import keras 
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import regularizers
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization
import pandas
import math

In [110]:
def isIntegerString(s):
    try:
        i = int(s)
    except ValueError:
        return False
    return True

def flattenListString(lst):
    string = ""
    for line in lst:
        string += line
    return string

In [111]:
import numpy as np

''' Returns data = [[line1], [line2],...]'''
def byLine():
    data = []
    stripped_data = []
    punctuation = [",", ":", ";", ".", "?", "!", "'"]

    data = np.loadtxt("data/shakespeare.txt", delimiter='\n', dtype='bytes').astype(str)

    # Strip all whitespace (including whitespace on last line of poems)
    for i in range(len(data)):
        line = data[i].strip()
        for punc in punctuation: # Delete punctuation
            line = line.replace(punc, "")

        if not isIntegerString(line):
            stripped_data.append(line)

    return stripped_data

''' Returns data = [[poem1], [poem2],...], where each poem is [[line1], [line2]...] if not flattened.
    If flattened, each poem is [line1line2....]. '''
def byPoem(flatten=False):
    data = []
    punctuation = [",", ":", ";", ".", "?", "!", "'"]
    with open('data/shakespeare.txt') as f:
        poem = []
        for line in f:
            for punc in punctuation: # Delete punctuation
                line = line.replace(punc, " ")

            if not isIntegerString(line):
                poem.append(line)
            elif len(poem) == 0: continue
            else:
                poem = poem[:-2] # Get rid of last two newlines
                poem[len(poem) - 1] = poem[len(poem) - 1].strip() # Get rid of newline at end of poem
                if flatten:
                    poem = flattenListString(poem)
                data.append(poem[:])
                poem = []
    return data

''' Returns data = [[stanza1], [stanza2]...] '''
def byStanza():
    data = byPoem()
    stanzaData = []
    for poem in data:
        stanzaData.append(poem[:4])
        stanzaData.append(poem[4:8])
        stanzaData.append(poem[8:12])
        stanzaData.append(poem[12:])
    return stanzaData

''' Returns training data split by sequences of length seqLength'''
def getTrainingData(seqLength):
    data = byPoem()
    segments = []
    trainingXchars = []
    trainingYchars = []
    trainingXnums = []
    trainingYnums = []

    # Get first seqLength characters, and the next character, of each line
    for poem in data:
        for line in poem:
            if len(line) > seqLength:
                segments.append([line[:seqLength], line[seqLength]])

    # Put the first seqLength characters in trainingXchars and the nex character in traininYchars
    for s in range(len(segments)):
        trainingXchars.append(segments[s][0])
        trainingYchars.append(segments[s][1])

    # Convert the characters in X and Y into numbers
    characterMap = 'abcdefghijklmnopqrstuvwxyz-() \n'

    for i in range(len(trainingXchars)):
        trainingXnums.append([characterMap.index(j) for j in trainingXchars[i].lower()])
    for i in range(len(trainingYchars)):
        trainingYnums.append([characterMap.index(j) for j in trainingYchars[i].lower()])

    trainingYnums = [y for x in trainingYnums for y in x] # Flatten list

    return (trainingXnums, trainingYnums)

In [157]:
trainingX, trainingY = getTrainingData(40)

# one-hot encode the labels
x_train = keras.utils.np_utils.to_categorical(trainingX)
y_train = keras.utils.np_utils.to_categorical(trainingY)

# normalize?
x_train = np.array([x_train])
print(x_train.shape)
print(x_train[0].shape)
x = np.reshape(x_train, (1632, 40, 30))
print(x)

(1, 1632, 40, 30)
(1632, 40, 30)
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]]

 ...

 [[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0.

In [168]:
# take sequences of 40 consecutive characters
# compare softmax output to one-hot-encoded vector representing the next character
 
#For generation, I'm feeding in the seed sequence (the first line of a sonnet), and 
#then generating a softmax distribution that I sample from to get the next character. 
#I append that character to get the new sequence after taking off the first character.

model = Sequential()
# 150 LSTM units
model.add(LSTM(150, input_shape = (40, 30), return_sequences = True))

# Flatten the input space to map from the last convolutional layer to categories
model.add(Flatten())

# add a 31 unit dense layer with softmax activation
model.add(Dense(31))
model.add(Activation('softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'rmsprop', metrics = ['accuracy'])

model.fit(x, y_train, batch_size = 10, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x134dcde48>

In [None]:
# take in the first line from an actual sonnet ('seed')
# take the last 40 character from that line
# use that as your testx
# input that into your model to predct the next character
# recursively call

Hello my name is [bob]
s
Hello my name is b[obs]

hello my name is bo[bs ]
\n
hello my name is bob[s \n]