In [1]:
try:
    import keras
except:
    !pip install keras

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
from pathlib import Path

# Create the TensorFlow session explicitly and register it with the Keras
# backend; the same tf_session object is handed to the Generator at the end
# of the notebook.
import tensorflow as tf
tf_session = tf.Session()
from keras import backend as K
K.set_session(tf_session)

from keras.callbacks import ModelCheckpoint,  CSVLogger
from keras.layers import Add, Dense, Input, LSTM
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

import numpy as np
import pandas as pd
# NOTE(review): sklearn.externals.joblib is deprecated and missing from newer
# scikit-learn releases; a plain `import joblib` is the usual replacement -
# confirm against the installed version before changing.
from sklearn.externals import joblib

# Local library with model definitions for training and generating
from models import Generator, create_training_model

# Load Input

In [4]:
# Settings

# Fraction of the rows to use for training (1 = all of them; it is passed as
# `frac=` when sampling). Lower it if you're running out of memory.
sample_size = 1

# The latent dimension of the LSTM
latent_dim = 2048

# Number of epochs to train for
epochs = 20

# Input locations, relative to the notebook's working directory
root_path = Path('../../..')
input_path = root_path / 'input'
poem_path = input_path / 'poems'
haiku_path = poem_path / 'haikus.csv'

# Everything this run writes goes into output_<name>
name = 'all_data_test_2'
output_dir = Path('output_%s' % name)
# exist_ok=True keeps the cell re-runnable: a plain mkdir() raises
# FileExistsError the second time the notebook is executed.
output_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Fixed seed so the shuffle (and therefore everything derived from the row
# order further down) is the same on every Restart & Run All.
RANDOM_SEED = 42

# Read the haiku table and draw `sample_size` (a fraction) of its rows in
# random order.
df = pd.read_csv(str(haiku_path))
df = df.sample(frac=sample_size, random_state=RANDOM_SEED)

# Preview only; the full frame is far too long to be worth rendering
df.head(10)

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
64191,Archaeologists,Unearth Ivory Trumpet,Dating Back On Rent,twaiku,5,67,5
99662,who wanna give me,some bread so I can go to,Japan for spring break,twaiku,5,7,56
6693,the thinning shade of autumn is,an inherited oriental,red worn to pink nap worn to thread,img2poems,8,9,8
115311,Lovers and friends from,the Pleiades What in the,world is happening,twaiku,5,7,45
82829,She don't understand,that loyalty can lead her,the long and right way,twaiku,5,7,5
58120,Today I am not,horny and other modern,day tales of horror,twaiku,56,7,5
18721,summer's end,we topple a pyramid,of beer cans,sballas,3,7,3
92769,Every time I,watch Vanguard I just wanna,play the real game RIP,twaiku,45,7,5
36198,GOODNIGHT I LOVE U,ALL TIME TO START THE SECOND,HALF OF SENIOR YEAR,twaiku,5,7,5
113621,I hope Taco Bell,doesn't close early tonight,sorry I'm starving,twaiku,5,7,5


# Format Input for Training

In [6]:
# Expand haikus whose syllable counts are ambiguous. A count column can hold
# several comma-separated values (multiple pronunciations are acceptable);
# every alternative gets its own copy of the haiku.

text_columns = ['0', '1', '2']
lines = {0, 1, 2}

for i in range(3):
    split_column = '%s_syllables' % i
    # The haiku text plus the two count columns that are NOT being split now
    kept_columns = text_columns + ['%s_syllables' % j for j in sorted(lines - {i})]

    # One entry per alternative count, still labelled with the haiku's row index
    alternatives = df[split_column].str.split(',', expand=True)
    alternatives = alternatives.stack(-1).reset_index(level=1, drop=True)
    alternatives = alternatives.rename(split_column)

    # The index-based join repeats each haiku once per alternative; rows that
    # come out identical are collapsed again.
    df = df[kept_columns].join(alternatives).drop_duplicates()

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,2,5,5
1,spring rain -,as the doctor speaks,i think of lilacs,3,5,5
2,spring moonset --,a rice ball for,breakfast,3,4,2
2,spring moonset --,a rice ball for,breakfast,4,4,2
3,sunny afternoon,an old man lingers,near the mailbox,5,5,4
4,cinco de mayo,horses roll,in the shallows,5,3,4
5,quitting time,the smell of rain,in the lobby,3,4,4
6,waves,slowly cresting towards shore,a faint moon,1,6,3
6,waves,slowly cresting towards shore,a faint moon,1,7,3


In [7]:
# Drop samples with any line longer than the 99th percentile of line length
# (the largest of the three per-line percentiles is used as the cut-off)

line_lengths = [df[column].str.len() for column in ['0', '1', '2']]
max_line_length = int(max(lengths.quantile(.99) for lengths in line_lengths))

# A haiku is kept only if all three of its lines fit
fits = line_lengths[0] <= max_line_length
for lengths in line_lengths[1:]:
    fits = fits & (lengths <= max_line_length)

df = df[fits].copy()
df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,2,5,5
1,spring rain -,as the doctor speaks,i think of lilacs,3,5,5
2,spring moonset --,a rice ball for,breakfast,3,4,2
2,spring moonset --,a rice ball for,breakfast,4,4,2
3,sunny afternoon,an old man lingers,near the mailbox,5,5,4
4,cinco de mayo,horses roll,in the shallows,5,3,4
5,quitting time,the smell of rain,in the lobby,3,4,4
6,waves,slowly cresting towards shore,a faint moon,1,6,3
6,waves,slowly cresting towards shore,a faint moon,1,7,3


In [8]:
# Build the padded input/output strings for each line, using new lines as padding.
# Two extra characters of room are needed on top of the longest line.
padded_length = max_line_length + 2

for i in range(3):
    line = df[str(i)]

    # Input: the line with its first character duplicated in front. This makes
    # the input the output sequence shifted right by one step, so at every
    # position the input holds the character preceding the one in the output.
    df['%s_in' % i] = (line.str[0] + line).str.pad(padded_length, 'right', '\n')

    if i < 2:
        # First or second line: append a line break and the first character of
        # the next line. This helps with training so that the next RNN has a
        # better chance of getting the first character right.
        target = line + '\n' + df[str(i + 1)].str[0]
    else:
        # Last line: nothing follows it, so only the padding is added
        target = line
    df['%s_out' % i] = target.str.pad(padded_length, 'right', '\n')

# From here on max_line_length means the padded sequence length
max_line_length = padded_length

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables,0_in,0_out,1_in,1_out,2_in,2_out
0,Memorial Day --,a shadow for each,white cross,5,5,2,MMemorial Day --\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Memorial Day --\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa shadow for each\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a shadow for each\nw\n\n\n\n\n\n\n\n\n\n\n\n\n...,wwhite cross\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,white cross\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
1,spring rain -,as the doctor speaks,i think of lilacs,2,5,5,sspring rain -\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring rain -\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aas the doctor speaks\n\n\n\n\n\n\n\n\n\n\n\n\...,as the doctor speaks\ni\n\n\n\n\n\n\n\n\n\n\n\...,ii think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,i think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
1,spring rain -,as the doctor speaks,i think of lilacs,3,5,5,sspring rain -\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring rain -\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aas the doctor speaks\n\n\n\n\n\n\n\n\n\n\n\n\...,as the doctor speaks\ni\n\n\n\n\n\n\n\n\n\n\n\...,ii think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,i think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
2,spring moonset --,a rice ball for,breakfast,3,4,2,sspring moonset --\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring moonset --\na\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa rice ball for\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a rice ball for\nb\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bbreakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,breakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
2,spring moonset --,a rice ball for,breakfast,4,4,2,sspring moonset --\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring moonset --\na\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa rice ball for\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a rice ball for\nb\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bbreakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,breakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
3,sunny afternoon,an old man lingers,near the mailbox,5,5,4,ssunny afternoon\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,sunny afternoon\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aan old man lingers\n\n\n\n\n\n\n\n\n\n\n\n\n\...,an old man lingers\nn\n\n\n\n\n\n\n\n\n\n\n\n\...,nnear the mailbox\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,near the mailbox\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
4,cinco de mayo,horses roll,in the shallows,5,3,4,ccinco de mayo\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,cinco de mayo\nh\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,hhorses roll\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,horses roll\ni\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,iin the shallows\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,in the shallows\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
5,quitting time,the smell of rain,in the lobby,3,4,4,qquitting time\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,quitting time\nt\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,tthe smell of rain\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,the smell of rain\ni\n\n\n\n\n\n\n\n\n\n\n\n\n...,iin the lobby\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,in the lobby\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
6,waves,slowly cresting towards shore,a faint moon,1,6,3,wwaves\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,waves\ns\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,sslowly cresting towards shore\n\n\n\n\n\n\n\n...,slowly cresting towards shore\na\n\n\n\n\n\n\n...,aa faint moon\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,a faint moon\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
6,waves,slowly cresting towards shore,a faint moon,1,7,3,wwaves\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,waves\ns\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,sslowly cresting towards shore\n\n\n\n\n\n\n\n...,slowly cresting towards shore\na\n\n\n\n\n\n\n...,aa faint moon\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,a faint moon\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...


In [None]:
# Padded input strings, one column per haiku line
inputs = df[['0_in', '1_in', '2_in']].values

# Character-level tokenizer with no filtering, fitted on the padded inputs
tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(inputs.flatten())
# One more class than there are distinct characters
# (presumably because the tokenizer's indices start at 1 - TODO confirm)
n_tokens = len(tokenizer.word_counts) + 1

# X is the input for each line in sequences of one-hot-encoded values;
# X[i] belongs to line i
X = np_utils.to_categorical([
    tokenizer.texts_to_sequences(inputs[:,i]) for i in range(3)
], num_classes=n_tokens)

# Padded target strings, one column per haiku line
outputs = df[['0_out', '1_out', '2_out']].values

# Y is the output for each line in sequences of one-hot-encoded values
Y = np_utils.to_categorical([
    tokenizer.texts_to_sequences(outputs[:,i]) for i in range(3)
], num_classes=n_tokens)

# X_syllables is the count of syllables for each line.
# The counts are still strings at this point (they come out of the
# `.str.split(',')` expansion earlier in the notebook), so convert them to
# numbers explicitly instead of relying on implicit coercion of an
# object array when the model is fed.
X_syllables = df[['0_syllables', '1_syllables', '2_syllables']].values.astype('float32')

# Persist what is needed to rebuild the generator without re-running this notebook
joblib.dump([latent_dim, n_tokens, max_line_length, tokenizer], str(output_dir / 'metadata.pkl'))

# Training Model

# Build the training model from the local `models` library; lstm and lines
# are kept because the Generator is constructed from them afterwards.
training_model, lstm, lines, inputs, outputs = create_training_model(latent_dim, n_tokens)

# Checkpoint file name: latent size first, the {…} fields are left for Keras to fill in
checkpoint_name = "%s-{epoch:02d}-{loss:.2f}-{val_loss:.2f}.hdf5" % latent_dim
filepath = str(output_dir / checkpoint_name)
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Per-epoch metrics are appended to a CSV next to the checkpoints
log_path = str(output_dir / 'training_log.csv')
csv_logger = CSVLogger(log_path, append=True, separator=',')

callbacks_list = [checkpoint, csv_logger]

# The model takes, for each of the three lines in turn, the one-hot
# characters followed by that line's syllable count.
features = []
for i in range(3):
    features.append(X[i])
    features.append(X_syllables[:, i])
targets = [Y[i] for i in range(3)]

training_model.fit(
    features,
    targets,
    batch_size=64,
    epochs=epochs,
    validation_split=.1,
    callbacks=callbacks_list,
)

# Test Model

# Build the project-local Generator from the pieces returned by
# create_training_model (lstm, lines), the shared TF session, and the
# tokenizer / vocabulary size / padded line length computed above.
generator = Generator(lstm, lines, tf_session, tokenizer, n_tokens, max_line_length)

# Last expression of the cell, so the notebook displays whatever generate_haiku() returns
generator.generate_haiku()