# Syllable-level LSTM model

## Design notes:

The class LSTM wraps a TF model (including hyperparameters, Variables and the computation graph).

Non-TF computation (except feeding inputs) happens outside the class.

Class methods preceeded by underscore (e.g. _init_params, _lstm_step) contain TF functions and are used to build the computation graphs for training and sampling. Placeholders are defined in `_build_graph`. These 'private' methods should be called within LSTM.

Methods without underscore (`run_train`, `run_sample`) run a TF session and feed placeholder values but otherwise contain no TF functions. These 'public' methods should be called outside LSTM.

In [1]:
import numpy as np
import tensorflow as tf

In [33]:
class LSTM(object):

    def __init__(self, batch_size, embedding_size, hidden_size, vocab_size, seq_length,
                 learning_rate, decay_steps, decay_factor, sample_len, GPU=False):
        ''' Set the hyperparameters and define the computation graph.
        '''

        ''' hyperparameters '''

        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size # number of syllabs in vocab
        self.seq_length = seq_length # number of steps to unroll the LSTM for
        self.initial_learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.decay_factor = decay_factor
        self.sample_len = sample_len

        # this var keeps track of the train steps within the LSTM
        self.global_step = tf.Variable(0, trainable=False)

        ''' create vars and graph '''
           
        if GPU:
            with tf.device("/gpu:0"):
                self._init_params()
                self._build_graph()
        else:
            with tf.device("/cpu:0"):
                self._init_params()
                self._build_graph()

    def _init_params(self):
        '''Create the model parameters'''
        
        # Learn an embedding for each syllable jointly with the other model params
        self.embedding = tf.Variable(tf.random_normal([self.vocab_size, self.embedding_size],
                                                      mean=0, stddev=0.2))
        self.Uf = tf.Variable(tf.random_normal([self.hidden_size, self.hidden_size],
                                       mean=0, stddev=0.2))
        self.Ui = tf.Variable(tf.random_normal([self.hidden_size, self.hidden_size],
                                       mean=0, stddev=0.2))
        self.Uo = tf.Variable(tf.random_normal([self.hidden_size, self.hidden_size],
                                       mean=0, stddev=0.2))
        self.Uc = tf.Variable(tf.random_normal([self.hidden_size, self.hidden_size],
                                               mean=0, stddev=0.2))
        self.Wf = tf.Variable(tf.random_normal([self.embedding_size, self.hidden_size],
                                               mean=0, stddev=0.2))
        self.Wi = tf.Variable(tf.random_normal([self.embedding_size, self.hidden_size],
                                               mean=0, stddev=0.2))
        self.Wo = tf.Variable(tf.random_normal([self.embedding_size, self.hidden_size],
                                               mean=0, stddev=0.2))
        self.Wc = tf.Variable(tf.random_normal([self.embedding_size, self.hidden_size],
                                               mean=0, stddev=0.2))
        self.V = tf.Variable(tf.random_normal([self.hidden_size, self.vocab_size],
                                               mean=0, stddev=0.2))
        
        self.bf = tf.Variable(tf.zeros([1, self.hidden_size]))
        self.bi = tf.Variable(tf.zeros([1, self.hidden_size]))
        self.bo = tf.Variable(tf.zeros([1, self.hidden_size]))
        self.bc = tf.Variable(tf.zeros([1, self.hidden_size]))
        self.by = tf.Variable(tf.zeros([1, self.vocab_size]))

    def _lstm_step(self, x, h, c):
        '''Performs LSTM computation for one timestep:
        takes a previous x and h, and computes the next x and h.
        '''
        
        f = tf.nn.sigmoid(tf.matmul(x, self.Wf) + tf.matmul(h, self.Uf) + self.bf)
        i = tf.nn.sigmoid(tf.matmul(x, self.Wi) + tf.matmul(h, self.Ui) + self.bi)
        o = tf.nn.sigmoid(tf.matmul(x, self.Wo) + tf.matmul(h, self.Uo) + self.bo)
        uc = tf.nn.tanh(tf.matmul(x, self.Wc) + tf.matmul(h, self.Uc) + self.bc)
        c = tf.multiply(f, c)+tf.multiply(i, uc)
        h = tf.multiply(o, tf.nn.tanh(c))
        y = tf.matmul(h, self.V)+self.by

        return y, h, c

    
    def _forward(self, inputs):
        '''Performs the forward pass for all timesteps in a sequence.
        '''
        # Create list to hold y
        y = [_ for _ in range(self.seq_length)]

        # Create zero-d initial hidden state
        h = tf.zeros([self.batch_size, self.hidden_size])
        c = tf.zeros([self.batch_size, self.hidden_size])
        
        
        for t in range(self.seq_length):
            x = tf.nn.embedding_lookup(self.embedding, inputs[:, t])
            y[t], h, c = self._lstm_step(x, h, c)

        return y

    
    def _sample_one(self, input_syllab, input_hidden, cell_state, temperature):
        '''Sample the single next syllable in a sequence.'''

        # We expand dims because tf expects a batch
        syllab = tf.expand_dims(input_syllab, 0)

        # Get the embedding for the input syllable
        x = tf.nn.embedding_lookup(self.embedding, syllab)

        # Take a single lstm step
        y, h, c = self._lstm_step(x, input_hidden, cell_state)

        # Dividing the unnormalized probabilities by the temperature before 
        # tf.multinomial is equivalent to adding temperature to a softmax
        # before sampling
        y_temperature = y / temperature

        # We use tf.squeeze to remove the unnecessary [batch, num_samples] dims
        # We do not manually softmax - tf.multinomial softmaxes the tensor we pass it
        next_sample = tf.squeeze(tf.multinomial(y_temperature, 1))

        return next_sample, h, c, y


    def _build_graph(self):
        '''Build the computation graphs for training and sampling.'''


        '''Sampling and test graph'''
        self.sample_input_syllab = tf.placeholder(dtype=tf.int32, shape=[])
        self.sample_cell_state = tf.placeholder(dtype=tf.float32, shape=[1, self.hidden_size])
        self.sample_input_hidden = tf.placeholder(dtype=tf.float32, shape=[1, self.hidden_size])
        
        self.test_syllab = tf.placeholder(dtype=tf.int32, shape=[])
        
        self.temperature = tf.placeholder_with_default(1.0, [])

        self.next_sample, self.next_hidden, self.next_cell, self.next_predictions = self._sample_one(
            self.sample_input_syllab, self.sample_input_hidden, self.sample_cell_state, self.temperature)
        
        self.next_softmax_predictions = tf.nn.softmax(self.next_predictions)
                
        self.test_syllab_prob = tf.reduce_sum(self.next_softmax_predictions * tf.one_hot(
            tf.expand_dims(self.test_syllab, axis=0), depth=self.vocab_size))
        
        # Get cross entropy in base 2
        # log_2 (x) =  log_e (x) / log_e(2)
        self.binary_xentropy = - tf.log(self.test_syllab_prob) / tf.log(2.0)


        '''Training graph'''
        self.inputs = tf.placeholder(dtype=tf.int32, shape=[None, self.seq_length])
        self.targets = tf.placeholder(dtype=tf.int32, shape=[None, self.seq_length])
        self.predictions = self._forward(self.inputs)

        cost_per_timestep_per_example = [
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.predictions[t],
                    labels=self.targets[:, t])
                for t in range(self.seq_length)
        ]

        # Use reduce_mean rather than reduce_sum over the examples in batch so that
        # we don't need to change the learning rate when we change the batch size.
        cost_per_timestep = [tf.reduce_mean(cost) for cost in cost_per_timestep_per_example]

        # Use reduce_mean here too so we don't need to change the learning rate when
        # we change number of timesteps.
        self.cost = tf.reduce_mean(cost_per_timestep)

        # Decay the learning rate according to a schedule.
        self.learning_rate = tf.train.exponential_decay(self.initial_learning_rate,
                                                        self.global_step,
                                                        self.decay_steps,
                                                        self.decay_factor)
        
        self.train_step = tf.train.RMSPropOptimizer(self.learning_rate).minimize(
            self.cost, global_step=self.global_step)


        '''Finished creating graph: start session and init vars'''
        config = tf.ConfigProto(allow_soft_placement = True)
        self.sess = tf.Session(config = config)
        self.sess.run(tf.global_variables_initializer())


    def run_train(self, input_syllabs, target_syllabs):
        '''Call this from outside the class to run a train step'''
        cost, lr, _ = self.sess.run([self.cost, self.learning_rate, self.train_step],
                                   feed_dict={
                                       self.inputs: input_syllabs,
                                       self.targets: target_syllabs
                                   })
        return cost, lr

In [41]:
def run_sample(model, n, starter_syllab, primer_seq=None, temperature=1.0):
    '''Samples a length-n sequence from the model'''

    sampled_syllabs = [_ for _ in range(n)]
    current_syllab = starter_syllab
    h = np.zeros([1, model.hidden_size])
    cs = np.zeros([1, model.hidden_size])

    if primer_seq is not None:
        for c in primer_seq:
            h, cs = model.sess.run(
                [model.next_hidden, model.next_cell],
                feed_dict={
                    model.sample_input_syllab: c,
                    model.sample_input_hidden: h,
                    model.sample_cell_state: cs
                })

    for i in range(n):

        current_syllab, h, cs = model.sess.run(
            [model.next_sample, model.next_hidden, model.next_cell],
            feed_dict={
                model.sample_input_syllab: current_syllab,
                model.sample_input_hidden: h,
                model.sample_cell_state: cs,
                model.temperature: temperature})

        sampled_syllabs[i] = current_syllab

    return sampled_syllabs

def run_test(model, test_syllabs, primer_seq=None):
    '''Finds the cross entropy on a dataset.
    test_syllabs and primer_seq should be lists of ints.'''

    xentropy_accum = 0.0
    h = np.zeros([1, model.hidden_size])
    cs = np.zeros([1, model.hidden_size])

    if primer_seq is not None:
        for c in primer_seq:
            h, cs = model.sess.run(
                [model.next_hidden, model.next_cell],
                feed_dict={
                    model.sample_input_syllab: c,
                    model.sample_input_hidden: h,
                    model.sample_cell_state: cs
                })

    for i in range(len(test_syllabs) - 1):
        xentropy, h, cs  = model.sess.run(
            [model.binary_xentropy, model.next_hidden, model.next_cell],
            feed_dict={
                model.sample_input_syllab: test_syllabs[i],
                model.sample_input_hidden: h,
                model.sample_cell_state: cs,
                model.test_syllab: test_syllabs[i+1]
            })

        xentropy_accum += (xentropy / len(test_syllabs))

    xentropy_avg = xentropy_accum 

    return xentropy_avg 

In [80]:
'''Train and sample from our model'''

# data I/O
corpus = open('../data/output/dr_seuss_phones.txt', 'r').read().split(" ") # should be simple plain text file
data = corpus#[:int(len(corpus)*0.9)]
syllabs = list(set(data))
data_size, vocab_size = len(data), len(syllabs)
print 'data has %d syllables, %d unique.' % (data_size, vocab_size)
syllab_to_ix = { s:i for i,s in enumerate(syllabs) }
ix_to_syllab = { i:s for i,s in enumerate(syllabs) }


# hyperparameters
embedding_size = 32 # size of embedding
hidden_size = 256 # size of hidden layers of neurons
seq_length = 50 # number of steps to unroll the LSTM for
learning_rate = 1e-2
decay_steps = 500
decay_factor = 0.9
sample_len = 500

batch_size = 128

n_train_steps = 1

# model parameters
lstm = LSTM(batch_size, embedding_size, hidden_size, vocab_size, 
          seq_length, learning_rate, decay_steps, decay_factor, 
          sample_len, GPU=True)

smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0

for n in range(n_train_steps):
    
    # prepare inputs 
    inputs = np.empty([batch_size, seq_length])
    targets = np.empty([batch_size, seq_length])
    
    for i in range(batch_size):
        # randomly index into the data for each example in batch
        random_index = int(np.random.rand() * (data_size - seq_length - 1))
        inputs[i, :] = [syllab_to_ix[ch] for ch in data[random_index:random_index+seq_length]]
        targets[i, :] = [syllab_to_ix[ch] for ch in data[random_index+1:random_index+seq_length+1]]
        
    loss, lr = lstm.run_train(inputs, targets)
    
    # print progress
    if n % 100 == 0: 
        print 'iter %d, loss: %f, learning rate: %f' % (n, loss, lr) 

    # sample from the model now and then
    if n % 1000 == 0:
        sample_ix = lstm.run_sample(sample_len, inputs[0, 0], 1.0)
        txt = ' '.join(ix_to_syllab[ix] for ix in sample_ix)
        print '----\n %s \n----' % (txt, )

data has 28842 syllables, 41 unique.
iter 0, loss: 3.900543, learning rate: 0.010000
----
 ey g ae l f | jh z t ch ng t uh | | ay dh th w r ow n dh f p d | ow f  
 aa k aa aa w t aw ey z l b 
 t er ih r aw r eh ay eh jh ae z ow dh th uh g g ah uh g uh n aa ey g ey z ch ah ih l ay t z g r  b 
 t g b er y ah n jh aw uh aa g ow g p sh aw g ey z oy k eh d eh k ih z ow uh  | ih ih ow ih oy er uh th dh sh dh ah jh ah jh  uh | ng ch s l | ey sh iy ae t dh k g k ao t m f er l aw t dh f ow b ae 
 k t 
 ng hh t ah f ng jh jh iy iy er ih k hh b ch ng oy jh 
 hh dh r ae | ae z hh z n jh ae sh iy z ng ao iy iy aw er ow aw ch ih p ch ey ng n eh | ch ng ao ao n m ay eh ng sh k  p l eh f b | dh ng er ch ch ch ng ch ng jh dh ch uw dh uw k th m jh th p  th jh |  m ch l aw y uh | ih  jh g ch uh m ah r oy r ow ow jh dh b aw b ey ah aw aw eh iy  w jh m er aa ey d uw iy |  | z ow ao eh | eh jh th ey hh eh l b oy ng 
 oy l s th er ey er iy er oy th f aw th r sh w ay ao ey z jh z sh ah jh ah ae s dh er uh t z

In [73]:
# cross-entropy on test set
test = corpus[int(len(corpus)*0.9):]
primer = data[-1000:]
lstm.run_test([syllab_to_ix[ch] for ch in test], [syllab_to_ix[ch] for ch in primer])

4.2969801608087677

In [89]:
primer = open('../data/output/article3_phones.txt', 'r').read().split(" ")
sample_ix = run_sample(eminem, 10*sample_len, inputs[0, 0], [syllab_to_ix[ch] for ch in primer], 1)
txt = ' '.join(ix_to_syllab[ix] for ix in sample_ix)
print '%s' % (txt, )

KeyError: 'zh'

## Sampling with high temperature:

In [60]:
sample_ix = lstm.run_sample(sample_len, inputs[0, 0], 100)
txt = ''.join(ix_to_syllab[ix] for ix in sample_ix)
print '%s' % (txt, )

zseyuwngshayeyahzhaeahaodhkbaafehmerhhiywsaheherehoyhh|hhaeae|iyoytawngkersthaomfihiychaezhhhae
puherjhzhbeyehsgiyerereruhaomwhhoylwjheychyayddhaollynaapaozviheyeroyb
ptheriyfdheyerbgfg
biyhhuhaoehlowgayeyykbyayngzhphhaolwthhhzhuwndhoyshow
vmuwnawzhwznihahowerzhfhh|roykuhsmytkhhuwrnaaehwnszhtfawpngshthshzhslshowfuhshrgawng
nngfaydhzhdlgehwiyersheyjhuwzhah|shayuwowchkthmah
oythl|vwaeeytdhawuwngaw
hhbvffihiydherahzhiyihchthahvhhaewbowldhoyoyaobchteyzhuwlzeyihzaodhaemsfkahfchnerpjhayahayuhehhhowih
sahehowshrdheyeraardch
eyawchaojhthlowlnghhuwchihereymayaeaobgzihz
zh||ihgbvnglehihlshehaeziyerfuh
ayuhowzkng|chl
faozdhhnglpaengwhherrkuwaommwzheythruhuhaoihhhfzheyhhaapngrnpeydaafluhhhzhhhsshtuhaetehshtd|pgkwwuhpmert
mpjhvehverslzhaaweyoyshahfmdfjh|
bchvdhaymchoy|sht|ngdherzs


## Sampling with low temperature:

In [59]:
sample_ix = lstm.run_sample(sample_len, inputs[0, 0], 0.001)
txt = ' '.join(ix_to_syllab[ix] for ix in sample_ix)
print '%s' % to_wordst(txt)


he's inherited anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything anything any 


a soldier i'm living in a vibe let's go back 
throw your back then crazy 
ok when i'm one around about my 
switch it again and saran get to much to lose your man 
you don't know the crazy first 

as a people that i got your name 
i came and word though 
but cops there stopping in the worn of my skin is your forever dead dreams 
and remember me 

my darling your going from bagdad 
this time i'm bad 
seize all about to kill everywhere every sense 
and i won't be ran 
and care a 
my mother says 

i imagine death's door when we were needy 
we made a promise we signed a treaty 
i may have a friend 
chance to survive this 
stay alive high 
running out of time 

day you fight like you're 
running out of time 
day you fight like you're 
out your gun 
they surround our troops 
when they surround our troops 
they surround our in tolle than in new york you can 

even before we got to take a stand with the stamina god has gone gray he passes every day 
they say he walks the length of the plan was to write a total of twenty five essays
you and your words flooded my senses 
your sentences left me defenseless 

he's never be president now 
that's one less thing to worry about 
where is it uptown 
and he just destroyed president 
weapon 

you'll be back time will tell 
you'll remember that i served you well 
boy your debts someone load his reputation 
welcome the cabinet i am in this show 