In [None]:
#import modules 
import numpy as np  
import tensorflow as tf 

In [None]:
import utils 

# Load the raw script, then skip its first 81 items.
# NOTE(review): the [81:] offset presumably drops a header at the top of the file — confirm against the data.
raw_text = utils.load_data('data/simpsons/moes_tavern_lines.txt')
text = raw_text[81:]
len(text)

In [None]:
# Tokenize the text. Later cells use int_words as the sequence of word ids,
# int2vocab to look up a word from its id and vocab2int to look up an id from its word.
preprocessed = utils.preprocess_data(text)
int_words, int2vocab, vocab2int = preprocessed

In [None]:
def get_batches(int_words, batch_size, seq_length):
    '''
    Split a sequence of word ids into (input, target) batches for training.

    <argument>
    int_words  : 1-D sequence (list or array) of tokenized word ids
    batch_size : number of sequences per batch
    seq_length : number of time steps per sequence
    </argument>

    Words that do not fill a complete batch are dropped from the end. The targets are the
    inputs shifted by one word; the target of the very last kept word wraps around to the
    first word.

    :return: array of shape (n_batches, 2, batch_size, seq_length); index 0 of the second axis
             holds the inputs and index 1 the targets
    :raises ValueError: if int_words holds fewer than batch_size*seq_length words
    '''
    word_per_batch = batch_size * seq_length
    n_batches = len(int_words) // word_per_batch
    if n_batches == 0:
        # Fail with an explicit message instead of an opaque reshape error further down.
        raise ValueError(
            "Not enough words for a single batch: need at least {} "
            "(batch_size*seq_length), got {}".format(word_per_batch, len(int_words)))

    # Keep only the words that fill complete batches.
    xdata = np.asarray(int_words[:word_per_batch * n_batches]).reshape(batch_size, -1)
    # np.roll without an axis shifts the flattened data and restores the shape, so the target
    # of the last word in a row is the first word of the next row.
    ydata = np.roll(xdata, -1)

    # Each row is cut into n_batches consecutive windows, so row i of batch k+1 continues
    # row i of batch k.
    x_batches = np.split(xdata, n_batches, axis=1)
    y_batches = np.split(ydata, n_batches, axis=1)
    return np.array(list(zip(x_batches, y_batches)))

# Debugging sanity check: 20 consecutive ids with batch_size=5 and seq_length=4
get_batches(int_words=np.arange(20), batch_size=5, seq_length=4)

In [None]:
def generate_inputs(batch_size, seq_length):
    """
    Create the TF placeholders that feed the network.

    <argument>
    batch_size : first dimension of both placeholders
    seq_length : second dimension of both placeholders
    </argument>

    :return: Tuple (inputs, targets) of int32 placeholders named 'inputs' and 'targets'
    """
    placeholder_shape = [batch_size, seq_length]
    return (tf.placeholder(tf.int32, shape=placeholder_shape, name='inputs'),
            tf.placeholder(tf.int32, shape=placeholder_shape, name='targets'))


In [None]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Embed integer word ids with tf.contrib.layers.embed_sequence.

    <argument>
    input_data : tensor of word ids
    vocab_size : number of distinct word ids
    embed_dim  : size of each embedding vector
    </argument>

    :return: the embedded sequence tensor
    """
    embedded = tf.contrib.layers.embed_sequence(
        input_data, vocab_size=vocab_size, embed_dim=embed_dim)
    return embedded

In [None]:
def build_rnn(batch_size, lstm_size, dropout):
    """
    Stack one BasicLSTMCell per entry of lstm_size and create the matching zero state.

    <argument>
    batch_size : batch size of the zero initial state
    lstm_size  : iterable of layer sizes; one LSTM layer is created per entry, with that many units
    dropout    : accepted for interface compatibility; no dropout wrapper is applied here
    </argument>

    :return: tuple (cells, initial_state)
    """
    layers = []
    for n_units in lstm_size:
        layers.append(tf.contrib.rnn.BasicLSTMCell(n_units))

    stacked_cells = tf.contrib.rnn.MultiRNNCell(layers)
    zero_state = stacked_cells.zero_state(batch_size, tf.float32)
    return stacked_cells, zero_state
    

In [None]:
from tensorflow.contrib import seq2seq
def build_loss(logits, targets, input_data_shape):
    """
    Sequence loss between logits and targets with every position weighted equally.

    <argument>
    logits           : logits tensor passed to seq2seq.sequence_loss
    targets          : target word ids passed to seq2seq.sequence_loss
    input_data_shape : shape of the input batch; its first two entries size the weight matrix
    </argument>

    :return: the loss tensor
    """
    n_rows, n_steps = input_data_shape[0], input_data_shape[1]
    uniform_weights = tf.ones([n_rows, n_steps])
    return seq2seq.sequence_loss(logits, targets, uniform_weights)

In [None]:
def build_optimizer(loss, lr, grad_clip):
    """
    Create an Adam training op whose gradients are clipped by value.

    <argument>
    loss      : tensor to minimize
    lr        : learning rate for tf.train.AdamOptimizer
    grad_clip : each gradient is clipped element-wise to [-grad_clip, grad_clip]
    </argument>

    :return: the op that applies the clipped gradients
    """
    adam = tf.train.AdamOptimizer(lr)
    clipped_pairs = []
    for grad, var in adam.compute_gradients(loss):
        # Variables that received no gradient are skipped.
        if grad is None:
            continue
        clipped_pairs.append((tf.clip_by_value(grad, -grad_clip, grad_clip), var))
    return adam.apply_gradients(clipped_pairs)

In [None]:
class RNN:
    """
    Word-level LSTM language model built on the default TensorFlow graph.

    Constructing an instance resets the default graph and then wires together the helper
    functions of this notebook: placeholders -> embedding -> stacked LSTM -> fully connected
    logits -> softmax probabilities, sequence loss and Adam train op.

    <argument>
    vocab_size : number of distinct words; sizes the embedding and the output layer
    lr         : learning rate handed to the optimizer
    batch_size : number of sequences per batch (forced to 1 when sampling)
    seq_length : number of time steps per sequence (forced to 1 when sampling)
    embed_dim  : size of the word embedding
    lstm_size  : sequence of layer sizes, one LSTM layer per entry
    dropout    : forwarded to build_rnn
    grad_clip  : gradient clipping threshold forwarded to build_optimizer
    sampling   : if truthy, build the graph for one word at a time
    </argument>

    Attributes created: inputs, targets, input_data_shape, embed, cell, initial_state,
    outputs, final_state, logits, probs, loss, train_op.
    """
    # lstm_size defaults to a tuple rather than a list so the default cannot be mutated
    # and shared between instances.
    def __init__(self, vocab_size, lr=1e-2, batch_size=128, seq_length=20,
                 embed_dim=500, lstm_size=(620, 620), dropout=0.2, grad_clip=1.0, sampling=False):
        if sampling:
            # Generation feeds a single word at a time.
            batch_size, seq_length = 1, 1

        # Start from an empty graph so that building a second model does not clash with
        # the variables of a previous one.
        tf.reset_default_graph()
        self.inputs, self.targets = generate_inputs(batch_size, seq_length)
        self.input_data_shape = tf.shape(self.inputs)
        self.embed = get_embed(self.inputs, vocab_size, embed_dim)
        self.cell, self.initial_state = build_rnn(batch_size, lstm_size=lstm_size, dropout=dropout)
        self.outputs, self.final_state = tf.nn.dynamic_rnn(self.cell, self.embed,
                                                           initial_state=self.initial_state, dtype=tf.float32)
        # Linear projection of every LSTM output onto the vocabulary.
        self.logits = tf.contrib.layers.fully_connected(self.outputs, vocab_size, activation_fn=None)
        self.probs = tf.nn.softmax(self.logits, name='probs')
        self.loss = build_loss(self.logits, self.targets, self.input_data_shape)
        self.train_op = build_optimizer(self.loss, lr, grad_clip)

<i><font size=5>Hyperparameters</font></i>

<ul>
    <li>batch_size - Number of sequences running through the network in one pass.</li>
    <li>sequence_length - Number of words in each sequence the network is trained on. Larger is typically better because the network can learn longer-range dependencies, but training takes longer.</li>
    <li>vocab_size - Total number of words in the vocabulary.</li>
    <li>dropout - The dropout keep probability when training. If your network is overfitting, try decreasing this.</li>
    <li>grad_clip - Gradient clipping threshold; each gradient value is clipped to the range [-grad_clip, grad_clip].</li>
    <li>epoch - Number of full passes over the training data.</li>
    <li>learning_rate - Step size used by the optimizer. The lower the value, the more slowly the model approaches a minimum.</li>
    <li>embedding dimension - Size of word embedding </li>
    <li>lstm_size - List with the number of units in each LSTM layer; the number of layers is len(lstm_size).</li>
</ul>

In [None]:
# Training hyperparameters (described in the list above).
batch_size = 128
sequence_length = 20
vocab_size = len(int2vocab)
dropout = 0.8
grad_clip = 1.
# `epoch` used to be assigned twice in this cell (100, then 120); only the effective value is kept.
epoch = 120
learning_rate = 1e-3
embed_dim = 500
lstm_size = [620, 620]

# Pre-compute all (input, target) batches once; the training loop reuses them every epoch.
batches = get_batches(int_words, batch_size, sequence_length)

In [None]:
# Build the training graph. learning_rate is passed explicitly; without it the optimizer
# silently falls back to RNN's default lr instead of the configured value.
model = RNN(vocab_size=vocab_size, lr=learning_rate, batch_size=batch_size,
            seq_length=sequence_length, embed_dim=embed_dim, grad_clip=grad_clip,
            dropout=dropout, lstm_size=lstm_size)
# Created after the model because RNN() resets the default graph.
saver = tf.train.Saver()
# Checkpoint path handed to saver.save once training has finished.
save_dir = 'checkpoints/checkpoint.ckpt'

In [None]:
# Train for `epoch` passes over `batches`. The LSTM state is reset at the start of every
# epoch and carried from one batch to the next within it; the loss printed per epoch is the
# loss of that epoch's last batch. The checkpoint is written once, after the final epoch.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    fetches = [model.loss, model.final_state, model.train_op]
    for epoch_number in range(1, epoch + 1):
        state = sess.run(model.initial_state)
        for x, y in batches:
            loss, state, _ = sess.run(fetches, {model.inputs: x,
                                                model.targets: y,
                                                model.initial_state: state})
        print("epoch {} : {:.4f}".format(epoch_number, loss))
    saver.save(sess, save_dir)

In [None]:
def pick_top_n(preds, vocab_size, n=5):
    """
    Pick one word id from a predicted probability distribution.

    <argument>
    preds      : predictions with shape of (1,1, vocab_size); left unmodified
    vocab_size : n_classes
    n          : sample among the n most probable words. If n is None or zero, argmax is used
                 instead of sampling from the probability distribution.
    </argument>
    :return: one selected word id
    :raises ValueError: if n is negative
    """
    # Work on a float64 copy: the caller's array must not be zeroed in place, and float32
    # probabilities can fail np.random.choice's "probabilities sum to 1" check after renormalizing.
    p = np.array(np.squeeze(preds), dtype=np.float64)

    if n is None or n == 0:
        return np.argmax(p)
    if n < 0:
        raise ValueError("n must be None or a non-negative integer, got {}".format(n))

    # Zero out everything except the n largest probabilities, then renormalize.
    p[np.argsort(p)[:-n]] = 0
    p = p / np.sum(p)
    return np.random.choice(vocab_size, 1, p=p)[0]

In [None]:
# Hyperparameters repeated for the sampling cells below; sampling() reads most of them as globals.
vocab_size = len(int2vocab)

batch_size, sequence_length = 128, 20
embed_dim, lstm_size = 500, [620, 620]
dropout, grad_clip = 0.8, 1.
learning_rate = 1e-3
epoch = 100

In [None]:
def sampling(checkpoint, n_samples, prime='the'):
    """
    Generate text with a model restored from a checkpoint.

    Reads the notebook globals vocab_size, batch_size, sequence_length, embed_dim, grad_clip,
    dropout, lstm_size, int2vocab and vocab2int.

    <argument>
    checkpoint    : checkpoint path handed to tf.train.Saver.restore
    n_samples     : how many words will be generated after the prime word
    prime         : starting word that is fed into the lstm and used to generate the other
                    n_samples words. If None, a random word of the vocabulary is used
    </argument>
    :return:  Generated text of n_samples+1 words with the prime word as the first word
    :raises KeyError: if prime is not in the vocabulary
    """
    if prime is None:
        # Same id -> word lookup that is used for the sampled ids below.
        prime = int2vocab[np.random.randint(vocab_size)]
    if prime not in vocab2int:
        raise KeyError("prime word {!r} is not in the vocabulary".format(prime))

    model = RNN(vocab_size=vocab_size, batch_size=batch_size,
                seq_length=sequence_length, embed_dim=embed_dim, grad_clip=grad_clip,
                dropout=dropout, lstm_size=lstm_size, sampling=True)
    samples = [prime]
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)

        # Single-word input buffer; the dtype matches the int32 `inputs` placeholder.
        x = np.zeros((1, 1), dtype=np.int32)
        x[0, 0] = vocab2int[prime]

        # Every step feeds the most recent word together with the carried LSTM state,
        # samples the next word and makes it the next input. The prime word is therefore
        # fed exactly once and every sampled word is fed back into the network.
        for _ in range(n_samples):
            feed = {model.inputs: x, model.initial_state: new_state}
            preds, new_state = sess.run([model.probs, model.final_state], feed)
            p_ = pick_top_n(preds, vocab_size)
            samples.append(int2vocab[p_])
            x[0, 0] = p_

    generated_text = ' '.join(samples)
    punctuation = utils.punctuation_lookup()

    # Revert tokenized punctuation back to untokenized punctuation
    for key, value in punctuation.items():
        generated_text = generated_text.replace(value, '{}'.format(key))
    return generated_text

In [None]:
# Restore the most recent checkpoint found under 'checkpoints' and print text seeded with 'hello'.
checkpoint = tf.train.latest_checkpoint('checkpoints')
generated = sampling(checkpoint, n_samples=1000, prime='hello')
print(generated)