# Creating and running the LSTM model to generate DailyMail-worthy articles

This notebook is only used for rapid prototyping and debugging. For production, everything will need to be moved into a Python script, because Euler only accepts minimalistic scripts.

We will need a sidebar...

In [5]:
import numpy as np
import tensorflow as tf

---
1. (Hyper)Parameter setting
---

We need to collect all possible characters into one set. We also need to turn all characters into one-hot vectors:

In [None]:
# Presumably the path of the raw headline corpus (inferred from the file name).
# NOTE(review): `filepath` is not referenced by any other visible cell, and
# params['data_dir'] points at 'data/DailyMail_titles.txt' instead -- confirm
# which of the two paths is the intended one.
filepath = 'DailyMail_title.txt'



In [3]:
# Parameters to achieve 82 perplexity within 1 day
#params = {batch_size=20,
#                seq_length=35,
#                layers=2,
#                decay=1.15,
#                rnn_size=1500,
#                dropout=0.65,
#                init_weight=0.04,
#                lr=1,
#                vocab_size=10000,
#                max_epoch=14,
#                max_max_epoch=55,
#                max_grad_norm=10
#                }

In [8]:
# Parameters to achieve 115 perplexity within 1 hour
# Global hyper-parameter table read by the prototype cells of this notebook
# (e.g. 'rnn_size' by LSTM(), 'layers' and 'dropout' by create_network()).
params = dict(
    data_dir='data/DailyMail_titles.txt',
    batch_size=20,
    seq_length=20,
    layers=2,
    decay=2,
    rnn_size=200,
    dropout=0,
    max_epoch=4,
    max_max_epoch=13,
    max_grad_norm=5,
)

---
2. Defining helper functions for the model
---

Consider importing already existing weights from either Karpathy or someone else

We use a predefined model (https://github.com/sherjilozair/char-rnn-tensorflow/blob/master/model.py)
simply because the purpose of this project is to get to know TensorFlow a little better...

First of all, define a text and tensor loader helper class

### Helper Class 'TextLoader' for Preprocessing

In [16]:
import codecs
import os
import collections
from six.moves import cPickle

class TextLoader():
    """Turn a text file into character-index batches for next-character prediction.

    Typical use: construct the loader, call ``preprocess`` to build the
    vocabulary and the index tensor, call ``create_batches`` to cut the
    tensor into ``(input, target)`` pairs, then call ``next_batch``
    repeatedly (``reset_batch_pointer`` rewinds to the first batch).
    """

    def __init__(self, data_dir, batch_size, seq_length, input_filename=None, checkpoint_filename=None):
        """Store the configuration; no file is read here.

        data_dir: location of the corpus (stored only).
        batch_size: number of sequences per batch.
        seq_length: number of characters per sequence.
        input_filename: optional corpus file name (stored only).
        checkpoint_filename: optional ``.npy`` cache file name (stored only).
        """
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.input_file = input_filename
        self.checkpoint_file = checkpoint_filename  # format .npy
        self.reset_batch_pointer()

    def preprocess(self, input_file, tensor_file, load=False, prev_model=None):
        """Build the vocabulary from ``input_file`` and the index tensor.

        input_file: path of the UTF-8 text corpus.
        tensor_file: path of the cached index tensor (``.npy`` format).
        load: when true, read the tensor from ``tensor_file`` instead of
            re-encoding the corpus; otherwise encode and write the cache.
        prev_model: accepted for compatibility, currently unused.
        """
        with codecs.open(input_file, 'r', encoding='utf-8') as f:
            data = f.read()

        # Sorting makes the character -> index mapping reproducible between
        # runs; plain set iteration order is not guaranteed, which would make
        # a cached tensor disagree with a freshly built vocabulary.
        self.chars = sorted(set(data))
        self.data_size = len(data)
        self.vocab_size = len(self.chars)
        # very simple dictionaries assigning numbers to characters and back
        self.char_to_ix = {ch: i for i, ch in enumerate(self.chars)}
        self.ix_to_char = {i: ch for i, ch in enumerate(self.chars)}

        if load:
            self.tensor = np.load(tensor_file)
        else:
            self.tensor = np.array([self.char_to_ix[ch] for ch in data], dtype=np.int32)
            # Saving through a file object writes to exactly `tensor_file`
            # (np.save would append '.npy' to a bare file name), so that a
            # later load=True call finds the file again.
            with open(tensor_file, 'wb') as f:
                np.save(f, self.tensor)

    def create_batches(self):
        """Cut the tensor into ``num_batches`` input/target batches.

        Each batch has shape ``(batch_size, seq_length)``. Targets are the
        inputs shifted left by one character; the very last target wraps
        around to the first character. Trailing characters that do not fill
        a whole batch are dropped.

        Raises AssertionError when the corpus is too short for one batch.
        """
        chars_per_batch = self.batch_size * self.seq_length
        self.num_batches = self.tensor.size // chars_per_batch
        if self.num_batches == 0:
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError("Not enough data to receive batch!")

        xdata = self.tensor[:self.num_batches * chars_per_batch]
        ydata = np.roll(xdata, -1)
        self.x_batches = np.split(xdata.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(ydata.reshape(self.batch_size, -1), self.num_batches, 1)

    def next_batch(self):
        """Return the current ``(x, y)`` batch and advance the pointer.

        Raises IndexError once all ``num_batches`` batches were consumed
        without calling ``reset_batch_pointer``.
        """
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        self.batch_pointer = self.pointer  # legacy alias, kept in sync
        return x, y

    def reset_batch_pointer(self):
        """Rewind to the first batch."""
        self.pointer = 0
        self.batch_pointer = 0  # legacy alias, kept in sync
        

Secondly, define the model that we want to train

### Model Class 'Model' defining the Multi-Layer-LSTM Network

In [17]:
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq

class Model():
    """Multi-layer LSTM character language model (TensorFlow 0.x graph API)."""

    def __init__(self, params, infer=False):
        """Build the training (or sampling) graph.

        params: dict providing 'batch_size', 'seq_length', 'num_layers',
            'rnn_size', 'vocab_size' and 'grad_clip'.
        infer: build the graph for sampling; batch size and sequence length
            are then forced to 1 because ``sample`` feeds one character at a
            time, and each step's prediction is fed back as the next input.

        Reference settings noted for ~81 perplexity within one day:
        batch_size=20, seq_length=35, num_layers=2, decay=1.15,
        rnn_size=1500, dropout=0.65, init_weight=0.04, lr=1,
        vocab_size=10000, max_epoch=14, max_max_epoch=55, max_grad_norm=10.
        """
        self.params = params
        batch_size = 1 if infer else params['batch_size']
        seq_length = 1 if infer else params['seq_length']
        rnn_size = params['rnn_size']
        vocab_size = params['vocab_size']

        lstm = rnn_cell.BasicLSTMCell(rnn_size)
        self.cell = cell = rnn_cell.MultiRNNCell([lstm] * params['num_layers'])

        # Handling input/output; see
        # http://karpathy.github.io/2015/05/21/rnn-effectiveness/ for an
        # annotated one-hot version.
        self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
            softmax_b = tf.get_variable("softmax_b", [vocab_size])

            # Character ids are mapped to dense vectors by a plain table
            # lookup, then split into one (batch, rnn_size) tensor per step.
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
                inputs = tf.split(1, seq_length, tf.nn.embedding_lookup(embedding, self.input_data))
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            # Feed the most likely character of the previous step back in.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            # stop_gradient: the fed-back symbol must not train the embedding
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        # The feedback loop is only used for sampling; during training every
        # step must see the true input character.
        outputs, last_state = seq2seq.rnn_decoder(inputs, self.initial_state, cell,
                                                  loop_function=loop if infer else None,
                                                  scope='rnnlm')
        output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        loss = seq2seq.sequence_loss_by_example([self.logits],
                                                [tf.reshape(self.targets, [-1])],
                                                [tf.ones([batch_size * seq_length])],
                                                vocab_size)

        self.cost = tf.reduce_sum(loss) / batch_size / seq_length
        self.final_state = last_state
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), params['grad_clip'])
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(list(zip(grads, tvars)))

    @staticmethod
    def _weighted_pick(weights):
        """Draw an index with probability proportional to ``weights``."""
        cumulative = np.cumsum(weights)
        threshold = np.random.rand() * cumulative[-1]
        # side='right' never selects an entry whose weight is zero; the clamp
        # guards against the threshold rounding up to the total.
        picked = int(np.searchsorted(cumulative, threshold, side='right'))
        return min(picked, len(cumulative) - 1)

    def sample(self, sess, chars, vocab, num=200, prime='The ', sample_type=1):
        """Generate ``num`` characters continuing ``prime``.

        The model must have been built with ``infer=True`` because one
        character is fed per step.

        sess: session holding the trained variables.
        chars: sequence mapping index -> character.
        vocab: dict mapping character -> index.
        num: number of characters to generate.
        prime: non-empty seed text used to warm up the recurrent state.
        sample_type: accepted for compatibility, currently unused (sampling
            is always weighted by the predicted probabilities).

        Returns ``prime`` followed by the generated characters.
        Raises ValueError when ``prime`` is empty.
        """
        if not prime:
            raise ValueError("prime must contain at least one character")

        # we must start from a zero state and warm it up on the seed text
        state = self.cell.zero_state(1, tf.float32).eval()
        for char in prime[:-1]:
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state: state}
            # the last hidden state has to be fed back in on the next step
            [state] = sess.run([self.final_state], feed)

        ret = prime          # the text to be returned
        char = prime[-1]     # the character to start generating from
        for _ in range(num):
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed = {self.input_data: x, self.initial_state: state}
            [probs, state] = sess.run([self.probs, self.final_state], feed)
            pred = chars[self._weighted_pick(probs[0])]
            ret += pred
            char = pred
        return ret


### Third, train the model



In [1]:
def train(params):
    """Train the character-level LSTM and write periodic checkpoints.

    params is a dict with the keys
        'data_dir', 'batch_size', 'seq_length'  -- passed to TextLoader
        'rnn_size', 'num_layers', 'grad_clip'   -- read by Model
        'num_epochs', 'learning_rate', 'decay_rate'
        'save_dir', 'save_every'
        'init_from'   -- optional; checkpoint directory to resume from
        'input_file', 'tensor_file' -- optional; corpus / cache paths

    Raises AssertionError when 'init_from' is set but does not contain a
    usable checkpoint.
    """
    data_loader = TextLoader(params['data_dir'],
                             params['batch_size'],
                             params['seq_length'])

    # The loader has to be preprocessed and batched before `num_batches` and
    # `vocab_size` exist.
    # NOTE(review): the default file names follow the char-rnn-tensorflow
    # layout this notebook is based on -- confirm against the real data dir.
    input_file = params.get('input_file', os.path.join(params['data_dir'], 'input.txt'))
    tensor_file = params.get('tensor_file', os.path.join(params['data_dir'], 'data.npy'))
    data_loader.preprocess(input_file, tensor_file)
    data_loader.create_batches()

    ckpt = None
    init_from = params.get('init_from')
    if init_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(init_from), " %s must be a a path" % init_from

        ckpt = tf.train.get_checkpoint_state(init_from)
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

    # The vocabulary size is only known after reading the corpus; hand it to
    # the model through a copy so the caller's dict is left untouched.
    model = Model(dict(params, vocab_size=data_loader.vocab_size))

    save_dir = params['save_dir']
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')

    num_epochs = params['num_epochs']
    total_steps = num_epochs * data_loader.num_batches

    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        if ckpt is not None:
            # resume from the validated checkpoint instead of from scratch
            saver.restore(sess, ckpt.model_checkpoint_path)

        for e in range(num_epochs):
            # exponentially decayed learning rate, updated once per epoch
            sess.run(tf.assign(model.lr, params['learning_rate'] * (params['decay_rate'] ** e)))
            data_loader.reset_batch_pointer()   # one epoch is one pass through the data
            state = model.initial_state.eval()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                # the recurrent state is carried over from batch to batch
                feed = {model.input_data: x, model.targets: y, model.initial_state: state}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()

                step = e * data_loader.num_batches + b
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}".format(
                    step, total_steps, e, train_loss, end - start))
                # save periodically and always after the very last step
                if step % params['save_every'] == 0 or step == total_steps - 1:
                    saver.save(sess, checkpoint_path, global_step=step)
                    print("model saved to {}".format(checkpoint_path))
        
        
        

In [19]:
import time
import os
from six.moves import cPickle

# Settings handed to train(); 'vocab_size' is not listed here.
parameters = {
    'data_dir' : 'data/DailyMail',
    'save_dir' : 'save',
    'rnn_size' : 128,
    'num_layers' : 2,
    'batch_size' : 50,
    'seq_length' : 50,
    'num_epochs' : 50,
    'save_every' : 1000,
    'grad_clip' : 5.0,
    'learning_rate' : 0.002, #maybe build a neural net for that
    'decay_rate' : 0.97,
    # train() asserts that this is an existing directory whenever it is not
    # None; use None to train from scratch.
    'init_from' : 'checkpoint', # again, neural net for that ?
}


def main():
    """Run one training job with the module-level ``parameters``."""
    train(parameters)


# Runs when executed as a script or as a notebook cell, but not on import.
if __name__ == '__main__':
    main()

In [None]:
# Scratch float placeholders whose fixed dimensions are read from the global
# `params` dict; the leading dimension is left open (None).
# NOTE(review): the trailing dims here are (seq_length, batch_size), while the
# placeholder created inside the `tf.Graph()` cell further down uses
# (batch_size, seq_length) -- confirm which layout is intended.
x = tf.placeholder(tf.float32, [None, params['seq_length'], params['batch_size']])
y = tf.placeholder(tf.float32, [None, params['seq_length']])

In [11]:
def LSTM(x, prev_c, prev_h):
    """Apply a single LSTM step and return ``(next_c, next_h)``.

    x: input of the current step.
    prev_c: previous cell state.
    prev_h: previous hidden state.

    Assumes all three are float tensors of shape (batch, params['rnn_size'])
    -- TODO confirm against the caller.

    NOTE(review): every call creates fresh "Wx"/"Wh" variables, so weights are
    not shared between time steps or layers; switch to tf.get_variable under a
    reusable variable scope before using this for training.
    """
    size = params['rnn_size']

    # Calculate all four gate pre-activations at once: (batch, 4 * size)
    w_x = tf.Variable(tf.truncated_normal([size, 4 * size], stddev=0.1), name="Wx")
    w_h = tf.Variable(tf.truncated_normal([size, 4 * size], stddev=0.1), name="Wh")
    gates = tf.matmul(x, w_x) + tf.matmul(prev_h, w_h)

    def gate_slice(k):
        # k-th block of `size` columns; tf.slice is used because its
        # signature is the same across TensorFlow versions.
        return tf.slice(gates, [0, k * size], [-1, size])

    # Fetch individual gates
    in_gate = tf.sigmoid(gate_slice(0))
    in_transform = tf.tanh(gate_slice(1))
    forget_gate = tf.sigmoid(gate_slice(2))
    out_gate = tf.sigmoid(gate_slice(3))

    # Calculating output: gates act element-wise, not as matrix products
    next_c = forget_gate * prev_c + in_gate * in_transform
    next_h = out_gate * tf.tanh(next_c)

    return next_c, next_h
    

In [13]:
# Scratch cell: opens a fresh default graph plus a session and only declares
# two float placeholders inside it; nothing is run with `sess` yet.
# NOTE(review): `x` uses (batch_size, seq_length) as trailing dims here, the
# reverse of the module-level placeholder defined in an earlier cell --
# confirm the intended layout.
with tf.Graph().as_default(), tf.Session() as sess:
    
    x = tf.placeholder(tf.float32, [None, params['batch_size'], params['seq_length']])
    y_ = tf.placeholder(tf.float32,[None, params['seq_length']] )
    

In [None]:
# Unfinished sketch of a hand-rolled multi-layer network: per layer it applies
# dropout to the previous layer's output and feeds the result through an LSTM
# step. It does not parse yet and returns nothing.
# NOTE(review), open points before this can run:
#   - `tf.split(, , prev_s)` is a syntax error; the other tf.placeholder() /
#     tf.split() calls are missing their arguments.
#   - `params` is a dict in this notebook, so `params.layers` would fail;
#     the other lines use params['layers'].
#   - `i` is never created in this function before it is indexed.
#   - the step function defined in this notebook is `LSTM`, not `lstm`.
#   - `tf.variable` and `tf.Linear` do not look like TensorFlow API names --
#     confirm what was intended.
def create_network():
    x = tf.placeholder()
    y = tf.placeholder()
    prev_s = tf.placeholder()
    #load the current word vector into var i
    
    next_s = tf.variable()
    split = tf.split(, , prev_s)

    for layer_idx in range(params.layers):
        # presumably the cell/hidden parts of this layer's previous state -- TODO confirm
        prev_c = tf.split()
        prev_h = tf.split()
        # dropout on the output of the layer below, then one LSTM step
        dropped = tf.nn.dropout(i[layer_idx - 1], params['dropout'])
        next_c, next_h = lstm(dropped, prev_c, prev_h)
        #add to tables...
        i[layer_idx] = next_h
        
    # output projection on top of the last layer (still a stub)
    h2y = tf.Linear()
    dropped = tf.nn.dropout( i[params['layers']], params['dropout'])
    pred = tf.nn
    