In [1]:
import numpy as np
from io import StringIO


""" Future ideas:
    * Create Variable Batch Sizes: Transition between batches are then included in modeling the language... any possible good?
"""

class TextLoader():
    """ Helper function to load in (batches of) text """
    
    def __init__(self, batch_size, seq_length, input_file):
        """ Load data to generate text from. Set batch pointer to zero."""
        self.ip = 0 #'insruction' pointer
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.input_file = input_file
        
        with open(self.input_file, 'r') as text_source:
            #self.text = text = np.loadtxt(text_source)
            #print self.text
            self.text = text_source.readlines()[0]
            #print(text)
            #must open up file..
            
            self.chars = list(set(self.text))
            print(sorted(self.chars))
            
            self.vocab_size = len(self.chars)
            print("Vocab size %d" % self.vocab_size)
            
            self.data_size = len(self.text)
            print("Entire data size %d" % self.data_size)
                
        self.map_char2num = { ch:i for i,ch in enumerate(self.chars) }
        self.map_num2char = { i:ch for i,ch in enumerate(self.chars) }
        
        self.num_batches = self.data_size / (self.batch_size * self.seq_length)
        print("Number of batches %d" % self.num_batches)
        self.epochs_through = 0
        
        self.create_batches()
        
    def create_batches(self):
        """ Creates a number of batches, saved in self.x_batches and self.y_batches"""
        
        if (self.num_batches == 0):
            assert False, "Not enough data to receive batch! :: In function next_batch_pointer of class TextLoader()"
        
        processed_text = self.char2num(self.text)   #turn into numbers 
        processed_text = processed_text[:self.num_batches * self.batch_size * self.seq_length]
        
        x = processed_text
        y = np.zeros(x.shape)
        y[:-1] = np.copy(x[1:])
        y[-1] = np.copy(x[0])
        
        self.X_batches = np.split(x, self.num_batches)
        self.y_batches = np.split(y, self.num_batches)     
        
        
    #################
    #Helper functions
    def char2num(self, char_arr):
        out = np.zeros(len(char_arr), dtype=np.int32)  #will be changed to one-hot anyways... #dtype=float32 if worked on this
        for i in range(out.shape[0]):
            out[i] = self.map_char2num[ char_arr[i] ]
        return out
    
    def num2char(self, num_arr):
        out = ""
        for i in range(num_arr.shape[0]):
            out += self.map_char2num[ num_arr[i] ]
        #out = np.zeros(len(char_arr), dtype=np.int32)  #will be changed to one-hot anyways... #dtype=float32 if worked on this
        return out
    #Helper functions
    #################
    
    
    def get_next_batch(self):
        """ Get next batch of data. This should be a multiple of a sequence 
        length, and a multiple of the batch size """
        x, y = self.X_batches[self.ip], self.y_batches[self.ip]
        self.ip += 1
        if self.ip == self.num_batches:
            self.epochs_through += 1
            print "Epochs through: ", self.epochs_through
            self.reset_batch_pointer()
            
        return x, y
           
    def reset_batch_pointer(self):
        self.ip = 0
        return True
    

if __name__ == "__main__":
    TextLoader = TextLoader(batch_size = 32, 
                            seq_length = 128, #the longer the better
                            input_file = 'data/dailymail_header/input.txt'
                           )
    
    iterations_for_epoch = (TextLoader.data_size) / (TextLoader.batch_size * TextLoader.seq_length) #bcs indexing is from zero...
    print
    print "Iterations through data", iterations_for_epoch
    print
    
    print "Number of batches saved in X_batches:", len(TextLoader.X_batches)
    
    for _ in range(iterations_for_epoch * 10):
        x, y = TextLoader.get_next_batch()
        print x.shape, y.shape
    
        

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|']
Vocab size 82
Entire data size 199442
Number of batches 48

Iterations through data 48

Number of batches saved in X_batches: 48
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (4096,)
(4096,) (409

In [12]:
import numpy as np 
import tensorflow as tf 
from tensorflow.python.ops import rnn_cell
from tensorflow.python.ops import seq2seq
import pip
import os

from tensorflow.python.ops import rnn


class Model():
    def __init__(self, batch_size, seq_length, lstm_size, num_layers, grad_clip, vocab_size):
        """ Build the actual model """
        #Define crucial hyperparameters / parameters
        self.lr = tf.Variable(0.0, trainable=False)        
        
        #Define input and output
        self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.output_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        
        #Define the model
        cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size) #can choose if basic or otherwise later on...
        self.cell = cell = rnn_cell.MultiRNNCell([cell] * num_layers)
        self.initial_state = cell.zero_state(batch_size, tf.float32)
        
        with tf.variable_scope("lstm", reuse=True):
            softmax_w = tf.get_variable("softmax_w", [lstm_size, vocab_size])
            softmax_b = tf.get_variable("softmax_b", [vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [vocab_size, lstm_size])
                inputs = tf.nn.embedding_lookup(embedding, self.input_data)
                #some further processing needed?
                
        
        def loop(prev, _):
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)
        
        outputs, states = seq2seq.rnn_decoder(
                                            inputs, 
                                            self.initial_state, 
                                            cell, 
                                            loop_function=loop,
                                            scope='lstm')
        
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        
        loss = seq2seq.sequence_loss_by_example(
                                                [self.logits],
                                                [self.output_data],
                                                [tf.ones([batch_size * seq_length])],
                                                vocab_size
                                                )
        self.cost = tf.reduce_sum(loss) / batch_size / seq_length
        self.final_state = last_state
        
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars)) # what happens for one single iteration
        
        
if __name__ == "__main__":
    pass
    Model = Model(batch_size=32, 
                  seq_length=128, 
                  lstm_size=512, 
                  num_layers=2, 
                  grad_clip=5,
                  vocab_size=82
                 )
    
    

ValueError: Variable lstm/embedding does not exist, disallowed. Did you mean to set reuse=None in VarScope?

In [8]:
def train(num_epochs, learning_rate, decay_rate, batch_size, seq_length, input_file):

	TextLoader = TextLoader(batch_size = batch_size, 
                            seq_length = seq_length, #the longer the better
                            input_file = input_file
                           )
    
    iterations_for_epoch = (TextLoader.data_size) / (TextLoader.batch_size * TextLoader.seq_length) #bcs indexing is from zero...
    

	model = Model(batch_size=32, 
                  seq_length=128, 
                  lstm_size=512, 
                  num_layers=2, 
                  grad_clip=5,
                  vocab_size=TextLoader.vocab_size
                 )

	with tf.Sess() as sess:
		tf.initialize_all_variables().run()

		#check again what substitutes one epoch...
		for e in range(num_epochs):
			tf.assign(model.lr, learning_rate * (decay_rate ** e))
			TextLoader.reset_batch_pointer()
            state = sess.run(model.initial_state) #why initial_state?...

            for b in range(TextLoader.num_batches):
                start = time.time()
                x, y = TextLoader.get_next_batch()
                feed = {
                		model.input_data: x, 
                		model.output_data: y, 
                		model.initial_state: state
                		}
                train_loss, state, _ = sess.run([model.cost, model.final_state, model.train_op], feed)
                end = time.time()
                print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(e * TextLoader.num_batches + b,
                            num_epochs * TextLoader.num_batches,
                            e, train_loss, end - start))
                


if __name__ == "__main__":
	train(
		num_epochs = 5, 
		learning_rate = 1e-2, 
		decay_rate = 0.8, 
		batch_size = 32, 
		seq_length = 128, 
		input_file = 'data/dailymail_header/input.txt'
		)


IndentationError: unindent does not match any outer indentation level (<ipython-input-8-2674f478c680>, line 8)