In [1]:
# Import the required libraries
import os
import numpy as np
import tensorflow as tf

In [2]:
# We read the text(pg2265.txt) to remove the beginning
# portion(that contains legal description), then construct 
# the dictionary based on the text

# Processing the document
with open('pg2265.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
# Excluding the legal portion    
text = text[15858:]

# Getting all the unique characters in the text
chars = set(text)

# Dictionary that maps each character to a integer
char2int = {ch:i for i, ch in enumerate(chars)}

# Dictionary that maps integers to characters
int2char = dict(enumerate(chars))

text_ints = np.array([char2int[ch] for ch in text], dtype=np.int32)

In [3]:
print(char2int)

{';': 0, '!': 1, 'R': 2, '[': 3, 'w': 4, 't': 5, "'": 6, 'k': 7, 'G': 8, 'K': 9, ']': 10, 'm': 11, 'D': 12, 'o': 13, 'x': 14, 'F': 15, 'p': 16, ' ': 17, 'B': 18, 'c': 19, 'i': 20, '(': 21, '1': 22, 'E': 23, 'd': 24, 'b': 25, 'L': 26, 's': 27, 'P': 28, '-': 29, ')': 30, '&': 31, 'v': 32, 'O': 33, 'f': 34, 'C': 35, 'H': 36, 'a': 37, 'Y': 38, 'Z': 39, 'I': 40, 'T': 41, 'j': 42, '.': 43, 'g': 44, 'A': 45, ',': 46, '\n': 47, 'y': 48, ':': 49, 'u': 50, 'W': 51, 'M': 52, 'h': 53, 'q': 54, 'z': 55, 'e': 56, 'r': 57, 'V': 58, 'S': 59, '?': 60, 'Q': 61, 'n': 62, 'l': 63, 'N': 64}


In [4]:
print(int2char)

{0: ';', 1: '!', 2: 'R', 3: '[', 4: 'w', 5: 't', 6: "'", 7: 'k', 8: 'G', 9: 'K', 10: ']', 11: 'm', 12: 'D', 13: 'o', 14: 'x', 15: 'F', 16: 'p', 17: ' ', 18: 'B', 19: 'c', 20: 'i', 21: '(', 22: '1', 23: 'E', 24: 'd', 25: 'b', 26: 'L', 27: 's', 28: 'P', 29: '-', 30: ')', 31: '&', 32: 'v', 33: 'O', 34: 'f', 35: 'C', 36: 'H', 37: 'a', 38: 'Y', 39: 'Z', 40: 'I', 41: 'T', 42: 'j', 43: '.', 44: 'g', 45: 'A', 46: ',', 47: '\n', 48: 'y', 49: ':', 50: 'u', 51: 'W', 52: 'M', 53: 'h', 54: 'q', 55: 'z', 56: 'e', 57: 'r', 58: 'V', 59: 'S', 60: '?', 61: 'Q', 62: 'n', 63: 'l', 64: 'N'}


In [5]:
print(text_ints)

[47 41 53 ..., 56 43 47]


In [6]:
# Reshaping the data into batches of sequences
def reshape_data(sequence, batch_size, num_steps):
    
    batch_length = batch_size * num_steps
    num_batches = int(len(sequence) / batch_length)
    
    if num_batches*batch_length + 1 > len(sequence):
        num_batches = num_batches - 1
        
    # Truncate the sequence at the end to get rid of 
    # remaining charcaters that do not make a full batch
    x = sequence[0 : num_batches*batch_length]
    y = sequence[1 : num_batches*batch_length + 1]
    
    # Split x & y into a list batches of sequences: 
    x_batch_splits = np.split(x, batch_size)
    y_batch_splits = np.split(y, batch_size)
    
    # Stack the batches together: batch_size*batch_length
    x = np.stack(x_batch_splits)
    y = np.stack(y_batch_splits)
    
    return(x,y)    

In [7]:
# Splitting x & y into mini-batches where each row is
# a sequence with length = num_steps

def create_batch_generator(data_x, data_y, num_steps):
    batch_size, batch_length = data_x.shape
    
    num_batches = int(batch_length/num_steps)
    
    for b in range(num_batches):
        yield(data_x[:, b*num_steps:(b+1)*num_steps],
              data_y[:, b*num_steps:(b+1)*num_steps])
    

In [8]:
# Defining RNN class 

class CharacterRNN(object):
    
    # boolean sampling decides whether instance is for building 
    # the graph in training mode or sampling mode
    
    def __init__(self, num_classes, batch_size=64, 
                 num_steps=100, lstm_size=128, num_layers=1,
                 learning_rate=0.001, keep_prob=0.5, 
                 grad_clip=5, sampling=False):
        
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob
        self.grad_clip = grad_clip
        
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            
            self.build(sampling=sampling)
            
            self.saver = tf.train.Saver()
            
            self.init_op = tf.global_variables_initializer()
        
        
    # Builds the computation graph for our RNN
    def build(self, sampling):
        if(sampling==True):
            batch_size, num_steps = 1, 1
        else:
            batch_size = self.batch_size
            num_steps = self.num_steps

        # Defining the input placeholders        
        tf_x = tf.placeholder(tf.int32, 
                              shape=[batch_size, num_steps],
                              name='tf_x')

        tf_y = tf.placeholder(tf.int32, 
                              shape=[batch_size, num_steps],
                              name='tf_y')

        tf_keepprob = tf.placeholder(tf.float32, 
                                     name='tf_keepprob')

        # One-hot encoding
        x_onehot = tf.one_hot(tf_x, depth=self.num_classes)
        y_onehot = tf.one_hot(tf_y, depth=self.num_classes)
        
        # Building the Multi-Layered RNN cell
        cells = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.DropoutWrapper(
                        tf.contrib.rnn.BasicLSTMCell(self.lstm_size), 
                        output_keep_prob=tf_keepprob)
                     for _ in range(self.num_layers)])
        
        # Defining the initial state(zeros with required dimensions)
        self.initial_state = cells.zero_state(batch_size, tf.float32)
        
        # Run each sequence step through the RNN
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cells, x_onehot,
                                                           initial_state=self.initial_state)
        
        print(' <<< lstm_outputs >>>', lstm_outputs)
        
        seq_output_reshaped = tf.reshape(lstm_outputs, shape=[-1, self.lstm_size],
                                         name='seq_output_reshaped')
        
        logits = tf.layers.dense(inputs=seq_output_reshaped,
                                 units=self.num_classes,
                                 activation=None,
                                 name='logits')
        
        
        proba = tf.nn.softmax(logits, name='probabilities')
        print(proba)
        
        y_reshaped = tf.reshape(y_onehot, shape=[-1, self.num_classes],
                                name='y_reshaped')
        
        cost = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(logits=logits, 
                                                            labels=y_reshaped),
                    name='cost')
        
        # Gradient Clipping to avoid exploding gradients
        tvars = tf.trainable_variables()
        
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 
                                          self.grad_clip)
        
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        
        train_op = optimizer.apply_gradients(zip(grads, tvars), name='train_op')
        
    # Training our RNN    
    def train(self, train_x, train_y,
              num_epochs, ckpt_dir='./model/'):
        
        # Creating the check-point directory
        if not os.path.exists(ckpt_dir):
            os.mkdir(ckpt_dir)
            
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            
            n_batches = int(train_x.shape[1] / self.num_steps)
            iterations = n_batches*num_epochs
            
            for epoch in range(num_epochs):
                
                # Train Network
                new_state = sess.run(self.initial_state)
                loss = 0
                
                # Generates Batches
                bGen = create_batch_generator(train_x, train_y, self.num_steps)
                
                for b, (batch_x, batch_y) in enumerate(bGen, 1):
                    iteration = epoch*n_batches + b
        
                    feed = {
                            'tf_x:0' : batch_x,
                            'tf_y:0' : batch_y,
                            'tf_keepprob:0' : self.keep_prob,
                            self.initial_state : new_state
                    }
            
                    batch_cost, _, new_state = sess.run(['cost:0', 'train_op', 
                                                         self.final_state], feed_dict = feed)
                
                    if iteration % 10 == 0:
                        
                        print('Epoch %d/%d Iteration %d'
                              '| Training loss: %.4f' % (epoch+1, num_epochs, iteration,
                                                       batch_cost))
                        
                # Saving the trained model
                self.saver.save(sess, os.path.join(ckpt_dir, 'language_modelling.ckpt'))
    
    # Predictions for RNN
    def sample(self, output_length, 
               ckpt_dir, starter_seq="The "):
        
        observed_seq = [ch for ch in starter_seq]    
        
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(
                sess, 
                tf.train.latest_checkpoint(ckpt_dir))
            
            # Running the model using the starter sequence
            new_state = sess.run(self.initial_state)
            for ch in starter_seq:
                
                x = np.zeros((1, 1))
                x[0,0] = char2int[ch]
                feed = {'tf_x:0': x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: new_state}
                
                proba, new_state = sess.run(
                        ['probabilities:0', self.final_state], 
                        feed_dict=feed)

            ch_id = get_top_char(proba, len(chars))
            observed_seq.append(int2char[ch_id])
            
            # Running the model using the updated observed_seq
            for i in range(output_length):
                
                x[0,0] = ch_id
                feed = {'tf_x:0': x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: new_state}
                
                proba, new_state = sess.run(
                        ['probabilities:0', self.final_state], 
                        feed_dict=feed)

                ch_id = get_top_char(proba, len(chars))
                observed_seq.append(int2char[ch_id])

        return ''.join(observed_seq) 
        

In [9]:
# Method to return to characters    
def get_top_char(probas, char_size, top_n=5):
    p = np.squeeze(probas)

    # Sort the characters in descending order
    p[np.argsort(p)[:-top_n]] = 0.0

    p = p/np.sum(p)

    # Return a randomly chosen character from top ones
    char_id = np.random.choice(char_size, 1, p=p)[0]

    return(char_id)

In [10]:
batch_size = 64
num_steps = 100

train_x, train_y = reshape_data(text_ints, 
                                batch_size, 
                                num_steps)

# Creating RNN onject
rnn = CharacterRNN(num_classes=len(chars), batch_size=batch_size)

# Training our RNN
rnn.train(train_x, train_y, 
          num_epochs=20,
          ckpt_dir='./model-20/')

 <<< lstm_outputs >>> Tensor("rnn/transpose_1:0", shape=(64, 100, 128), dtype=float32)
Tensor("probabilities:0", shape=(6400, 65), dtype=float32)
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

Epoch 1/20 Iteration 10| Training loss: 3.6465
Epoch 1/20 Iteration 20| Training loss: 3.3772
Epoch 2/20 Iteration 30| Training loss: 3.2907
Epoch 2/20 Iteration 40| Training loss: 3.2471
Epoch 2/20 Iteration 50| Training loss: 3.2281
Epoch 3/20 Iteration 60| Training loss: 3.2198
Epoch 3/20 Iteration 70| Training loss: 3.1951
Epoch 4/20 Iteration 80| Training loss: 3.1769
Epoch 4/20 Iteration 90| Training loss: 3.1508
Epoch 4/20 Iteration 100| Training loss: 3.1378
Epoch 5/20 Iteration 110| Training loss: 3.1304
Epoch 5/20 Iteration 120| Training loss: 3.1029
Epoch 6/20 Iteration 130| Training loss: 3.0806
Epoch 6/20 Iteration 140| Training loss: 3.0533

In [11]:
# We create a new instance of the CharacterRNN() class in the sampling mode
# mode by specifying sampling=True

np.random.seed(123)

# Creating a RNN object with sampling mode
rnn = CharacterRNN(len(chars), sampling=True)

# Generate a sequence of 500 characters
print(rnn.sample(ckpt_dir='./model-20/', output_length=500))

 <<< lstm_outputs >>> Tensor("rnn/transpose_1:0", shape=(1, 1, 128), dtype=float32)
Tensor("probabilities:0", shape=(1, 65), dtype=float32)
INFO:tensorflow:Restoring parameters from ./model-20/language_modelling.ckpt
The as shand the win the heer of it oue merar the mess and ther and,
Hat sond wart sing and ther and tis so the sis thas iter ther ind, bertis it and,
Har as it and sathe hearthen to my were me thart, an tourd thet
  am. I the he that tine te sould in shis ofrert ite mit or toue sinte thore


   Hol. Hat. wil  aur thandes the wise herde the terie thangeris to d ion the mere se ang meser indt on this ande hor too hame the terine mat tis at in aster it arer tore mang so mind at thit shoue soree tere so
