# Recurrent Neural Network w/ `tensorflow`

In [1]:
import os.path
import random
import datetime

import numpy as np
import tensorflow as tf

## data file path

In [2]:
# data files
data_dir = '../datasets/wikitext-2-raw'
train_file = 'wiki.train.raw'
test_file = 'wiki.test.raw'
valid_file = 'wiki.valid.raw'

# Model checkpoints
chkpt_dir = 'chkpt_dir/'
chkpt_model = os.path.join(chkpt_dir, 'model')

# create a check point dir
if tf.gfile.Exists(chkpt_dir):
    tf.gfile.DeleteRecursively(chkpt_dir)
tf.gfile.MakeDirs(chkpt_dir)

## read training data

In [3]:
data = open(os.path.join(data_dir, valid_file), 'r').read()
data = data[:10000]
print('Number of characters is {:,}'.format(len(data)))

Number of characters is 10,000


## pre-processing

In [4]:
chars = sorted(list(set(data)))
char_size = len(chars)
print('Char size: {:,}'.format(char_size))
print(chars)

Char size: 75
['\n', ' ', '"', '%', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', '–', '′']


In [5]:
char_2_idx = {ch: i for i,ch in enumerate(chars)}
idx_2_char = {i: ch for i,ch in enumerate(chars)}

## helper methods
### Generate probability for each next char

In [6]:
def sample(prediction):
    r = np.random.uniform(0, 1)
    s = 0  # store prediction character
    char_id = len(prediction) - 1
    # each char in prediction probability
    for i, pred in enumerate(prediction):
        s += pred
        if s >= r:
            char_id = i
            break
    # one hot encoding of the char
    char_one_hot = np.zeros(shape=[char_size])
    char_one_hot[char_id] = 1.
    return char_one_hot

## vectorize data

In [7]:
len_per_section = 50  # size of sentence i.e 50 char long
skip = 2  # skip of 2 will produce sth like this:
# How are you
# w are you d
# are you doin
# e you doing 
# you doing to
# ...
sections = []
next_chars = []

for i in range(0, len(data) - len_per_section, skip):
    sections.append(data[i: i+len_per_section])
    next_chars.append(data[i+len_per_section])

# Vectorize
X = np.zeros(shape=[len(sections), len_per_section, char_size])
y = np.zeros(shape=[len(sections), char_size])
print('Vectorizing...')
for i, section in enumerate(sections):
    for j, char in enumerate(section):
        X[i, j, char_2_idx[char]] = 1.
    y[i, char_2_idx[next_chars[i]]] = 1.

Vectorizing...


## Hyperparameters

In [8]:
batch_size = 256
# max_steps = 50000
# log_step = 1000
# save_every = 5000
max_steps = 20
log_step = 10
save_every = 15

hidden_nodes = 1024
test_start = 'I am thinking that '

print('Training size = {:,}'.format(len(X)))
print('Approx. steps per epochs = {:,}'.format(int(len(X)/batch_size)))

Training size = 4,975
Approx. steps per epochs = 19


## Build network

In [9]:
graph = tf.Graph()
with graph.as_default():
    # Global optimization steps
    global_step = tf.Variable(0)
    
    # inputs and outputs
    inputs = tf.placeholder(tf.float32, [batch_size, len_per_section, char_size])
    labels = tf.placeholder(tf.float32, [batch_size, char_size])
    
    # Input gate
    Wii = tf.Variable(tf.truncated_normal(shape=[char_size, hidden_nodes], mean=0, stddev=0.1))
    Wio = tf.Variable(tf.truncated_normal(shape=[hidden_nodes, hidden_nodes], mean=0, stddev=0.1))
    b_i = tf.Variable(tf.zeros(shape=[1, hidden_nodes]))
    # Forget gate
    Wfi = tf.Variable(tf.truncated_normal(shape=[char_size, hidden_nodes], mean=0, stddev=0.1))
    Wfo = tf.Variable(tf.truncated_normal(shape=[hidden_nodes, hidden_nodes], mean=0, stddev=0.1))
    b_f = tf.Variable(tf.zeros(shape=[1, hidden_nodes]))
    # Ouput gate
    Woi = tf.Variable(tf.truncated_normal(shape=[char_size, hidden_nodes], mean=0, stddev=0.1))
    Woo = tf.Variable(tf.truncated_normal(shape=[hidden_nodes, hidden_nodes], mean=0, stddev=0.1))
    b_o = tf.Variable(tf.zeros(shape=[1, hidden_nodes]))
    # Memory cell
    Wci = tf.Variable(tf.truncated_normal(shape=[char_size, hidden_nodes], mean=0, stddev=0.1))
    Wco = tf.Variable(tf.truncated_normal(shape=[hidden_nodes, hidden_nodes], mean=0, stddev=0.1))
    b_c = tf.Variable(tf.zeros(shape=[1, hidden_nodes]))
    
    # LSTM Cell
    def lstm(inputs, outputs, state):
        # (inputs*input weight) + (output*prev output weight) + bias
        input_gate = tf.nn.sigmoid(tf.matmul(inputs, Wii) + tf.matmul(outputs, Wio) + b_i)
        # (inputs*forget weight) + (output*prev output weight) + bias
        forget_gate = tf.nn.sigmoid(tf.matmul(inputs, Wfi) + tf.matmul(outputs, Wfo) + b_f)
        # (inputs*output weight) + (output*prev output weight) + bias
        output_gate = tf.nn.sigmoid(tf.matmul(inputs, Woi) + tf.matmul(outputs, Woo) + b_o)
        # (inputs*cell weight) + (output*prev output weight) + bias
        memory_cell = tf.nn.sigmoid(tf.matmul(inputs, Wci) + tf.matmul(outputs, Wco) + b_c)
        
        # !- the internal hidden state = (forget_gate * state) + (input_gate * memory_cell)
        state = forget_gate * state + input_gate * memory_cell
        output = output_gate * tf.nn.tanh(state)
        return output, state
    
    # Calculate the LSTM values over time...
    output = tf.zeros(shape=[batch_size, hidden_nodes])
    state = tf.zeros(shape=[batch_size, hidden_nodes])
    
    # unroll the net in time
    for i in range(len_per_section):
        # calc the output and state from lstm
        output, state = lstm(inputs[:, i, :], output, state)
        if i == 0:
            # store initial outputs and labels
            outputs_all_i = output
            labels_all_i = inputs[:, i+1, :]
        elif i != len_per_section - 1:
            # combine vectors along axis [not multiply]
            outputs_all_i = tf.concat(values=[outputs_all_i, output], axis=0)
            labels_all_i = tf.concat(values=[labels_all_i, inputs[:, i+1, :]], axis=0)
        else:
            # final store
            outputs_all_i = tf.concat(values=[outputs_all_i, output], axis=0)
            labels_all_i = tf.concat(values=[labels_all_i, labels], axis=0)

    # Classifier
    W = tf.Variable(tf.truncated_normal(shape=[hidden_nodes, char_size], mean=0, stddev=0.1))
    b = tf.Variable(tf.zeros(shape=[char_size]))
    logits = tf.matmul(outputs_all_i, W) + b

    # cross entropy(-ve log likelihood) & loss
    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels_all_i)
    loss = tf.reduce_mean(xentropy)

    # Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=0.1).minimize(loss, global_step=global_step)

    # =============================================================================================
    # Testing
    # =============================================================================================
    test_data = tf.placeholder(tf.float32, shape=[1, char_size])
    test_output = tf.Variable(tf.zeros(shape=[1, hidden_nodes]))
    test_state = tf.Variable(tf.zeros(shape=[1, hidden_nodes]))
    
    # Reset at the beginning of each test
    reset_test_state = tf.group(test_output.assign(tf.zeros(shape=[1, hidden_nodes])), 
                                test_state.assign(tf.zeros(shape=[1, hidden_nodes])))

    # LSTM
    test_output, test_state = lstm(test_data, test_output, test_state)
    test_prediction = tf.nn.softmax(tf.matmul(test_output, W) + b)

## Training the model

In [10]:
# time to train the model, initialize a session with a graph
with tf.Session(graph=graph) as sess:
    # standard init step
    tf.global_variables_initializer().run()
    
    offset = 0
    saver = tf.train.Saver()
    
    # for each training step
    for step in range(max_steps):
        
        # starts off as 0
        offset = offset % len(X)
        
        # calculate batch data and labels to feed model iteratively
        if offset <= (len(X) - batch_size):
            #first part
            batch_data = X[offset: offset + batch_size]
            batch_labels = y[offset: offset + batch_size]
            offset += batch_size
        # until when offset  = batch size, then we 
        else:
            #last part
            to_add = batch_size - (len(X) - offset)
            batch_data = np.concatenate((X[offset: len(X)], X[0: to_add]))
            batch_labels = np.concatenate((y[offset: len(X)], y[0: to_add]))
            offset = to_add
        
        # optimize!!!
        _, training_loss = sess.run([optimizer, loss], feed_dict={inputs: batch_data, labels: batch_labels})
        
        if step % log_step == 0:
            print('Step: {:,}\t training loss = {:.02f}\t {:%H:%M:%S %p on %a, %d %b, %Y.}'.format(step, training_loss, datetime.datetime.now()))

            if step % save_every == 0:
                saver.save(sess, chkpt_model, global_step=step)

Step: 0	 training loss = 4.55	 22:48:49 PM on Sat, 14 Oct, 2017.
Step: 10	 training loss = 175.98	 22:50:52 PM on Sat, 14 Oct, 2017.


In [13]:
test_start = 'Lorem ipsum '

with tf.Session(graph=graph) as sess:
    # init graph
    tf.global_variables_initializer().run()
    # load model
    model = tf.train.latest_checkpoint(chkpt_dir)
    saver = tf.train.Saver()
    saver.restore(sess, model)

    # set input variable to generate chars from
    reset_test_state.run() 
    test_generated = test_start

    # for every char in the input sentennce
    for i in range(len(test_start) - 1):
        # initialize an empty char store
        test_X = np.zeros(shape=(1, char_size))
        # store it in id from
        test_X[0, char_2_idx[test_start[i]]] = 1.
        # feed it to model, test_prediction is the output value
        _ = sess.run(test_prediction, feed_dict={test_data: test_X})

    
    #where we store encoded char predictions
    test_X = np.zeros((1, char_size))
    test_X[0, char_2_idx[test_start[-1]]] = 1.

    #lets generate 500 characters
    for i in range(500):
        # get each prediction probability
        prediction = test_prediction.eval({test_data: test_X})[0]
        # one hot encode it
        next_char_one_hot = sample(prediction)
        # get the indices of the max values (highest probability)  and convert to char
        next_char = idx_2_char[np.argmax(next_char_one_hot)]
        # add each char to the output text iteratively
        test_generated += next_char
        # update the char store
        test_X = next_char_one_hot.reshape((1, char_size))
    
    print(100*'=', '\n')
    print(test_generated)

INFO:tensorflow:Restoring parameters from chkpt_dir/model-0

Lorem ipsum rikm  llimsbsn tacsn ailsb  esiecobsoai,kkws lepeowgoonanreekwt pr eoppemn  oiictreapc nr ,a atdkgoks grlaanplnpkpaaigainnr ankeobgepak g lan omeedlemdtbn brrebtaaetantdpeck w a pwea asaperps ten ,esei i eaip ncaw, lem bcaielickd el earnomreiipe oippioa ,newtaaseoociigekilaa derpiac,wleeteorm,rbaooitaoe cdalabon ekmlo ptmgtarttaear el ncmapmaebelrrttospniib i emnante bgwn l,smbnwmaeel, dwpaodattineibrbgembsegpess enstsbarbaw bran rn n eladnelcadt kmatcsbgrboilaanpbe,  owgplm l  n iak oae te o ap
