# Recurrent Neural Network w/ `tensorflow`

In [1]:
import os.path
import random
import datetime

import numpy as np
import tensorflow as tf

## data file path

In [2]:
# Folder holding the raw `wikitext-2-raw` text files, relative to this notebook.
data_dir = '../datasets/wikitext-2-raw'

# One raw text file per dataset split.
train_file, test_file, valid_file = (
    'wiki.train.raw',
    'wiki.test.raw',
    'wiki.valid.raw',
)

# Checkpoint directory.
chkpt_dir = 'chkpt_dir/'

## read training data (first 100,000 characters of `wiki.valid.raw`)

In [3]:
# Read the corpus as UTF-8 text (it contains non-ASCII characters) and let the
# context manager close the file handle instead of leaking it.
# NOTE(review): this loads the *validation* split although the section is about
# training data -- confirm that is intentional.
with open(os.path.join(data_dir, valid_file), 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# Keep only the first 100,000 characters.
data = data[:100000]
print('Number of characters is {:,}'.format(len(data)))



Number of characters is 100,000


## pre-processing

In [4]:
# Vocabulary: every distinct character in `data`, in sorted order.
unique_chars = set(data)
chars = sorted(unique_chars)

# Vocabulary size.
char_size = len(chars)

print('Char size: {:,}'.format(char_size))
print(chars)

Char size: 99
['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', 'é', 'ō', 'ū', '–', '—', '′', 'の', 'カ', 'シ', 'マ', 'ル', '似', '奈', '影', '真', '良', '術']


In [5]:
# Two-way lookup between a character and its position in the sorted vocabulary.
idx_2_char = dict(enumerate(chars))
char_2_idx = dict(zip(chars, range(len(chars))))

## helper methods
### Sample the next char from the predicted probabilities

In [6]:
def sample(prediction):
    """Draw one index from a probability vector and return it one-hot encoded.

    A uniform random number in [0, 1) is compared against the running sum of
    `prediction`; the first index whose cumulative probability reaches it is
    selected.

    Args:
        prediction: 1-D sequence of probabilities, one entry per character.

    Returns:
        A 1-D float array with the same length as `prediction`, containing a
        single 1.0 at the sampled index and 0.0 elsewhere.
    """
    n_classes = len(prediction)
    threshold = np.random.uniform(0, 1)

    # Fall back to the last index when the cumulative sum never reaches the
    # threshold (e.g. the probabilities sum to slightly less than 1).
    char_id = n_classes - 1
    cumulative = 0  # running sum of the probabilities seen so far
    for i, pred in enumerate(prediction):
        cumulative += pred
        if cumulative >= threshold:
            char_id = i
            break

    # One-hot encoding sized from the distribution itself rather than from a
    # module-level global, so the function is self-contained.
    char_one_hot = np.zeros(shape=[n_classes])
    char_one_hot[char_id] = 1.
    return char_one_hot

## vectorize data

In [7]:
len_per_section = 50  # length of each input window, in characters
skip = 2  # stride between the starts of consecutive windows; a skip of 2
# produces windows that each start two characters after the previous one:
# How are you
# w are you d
# are you doin
# e you doing 
# you doing to
# ...

# Slice the corpus into fixed-length windows and remember, for each window,
# the character that immediately follows it (the prediction target).
window_starts = range(0, len(data) - len_per_section, skip)
sections = [data[start: start + len_per_section] for start in window_starts]
next_chars = [data[start + len_per_section] for start in window_starts]

# Vectorize: one-hot encode every character.
# float32 matches the dtype of the placeholders these arrays are fed into and
# halves the memory of the default float64 allocation.
X = np.zeros(shape=[len(sections), len_per_section, char_size], dtype=np.float32)
y = np.zeros(shape=[len(sections), char_size], dtype=np.float32)
print('Vectorizing...')
for i, section in enumerate(sections):
    for j, char in enumerate(section):
        X[i, j, char_2_idx[char]] = 1.
    y[i, char_2_idx[next_chars[i]]] = 1.

Vectorizing...


## Hyperparameters

In [8]:
# Sizes that shape the graph.
batch_size = 256     # leading dimension of the `inputs` / `labels` placeholders
hidden_nodes = 1024  # width of every gate weight matrix and of the LSTM state

# Training-loop settings.
# NOTE(review): these are consumed outside the visible cells -- presumably the
# total step count, the logging interval and the checkpoint interval; confirm.
max_steps = 50000
log_step = 1000
save_every = 5000

# NOTE(review): presumably the seed text for generation -- confirm at its use site.
test_start = 'I am thinking that '

# Start from an empty checkpoint directory: wipe any existing one, then recreate it.
if tf.gfile.Exists(chkpt_dir):
    tf.gfile.DeleteRecursively(chkpt_dir)
tf.gfile.MakeDirs(chkpt_dir)

n_examples = len(X)
print('Training size = {:,}'.format(n_examples))
print('Approx. steps per epochs = {:,}'.format(n_examples // batch_size))

Training size = 49,975
Approx. steps per epochs = 195


## Build network

In [None]:
graph = tf.Graph()
with graph.as_default():
    # Integer variable initialised to 0.
    # NOTE(review): presumably the training-step counter -- its use is outside this view.
    global_step = tf.Variable(0)

    # Placeholders: a batch of one-hot character windows and the one-hot
    # character that follows each window.
    inputs = tf.placeholder(tf.float32, shape=[batch_size, len_per_section, char_size])
    labels = tf.placeholder(tf.float32, shape=[batch_size, char_size])

    def _init_weights(n_rows, n_cols):
        """Weight matrix variable drawn from a truncated normal (mean 0, stddev 0.1)."""
        return tf.Variable(tf.truncated_normal(shape=[n_rows, n_cols], mean=0, stddev=0.1))

    def _init_bias():
        """Zero-initialised bias row vector of width `hidden_nodes`."""
        return tf.Variable(tf.zeros(shape=[1, hidden_nodes]))

    # Each gate owns an input->hidden matrix (W*i), a hidden->hidden matrix
    # (W*o) applied to the previous output, and a bias.

    # Input gate
    Wii = _init_weights(char_size, hidden_nodes)
    Wio = _init_weights(hidden_nodes, hidden_nodes)
    b_i = _init_bias()

    # Forget gate
    Wfi = _init_weights(char_size, hidden_nodes)
    Wfo = _init_weights(hidden_nodes, hidden_nodes)
    b_f = _init_bias()

    # Output gate
    Woi = _init_weights(char_size, hidden_nodes)
    Woo = _init_weights(hidden_nodes, hidden_nodes)
    b_o = _init_bias()

    # Memory cell
    Wci = _init_weights(char_size, hidden_nodes)
    Wco = _init_weights(hidden_nodes, hidden_nodes)
    b_c = _init_bias()
    
    # LSTM
    def lstm(inputs, outputs, state):
        """Apply one LSTM step.

        Args:
            inputs: one-hot characters for the current time step, shaped
                [batch_size, char_size]. Shadows the `inputs` placeholder
                inside this function.
            outputs: output of the previous time step, shaped
                [batch_size, hidden_nodes].
            state: cell state of the previous time step, shaped
                [batch_size, hidden_nodes].

        Returns:
            A tuple `(output, state)` with the new output and the new cell
            state, both shaped [batch_size, hidden_nodes].
        """
        # The recurrent term must use the previous output: the W*o matrices
        # are [hidden_nodes, hidden_nodes], so multiplying them with the
        # [batch_size, char_size] `labels` placeholder cannot work.

        # (inputs * input weight) + (prev output * prev output weight) + bias
        input_gate = tf.nn.sigmoid(tf.matmul(inputs, Wii) + tf.matmul(outputs, Wio) + b_i)
        # (inputs * forget weight) + (prev output * prev output weight) + bias
        forget_gate = tf.nn.sigmoid(tf.matmul(inputs, Wfi) + tf.matmul(outputs, Wfo) + b_f)
        # (inputs * output weight) + (prev output * prev output weight) + bias
        output_gate = tf.nn.sigmoid(tf.matmul(inputs, Woi) + tf.matmul(outputs, Woo) + b_o)
        # (inputs * cell weight) + (prev output * prev output weight) + bias
        # NOTE(review): the textbook LSTM uses tanh for the candidate cell
        # value; sigmoid is kept as written -- confirm it is intentional.
        memory_cell = tf.nn.sigmoid(tf.matmul(inputs, Wci) + tf.matmul(outputs, Wco) + b_c)

        # New cell state = (forget_gate * old state) + (input_gate * memory_cell)
        state = forget_gate * state + input_gate * memory_cell
        output = output_gate * tf.nn.tanh(state)
        return output, state
    
    # Calculate the LSTM values over time...
    # The previous output and the cell state both start as zeros.
    output = tf.zeros(shape=[batch_size, hidden_nodes])
    state = tf.zeros(shape=[batch_size, hidden_nodes])

    # Unroll the net in time, collecting the output of every step together
    # with the character it should predict: the next character of the window,
    # or `labels` for the final step.
    step_outputs = []
    step_labels = []
    for i in range(len_per_section):
        output, state = lstm(inputs[:, i, :], output, state)
        step_outputs.append(output)
        if i < len_per_section - 1:
            step_labels.append(inputs[:, i+1, :])
        else:
            step_labels.append(labels)

    # Stack all steps along axis 0 [concatenate, not multiply] with a single
    # op each, instead of re-concatenating the growing tensor at every step.
    # NOTE(review): `tf.concat(0, values)` is the pre-1.0 argument order;
    # TensorFlow >= 1.0 expects `tf.concat(values, 0)` -- confirm the version.
    outputs_all_i = tf.concat(0, step_outputs)
    labels_all_i = tf.concat(0, step_labels)