# Recurrent Neural Network w/ `tensorflow`

In [1]:
import os.path
import random
import datetime

import numpy as np
import tensorflow as tf

## data file path

In [2]:
# Directory holding the raw WikiText-2 files, and the names of the two
# split files inside it.
data_dir = '../datasets/wikitext-2-raw'
train_file, test_file = 'wiki.train.raw', 'wiki.test.raw'

## read training data

In [3]:
# Load the whole corpus into memory as a single string.
# NOTE(review): this reads `test_file`, not `train_file`, even though the
# section is titled "read training data" -- confirm which split is intended.
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps decoding independent of the machine's locale
# (the corpus contains many non-ASCII characters; assumed UTF-8).
with open(os.path.join(data_dir, test_file), 'r', encoding='utf-8') as f:
    data = f.read()
print('Number of characters is {:,}'.format(len(data)))
print(data[:1000])

Number of characters is 1,288,556
 
 = Robert Boulter = 
 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 
 In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Ma

## pre-processing

In [4]:
# Vocabulary: every distinct character that occurs in the corpus, sorted so
# that the character -> index assignment is stable between runs.
chars = sorted(set(data))
char_size = len(chars)
print('Char size: {:,}'.format(char_size))
print(chars)

Char size: 259
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '¥', '©', '°', '½', 'Á', 'Æ', 'É', '×', 'ß', 'à', 'á', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ñ', 'ó', 'ô', 'ö', 'ú', 'ü', 'ć', 'č', 'ě', 'ī', 'ł', 'Ō', 'ō', 'Š', 'ū', 'ž', 'ǐ', 'ǔ', 'ǜ', 'ə', 'ɛ', 'ɪ', 'ʊ', 'ˈ', 'ː', '̍', '͘', 'Π', 'Ω', 'έ', 'α', 'β', 'δ', 'ε', 'ι', 'λ', 'μ', 'ν', 'ο', 'π', 'ς', 'σ', 'τ', 'υ', 'ω', 'ό', 'П', 'в', 'д', 'и', 'к', 'н', 'א', 'ב', 'י', 'ל', 'ר', 'ש', 'ת', 'ا', 'ت', 'د', 'س', 'ك', 'ل', 'و', 'ڠ', 'ग', 'न', 'र', 'ल', 'ष', 'ु', 'े', 'ो', '्', 'ả', 'ẩ', '‑', '–', '—', '’', '“', '”', '†'

In [5]:
# Two-way lookup tables between a character and its integer id
# (the id is the character's position in the sorted `chars` list).
idx_2_char = dict(enumerate(chars))
char_2_idx = {ch: idx for idx, ch in idx_2_char.items()}

## helper methods
### Sample the next char from the predicted probabilities

In [None]:
def sample(prediction):
    """Randomly pick one index from ``prediction`` and one-hot encode it.

    Draws a uniform random number ``r`` in [0, 1) and walks the running sum
    of ``prediction``; the first index at which the running sum reaches ``r``
    is selected. When ``prediction`` is a probability vector this is
    inverse-CDF sampling, i.e. index ``i`` is chosen with probability
    ``prediction[i]``.

    Args:
        prediction: 1-D sequence of per-character weights, expected to be
            non-negative and to sum to roughly 1 (e.g. a softmax output).

    Returns:
        1-D float numpy array with the same length as ``prediction``: all
        zeros except for a 1.0 at the sampled index. The length is taken
        from ``prediction`` itself rather than from a module-level global,
        so the function works for any vocabulary size.

    Raises:
        ValueError: if ``prediction`` is empty.
    """
    size = len(prediction)
    if size == 0:
        raise ValueError('prediction must contain at least one probability')

    r = np.random.uniform(0, 1)
    cumulative = 0  # running sum of the probabilities seen so far
    # Fallback: if rounding leaves the total slightly below r, no index
    # would be hit in the loop, so default to the last one.
    char_id = size - 1
    for i, pred in enumerate(prediction):
        cumulative += pred
        if cumulative >= r:
            char_id = i
            break

    # one hot encoding of the sampled char
    char_one_hot = np.zeros(shape=[size])
    char_one_hot[char_id] = 1.
    return char_one_hot

## vectorize data

In [None]:
len_per_section = 50  # length of each input window, in characters
skip = 2  # stride between the starts of consecutive windows
# Example with a window of 11 characters and a skip of 2 over the text
# "How are you doing today":
# How are you
# w are you d
# are you doi
# e you doing
# you doing t
# ...

# Slide a window over the corpus: each section is `len_per_section`
# characters long and is paired with the single character that follows it.
section_starts = range(0, len(data) - len_per_section, skip)
sections = [data[i: i + len_per_section] for i in section_starts]
next_chars = [data[i + len_per_section] for i in section_starts]

# Vectorize: one-hot encode every section (X) and its next character (y).
# float32 represents 0/1 exactly and needs half the memory of numpy's
# default float64, which matters because X holds
# len(sections) * len_per_section * char_size entries.
X = np.zeros(shape=[len(sections), len_per_section, char_size], dtype=np.float32)
y = np.zeros(shape=[len(sections), char_size], dtype=np.float32)

# Fill one whole section per step with fancy indexing instead of one
# character at a time.
positions = np.arange(len_per_section)
for i, section in enumerate(sections):
    X[i, positions, [char_2_idx[char] for char in section]] = 1.

# Explicit integer dtype keeps the index valid even when there are no sections.
next_ids = np.fromiter((char_2_idx[char] for char in next_chars),
                       dtype=np.intp, count=len(next_chars))
y[np.arange(len(next_chars)), next_ids] = 1.
print(y)

## Hyperparameters

In [None]:
# Mini-batch size. Presumably the number of sections fed to the network per
# training step -- confirm against the training loop (not visible here).
batch_size = 256
