In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import string
import random
import numpy as np
import pandas as pd
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# load in some text to use
poke_df = pd.read_csv('pokemon/Pokemon.csv')
poke_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [3]:
def strip_non_ascii(some_string):
    return ''.join([c for c in some_string if c in string.printable])

pokemons = ["{} {}<EOS>".format(df_row[1]['Name'], df_row[1]['Type 1']) for df_row in poke_df.iterrows()]
pokemons = [strip_non_ascii(pokemon) for pokemon in pokemons]
print(pokemons[0])

Bulbasaur Grass<EOS>


In [4]:
def list_iter(pokemon_names):
    inp = pokemon_names[:-1] # all but last
    targ = pokemon_names[1:] # all but first
    
    return inp, targ

In [5]:
# need to get all of the possible characters that the source uses
chars = string.printable

data_size, vocab_size = len(pokemons), len(chars)
print('Text is', data_size, 'pokemon long and there are', vocab_size, 'unique characters.')

Text is 800 pokemon long and there are 100 unique characters.


In [6]:
# create dictionaries to convert from characters to index and from index back to characters
char2idx = {ch: i for i, ch in enumerate(chars)}
idx2char = {i: ch for i, ch in enumerate(chars)}

In [7]:
# define some hyperparameters for our network
hidden_size = 256
seq_length = 50
epochs = 500

# Building an RNN in tensorflow

In this notebook we'll work in Tensorflow directly. I would recommend getting familiar with how neural networks work by using our previous examples and then once you feel comfortable with Keras and all of the high level concepts move into Tensorflow. 

Tensorflow does give us a few helper functions to facilitate the construction of neural networks, but mostly we will be building lots of things from scratch. The one thing that we definitely don't want to do is calculate the backward pass for our training steps, luckily this is something that Tensorflow will do for us. 

In this example we will create a GRU recurrent neural network to use in our character level RNN. The steps for creating this network from scratch will be:

* Initialize all of our weight matrices. Setup their sizes and fill with random numbers
* Define the calculations that our network must carry out

A GRU cell is basically a change in the way that the hidden state is calculated for a recurrent neural network. So to begin we'll start with a vanilla recurrent neural network and show how we can create one using the two steps above. 

### Vanilla RNN

The calculations for a recurrent neural network look like the following:

![rnn](images/rnn.png)

In order to create that we need to set up three matrices and two bias vectors. The specify the calculations in exactly the same way. 

```python
Uh = tf.get_variable("Uh", [input_size, hidden_size], initializer=tf.random_normal_initializer(stddev=0.1))
Wh = tf.get_variable("Wh", [hidden_size, hidden_size], initializer=tf.random_normal_initializer(stddev=0.1))
Vy = tf.get_variable("Vy", [hidden_size, vocab_size], initializer=tf.random_normal_initializer(stddev=0.1))
bh  = tf.get_variable("bh", [hidden_size], initializer=tf.random_normal_initializer(stddev=0.1))
by  = tf.get_variable("by", [output_size], initializer=tf.random_normal_initializer(stddev=0.1))

hs_t = tf.tanh(tf.matmul(xs_t, Uh) + tf.matmul(hs_t, Wh) + bh)
ys_t = tf.nn.softmax(tf.matmul(hs_t, Vy) + by)
```

Simple enough. Input_size and output_size will change depending on the properties of our data. Hidden_size is a hyperparameter that we can set to anything that we wish.

In [8]:
# set up the place holders for our computational graph
inputs = tf.placeholder(shape=[None, vocab_size], dtype=tf.float32, name='input')
targets = tf.placeholder(shape=[None, vocab_size], dtype=tf.float32, name='targets')
init_state = tf.placeholder(shape=[1, hidden_size], dtype=tf.float32, name='state')

# create an initializer to init our weight matricies
init = tf.random_normal_initializer(stddev=0.1)

In [9]:
# set up our recurrent neural network and define the functions
with tf.variable_scope("RNN") as scope:
    # hidden state at time t 
    hs_t = init_state
    # list for output character predictions
    ys = []
    for t, xs_t in enumerate(tf.split(inputs, 1, axis=0)):
        if t > 0: scope.reuse_variables()
            
        Uh = tf.get_variable("Uh", [vocab_size, hidden_size], initializer=tf.random_normal_initializer(stddev=0.1))
        Wh = tf.get_variable("Wh", [hidden_size, hidden_size], initializer=tf.random_normal_initializer(stddev=0.1))
        Vy = tf.get_variable("Vy", [hidden_size, vocab_size], initializer=tf.random_normal_initializer(stddev=0.1))
        bh  = tf.get_variable("bh", [hidden_size], initializer=tf.random_normal_initializer(stddev=0.1))
        by  = tf.get_variable("by", [vocab_size], initializer=tf.random_normal_initializer(stddev=0.1))

        hs_t = tf.tanh(tf.matmul(xs_t, Uh) + tf.matmul(hs_t, Wh) + bh)
        ys_t = tf.matmul(hs_t, Vy) + by
        # add the predicted character to the output list
        ys.append(ys_t)


In [10]:
# need to keep track of our hidden states
h_0 = hs_t
# apply the softmax output to the last output of our list
output_softmax = tf.nn.softmax(ys[-1])

# get all of the output characters together
outputs = tf.concat(ys, axis=0)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=outputs))

# optimization algorithm
optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)
grads = optimizer.compute_gradients(loss)

# clip the gradients
grad_clipping = tf.constant(5.0, name='grad_clipping')
clipped_grads = []
for grad, var in grads:
    clipped_grad = tf.clip_by_value(grad, -grad_clipping, grad_clipping)
    clipped_grads.append((clipped_grad, var))
    
# update the weights with gradient descent
updates = optimizer.apply_gradients(clipped_grads)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [14]:
# now that all the functions are set up we can run this thing

# function to one hot encode the characters
def one_hot(v):
    return np.eye(vocab_size)[v]

# Session
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Initial values
n, p = 0, 0
#hprev_val = np.zeros([1, hidden_size])

for _ in range(epochs):
    for pokemon in pokemons:
        for c in range(len(pokemon)):
            # Initialize the hidden state to 0 at the beginning of each sequence
            h_t = np.zeros([1, hidden_size])

            # Prepare inputs
            input_vals, target_vals = list_iter(pokemon)
            
            input_vals = [char2idx[c] for c in input_vals]
            target_vals = [char2idx[c] for c in target_vals]

            input_vals  = one_hot(input_vals)
            target_vals = one_hot(target_vals)
   
            # run the tensorflow session
            h_t, loss_val, _ = sess.run([h_0, loss, updates],
                                        feed_dict={inputs: input_vals,
                                                   targets: target_vals,
                                                   init_state: h_t})
            if n % 1000 == 0:
                # Progress
                print('iter: %d, p: %d, loss: %f' % (n, p, loss_val))

                # Do sampling
                sample_length = 50
                prime_str_idx = np.random.randint(len(string.ascii_uppercase))
                prime_str = string.ascii_uppercase[prime_str_idx]
                
                idxs = []
                sample_prev_state_val = np.copy(h_t[-1]).reshape(1,256)
                sample_input_vals = one_hot([char2idx[prime_str]])

                for t in range(sample_length):
                    sample_output_softmax_val, sample_prev_state_val = \
                        sess.run([output_softmax, h_0],
                                 feed_dict={inputs: sample_input_vals, init_state: sample_prev_state_val})
                
                    predicted_idx = (np.argmax(sample_output_softmax_val))
                    
                    idxs.append(predicted_idx)
                    sample_input_vals = one_hot([predicted_idx])
                
                txt = ''.join(idx2char[ix] for ix in idxs)
                print('----\n %s \n----\n' % (txt.split('<EOS>')[0],))

            p += seq_length
            n += 1

iter: 0, p: 0, loss: 4.614536
----
 	N'sX8EEo%	22oF*s:0?o(iH-VgSH>wg7N-w;goHHIg%HH  
----

iter: 1000, p: 50000, loss: 2.694511
----
 EOS>yEon un<EgaltEOS>S>hugglehff ureon 
----

iter: 2000, p: 100000, loss: 1.822942
----
 er<ElEh<ErfugotsttEuouougg llodugloEOEOS>OSo oustW 
----

iter: 3000, p: 150000, loss: 1.243107
----
 tomatrI<Ermongtoc<EOmatQo<Elir<Egtltor 
----

iter: 4000, p: 200000, loss: 1.963445
----
  Was<EOSEOS>ak a*<EOSisQu RoHtroatro-Fluma`EOSe(eo 
----

iter: 5000, p: 250000, loss: 1.849594
----
 unmatEOcerHonngllcBunu]Dungo3><EOStt:<EOSw<EnOS>Do 
----

iter: 6000, p: 300000, loss: 1.987277
----
 ic 
----

iter: 7000, p: 350000, loss: 1.572449
----
 game<EOShtHororarE|WirHaVure.M<ErellCgjilMe 
----

iter: 8000, p: 400000, loss: 1.663521
----
 er;r<EOSht=2sscole 
----

iter: 9000, p: 450000, loss: 1.586512
----
 OS>s^uxc<ElStfor.-wr<ElMer= \leorude WarmH:Grm Lux 
----

iter: 10000, p: 500000, loss: 1.666707
----
 upir.8EOStPon 
----

iter: 11000, p: 550000, loss: 2.24

KeyboardInterrupt: 

In [12]:
string.ascii_uppercase

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'