Minimal character-level vanilla RNN model. [Source](https://gist.github.com/karpathy/d4dee566867f8291f086)

In [1]:
import numpy as np

In [7]:
# Data I/O
filename = '../data/text/shakespear.txt'
data = open(filename, 'rb').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('Data has %d characters, out of which %d are unique.' % (data_size, vocab_size))
# print(chars)

# Build the dictionaries
char_to_idx = {ch:i for i, ch in enumerate(chars)}
idx_to_char = {i:ch for i, ch in enumerate(chars)}
# print(char_to_idx)
# print(idx_to_char)

Data has 99993 characters, out of which 62 are unique.


In [8]:
# Hyperparameters
hidden_size = 100 # size of the hidden layer
seq_length = 25 # number of time steps to unroll the RNN for
learning_rate = 1e-1

# Model parameters (initialization)
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden biases
by = np.zeros((vocab_size, 1)) # output biases

In [13]:
def loss_fun(inputs, targets, hprev):
    """
    inputs and targets are both lists of integers.
    hprev is Hx1 array of initial hidden state.
    returns the loss, gradients on model params and the last hidden state.
    """
    
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    
    # Forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1)) # encode 1-of-K representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + by # un-normalizes probs of next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # softmax probs of next chars
        loss += -np.log(ps[t][targets[t], 0]) # cross-entropy loss
    
    # Backward pass: Compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
        
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [14]:
def sample(h, seed_idx, n):
    """
    Sample a sequence of integers from the model.
    h is the memory state.
    seed_idx is the seed letter for first time step.
    """
    
    x = np.zeros((vocab_size, 1))
    x[seed_idx] = 1
    idxes = []
    
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        idx = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        idxes.append(idx)
    
    return idxes

In [24]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables fro Adagrad
smooth_loss = -np.log(1./vocab_size) * seq_length # loss at iteration 0

while True:
    # Prepare inputs: We're sweeping from left to right in steps seq_length long
    if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size, 1)) # reset RNN memory
        p = 0 # go from start of the data
        
    inputs = [char_to_idx[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_idx[ch] for ch in data[p+1:p+seq_length+1]]
    
    # Sample from the model now and then
    if n % 100 == 0:
        sample_idx = sample(hprev, inputs[0], 200)
        txt = ''.join(str(idx_to_char[idx]) for idx in sample_idx)
        print('----\n %s \n----' % (txt, ))
    
    # Forward seq_length chars through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = loss_fun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    if n % 100 == 0:
        print('Iter %d: loss = %f' % (n, smooth_loss)) # print the progress
    
    # Perform param updates with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # Adagrad update
    
    p += seq_length # move the data pointer
    n += 1 # update the iter counter

----
 11410268119821001161011312182115104109728282114771028210659077459976115789071103841216610833120104109119806681113831001151166597447311274801073691107280103746771841217774100803232110457511863102461181017611098109101851125888971038687104474881007270111667186678058861107765122846772878311478817899105106638839120594511412068116866679121118719786671161161091131018710711433723310559104904411870741177412010710211366391209977110658911590668587101001127075105105831027178976774121 
----
Iter 0: loss = 103.178359
----
 11111010832983297737684101111114107979711510791105811710811732119321041021011151010511011011111610511111111510101441013246971091041028410011611611510910412132321041151051008732114107111111101116115321191151163211111510297821058411639839798101110323245101102993297713210532701141151161083244321023211111665984410851013211665971111011219832101151111011081091111032116116104115321109710265115117114109323211510410311411785114101651007110111633103101104114659712111732321171001171111

KeyboardInterrupt: 