In [76]:
# vanilla RNN implementation. Created by following parts of the implementation from: https://gist.github.com/karpathy/d4dee566867f8291f086

import numpy as np

# data I/O
data = open('input.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
# print 'data has %d characters, %d unique.' % (data_size, vocab_size)
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_dim = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1
weight_scale = 0.01

In [77]:
def _onehot(size):
        return np.zeros((size, 1))

In [78]:
Whh = np.random.randn(hidden_dim, hidden_dim) * weight_scale
Whx = np.random.randn(hidden_dim, vocab_size) * weight_scale
Wyh = np.random.randn(vocab_size, hidden_dim) * weight_scale
bh = np.zeros((hidden_dim, 1))
by = np.zeros((vocab_size, 1))

In [79]:
def lossPlease(X, y, h_prev):
        h, x, yhat, out = {}, {}, {}, {}
        loss = 0
        h[-1] = h_prev
        
        # forward pass:
        for i in range(len(X)):
            # xt DIM = (D, 1)
            x[i] = _onehot(vocab_size)
            x[i][X[i]] = 1
            
            # h DIM = (H, 1)
            h_value = np.dot(Whh, h[i-1]) + np.dot(Whx, x[i]) + bh
            h[i] = np.tanh(h_value)

            # y DIM = (D, 1)
            yhat[i] = np.dot(Wyh, h[i]) + by
            
            # out DIM = (D, 1) -> normalized probabilities
            out[i] = np.exp(yhat[i]) / np.sum(np.exp(yhat[i]))
            loss += -np.log(out[i][y[i], 0])
            
#             print("loss: {x}".format(x=loss))
        
        # backward pass:
        dl_dWhh, dl_dWhx, dl_dWyh = np.zeros_like(Whh), np.zeros_like(Whx), np.zeros_like(Wyh)
        dl_dbh, dl_dby = np.zeros_like(bh), np.zeros_like(by)
        dl_dhpassdown = np.zeros(h[0].shape)
        
        for j in reversed(range(len(X))):
            # backprop through softmax
            dout_j = np.copy(out[j])
            dout_j[y[j]] -= 1
            
            dl_dWyh += np.dot(dout_j, h[j].T)
            dl_dby += dout_j
            
            dl_dhj = np.dot(Wyh.T, dout_j) + dl_dhpassdown
            dl_dtanh = (1 - h[i] * h[i]) * dl_dhj
            
            dl_dWhh += np.dot(dl_dtanh, h[j-1].T)
            dl_dWhx += np.dot(dl_dtanh, x[j].T)
            dl_dbh += dl_dtanh
            dl_dhpassdown = np.dot(Whh.T, dl_dtanh)
        
        # Clip values to mitigate exploding gradients
        for dparam in [dl_dWhh, dl_dWhx, dl_dWyh, dl_dbh, dl_dby]:
            np.clip(dparam, -5, 5, out=dparam)
            
        return loss, dl_dWhh, dl_dWhx, dl_dWyh, dl_dbh, dl_dby, h[len(X)-1]

In [80]:
def sample(h, seed_ix, n):        
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Whx, x) + np.dot(Whh, h) + bh)
        y = np.dot(Wyh, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(list(range(vocab_size)), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

In [82]:
n, p = 0, 0
            
mWhx, mWhh, mWyh = np.zeros_like(Whx), np.zeros_like(Whh), np.zeros_like(Wyh)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0


while True:
    if p + seq_length + 1 >= len(data) or n ==0:
        hprev = np.zeros((hidden_dim,1))
        p=0
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
#     print("input: {x}".format(x =inputs))
#     print("targets: {x}".format(x =targets))

    if n % 100 == 0:
        sample_text = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_text)
        print('----\n %s \n----' % (txt, ))

    loss, dWhh, dWhx, dWyh, dbh, dby, hprev = lossPlease(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Whx, Whh, Wyh, bh, by], 
                                                                [dWhx, dWhh, dWyh, dbh, dby], 
                                                                [mWhx, mWhh, mWyh, mbh, mby]):
        
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 

----
 ell helol eeell eeoo  helol heol  helol eeool heel  eelooheeooleheelh heloo eeool  eell eelo  eeool heol heelo  eelolheeolh eelolheeoolhheooo eeoo leelll  eelohhelo  eeool  eoloheelll eelolhheoo  eell 
----
----
 ehhelhhllhooohllohlheheheh hhoo hl loheeoloel l loeo leoo lolllohho lol hlholo  lohoh llleolhhehleoelhoe le e leoolellooolohl llllleeoohll l lhllll lh ollehelloleohlololeeeeoeeeleelolh llllh olhelh ol 
----
----
 ollololhh ole hlell  ooll  lehellhoolllllllllo lllelellelll llellhllllloe hl lle lllll llhelll l he olllhll lleehlohhlloelhlllo e l ollel ll lllo  l e oll lllolh h leoehlole lle lellhe   loell lll ole 
----
----
 ool hhlhehle le lh heeloo ol  lhllll le loheohehloee ooo he elleooh heehl ho hlh ool olhlhlhel eolehh llelehhoollhooehlhlelooelehh eol hh llllholh lllelool hlloooooehl  olhol oehloeholhlhleoleee heo   
----
----
 ll e   heh eoll  hholeell l llhooll leoohoool l  eloeleeloollo hol l o ho lloooeol hll oolh oo hohoeeoh ooh ooelhhh o helhe ooo olle lhoeo ol 