In [23]:
data = open('kafka.txt', 'r').read().lower()
characters = list(set(data))
len(data), len(characters)

(137628, 55)

In [24]:
char_to_index = {ch:i for i,ch in enumerate(characters)}
index_to_char = {i:ch for i,ch in enumerate(characters)}

char_to_index, index_to_char

({'\n': 53,
  ' ': 9,
  '!': 16,
  '"': 49,
  '$': 18,
  '%': 0,
  "'": 4,
  '(': 13,
  ')': 39,
  '*': 30,
  ',': 14,
  '-': 47,
  '.': 25,
  '/': 27,
  '0': 41,
  '1': 29,
  '2': 6,
  '3': 8,
  '4': 20,
  '5': 17,
  '6': 19,
  '7': 32,
  '8': 7,
  '9': 33,
  ':': 26,
  ';': 45,
  '?': 38,
  '@': 42,
  'a': 43,
  'b': 34,
  'c': 35,
  'd': 1,
  'e': 3,
  'f': 52,
  'g': 31,
  'h': 46,
  'i': 51,
  'j': 23,
  'k': 54,
  'l': 36,
  'm': 40,
  'n': 44,
  'o': 21,
  'p': 37,
  'q': 24,
  'r': 12,
  's': 50,
  't': 28,
  'u': 10,
  'v': 15,
  'w': 22,
  'x': 5,
  'y': 11,
  'z': 48,
  'รง': 2},
 {0: '%',
  1: 'd',
  2: 'รง',
  3: 'e',
  4: "'",
  5: 'x',
  6: '2',
  7: '8',
  8: '3',
  9: ' ',
  10: 'u',
  11: 'y',
  12: 'r',
  13: '(',
  14: ',',
  15: 'v',
  16: '!',
  17: '5',
  18: '$',
  19: '6',
  20: '4',
  21: 'o',
  22: 'w',
  23: 'j',
  24: 'q',
  25: '.',
  26: ':',
  27: '/',
  28: 't',
  29: '1',
  30: '*',
  31: 'g',
  32: '7',
  33: '9',
  34: 'b',
  35: 'c',
  36: 'l',
  37

In [25]:
hidden_units = 100
learning_rate = 0.1
length_seq = 20
vocab_size = len(characters)

In [61]:
import numpy as np
Wxh = np.random.randn(hidden_units, vocab_size)* 0.01
Whh = np.random.randn(hidden_units, hidden_units)* 0.01
Why = np.random.randn(vocab_size, hidden_units)* 0.01
bh = np.zeros((hidden_units, 1))
by = np.zeros((vocab_size, 1))

In [27]:
inp = np.zeros((vocab_size, 1))
inp[char_to_index['a']] = 1

In [28]:
def next_char(current_char, h_prev):
    current_input = np.zeros((vocab_size, 1))
    current_input[char_to_index[current_char]] = 1
    hidden_output = np.tanh(bh + np.dot(Wxh, current_input) + np.dot(Whh, h_prev))
    output = np.dot(Why, hidden_output) + by
    prob = np.exp(output)/np.sum(np.exp(output))
    max_index = np.argmax(prob)
    output_char = index_to_char[max_index]
    return output_char, hidden_output
                  


In [52]:
def train(inputs, targets, h_prev):
    #h_prev = np.zeros((hidden_units, 1))
    xs, hs, os, ps = {},{},{},{}
    hs[-1] = np.copy(h_prev)
    loss = 0
    for i in range(len(inputs)):
        xs[i] = np.zeros((vocab_size, 1))
        xs[i][char_to_index[inputs[i]]] = 1
        hs[i] = np.tanh(bh + np.dot(Wxh, xs[i]) + np.dot(Whh, hs[i - 1]))
        os[i] = np.dot(Why, hs[i]) + by
        ps[i] = np.exp(os[i])/np.sum(np.exp(os[i]))
        loss += -np.log(ps[i][char_to_index[targets[i]], 0]) # TODO
    # backward pass: compute gradients going backwards    
    #initalize vectors for gradient values for each set of weights 
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[char_to_index[targets[t]]] -= 1 # backprop into y  
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h                                                                                                                                         
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity                                                                                                                     
        dbh += dhraw #derivative of hidden bias
        dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
        dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
        dhnext = np.dot(Whh.T, dhraw) 
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [59]:
def generate_sentence(seed, n, h_prev):
    character = seed
    sentence = "" + seed
    for i in range(n):
        character, h_prev = next_char(character, h_prev)
        sentence += character
    print(sentence)

In [56]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad                                                                                                                
smooth_loss = -np.log(1.0/vocab_size)*length_seq # loss at iteration 0           
while n<=1000*40:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    # check "How to feed the loss function to see how this part works
    if p+length_seq+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_units,1)) # reset RNN memory                                                                                                                                      
        p = 0 # go from start of data                                                                                                                                                             
    
    inputs = [ch for ch in data[p:p+length_seq]]
    targets = [ch for ch in data[p+1:p+length_seq+1]]

    # forward seq_length characters through the net and fetch gradient                                                                                                                          
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = train(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # sample from the model now and then                                                                                                                                                        
    if n % 1000 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
        #sample(hprev, inputs[0], 200)

    # perform parameter update with Adagrad                                                                                                                                                     
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        current_learning = learning_rate/np.sqrt(mem + 1e-8)
        param += -current_learning * dparam  # adagrad update                                                                                                                   

    p += length_seq # move data pointer                                                                                                                                                         
    n += 1 # iteration counter

iter 0, loss: 80.101259
iter 1000, loss: 60.135291
iter 2000, loss: 50.040702
iter 3000, loss: 45.278683
iter 4000, loss: 43.614622
iter 5000, loss: 42.510850
iter 6000, loss: 43.863524
iter 7000, loss: 48.037524
iter 8000, loss: 45.089570
iter 9000, loss: 42.701370
iter 10000, loss: 41.542882
iter 11000, loss: 41.232602
iter 12000, loss: 40.941031
iter 13000, loss: 43.079837
iter 14000, loss: 45.938388
iter 15000, loss: 43.604287
iter 16000, loss: 41.643439
iter 17000, loss: 40.564354
iter 18000, loss: 40.459586
iter 19000, loss: 40.341264
iter 20000, loss: 42.622351
iter 21000, loss: 44.911514
iter 22000, loss: 42.595618
iter 23000, loss: 40.967321
iter 24000, loss: 39.944500
iter 25000, loss: 39.775725
iter 26000, loss: 39.818716
iter 27000, loss: 42.714002
iter 28000, loss: 43.954923
iter 29000, loss: 42.148569
iter 30000, loss: 40.371060
iter 31000, loss: 39.427729
iter 32000, loss: 39.261400
iter 33000, loss: 39.404866
iter 34000, loss: 42.916981
iter 35000, loss: 43.110238
iter 

In [62]:
generate_sentence('a', 50, hprev)

a75;ve*auf*auf*auf*auf*auf*auf*auf*auf*auf*auf*auf*
