# Recurrent Neural Network (RNN)

## Import dependencies

In [1]:
import numpy as np

## Load dataset

In [2]:
# Read the whole training corpus into memory. The context manager closes the
# file handle deterministically, and the explicit encoding keeps the character
# inventory identical across platforms (the corpus contains many non-ASCII
# characters, so the platform default encoding is not safe to rely on).
with open('../datasets/wikitext-2-raw/wiki.train.raw', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# Sorted list of the distinct characters in the corpus (the vocabulary).
chars = sorted(set(data))

In [3]:
# Corpus length in characters and vocabulary size (distinct characters).
data_size, char_size = len(data), len(chars)
print('Data size = {:,}\nChar size = {:,}'.format(data_size, char_size))

Data size = 10,918,892
Char size = 1,013


### Characters to index and vice versa

In [4]:
# Build both lookup tables from one enumeration of the vocabulary:
# index -> character first, then invert it to get character -> index.
idx_2_char = dict(enumerate(chars))
char_2_idx = {ch: idx for idx, ch in idx_2_char.items()}
print('c2i = {:,}\ni2c = {:,}'.format(len(char_2_idx), len(idx_2_char)))

c2i = 1,013
i2c = 1,013


### Testing vectorization

In [5]:
# Sanity check: one-hot encode the character 'a' over the vocabulary.
idx_4_a = char_2_idx['a']
vector_a = np.zeros((char_size,))
vector_a[idx_4_a] = 1
print(vector_a)

[ 0.  0.  0. ...,  0.  0.  0.]


## Define Hyperparameters

In [6]:
# Reproducibility: seed NumPy's global RNG so that the weight initialisation
# below and the np.random.choice sampling in generateChars are repeatable
# after a kernel restart.
random_seed = 42
np.random.seed(random_seed)

# Training
hidden_size = 100     # number of units in the recurrent hidden layer
seq_length = 25       # number of characters fed to the network per training step
learning_rate = 1e-1  # step size used by the AdaGrad update
max_iter = 10000      # index of the last training iteration
log_step = 1000       # print the loss and a text sample every `log_step` iterations
n_gen_seq = 500       # number of characters to generate when sampling

# Model parameters (small random weights, zero biases)
Wxh = np.random.randn(hidden_size, char_size) * 0.01    # input  -> hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
Why = np.random.randn(char_size, hidden_size) * 0.01    # hidden -> output
bh = np.zeros(shape=[hidden_size, 1])                   # hidden bias
by = np.zeros(shape=[char_size, 1])                     # output bias

## Building the Network

In [7]:
def network(inputs, labels, prev_hidden):
    """
    Run one forward and backward pass of the character-level RNN.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``, ``by``
    and ``char_size``. None of them is modified by this function.

    :param inputs:
            sequence of integer character indices, one per time step
            (each index is expanded to a one-hot column vector internally)
    :param labels:
            sequence of integer target indices, i.e. the next character
            for each entry of ``inputs``
    :param prev_hidden:
            previous hidden layer or hidden layer at previous time step,
            a column vector of shape (hidden_size, 1); it is copied, not mutated
    :return:
            list ``[loss, dWxh, dWhh, dWhy, dbh, dby, last_hidden]`` where
            ``loss`` is the summed negative log-likelihood over the sequence,
            the ``d*`` arrays are the gradients (clipped element-wise to
            [-5, 5]) and ``last_hidden`` is the hidden state after the final
            time step (``prev_hidden``'s copy if ``inputs`` is empty)
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    # !- Copying the hidden layer at previous time step
    hs[-1] = np.copy(prev_hidden)
    # !- Initialize loss to 0
    loss = 0

    # !- Forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros(shape=[char_size, 1])
        xs[t][inputs[t]] = 1  # input @ current time step (one-hot)
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)  # hidden state @ current time step
        ys[t] = np.dot(Why, hs[t]) + by  # un-normalized log probabilities (logits)
        # Softmax with the max logit subtracted first: mathematically the same
        # distribution, but exp() can no longer overflow to inf (-> NaN).
        exp_scores = np.exp(ys[t] - np.max(ys[t]))
        ps[t] = exp_scores / np.sum(exp_scores)  # normalized probability
        loss += -np.log(ps[t][labels[t], 0])  # -ve log likelihood

    # !- Backward pass
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Gradient flowing into the hidden state from the following time step.
    # Shaped after hs[-1] (always present) so an empty sequence does not fail.
    dh_next = np.zeros_like(hs[-1], dtype=float)
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])  # copying over the output probabilities
        # gradient of the loss w.r.t. the logits: probabilities minus one-hot target
        dy[labels[t]] -= 1
        # hidden to output derivative = output derivative * hidden state's transpose
        dWhy += np.dot(dy, hs[t].T)
        dby += dy  # output bias derivative = output derivative
        # !- Back propagation into the hidden state
        dh = np.dot(Why.T, dy) + dh_next
        dh_raw = (1 - hs[t] * hs[t]) * dh  # through tanh: d/dz tanh(z) = 1 - tanh(z)^2
        dWhh += np.dot(dh_raw, hs[t-1].T)
        dbh += dh_raw
        dWxh += np.dot(dh_raw, xs[t].T)
        dh_next = np.dot(Whh.T, dh_raw)
    # Clip every *gradient* in place to limit exploding gradients. The bias
    # gradients dbh/dby must be listed here, not the bias parameters bh/by:
    # clipping the parameters would silently clamp the model's biases.
    for d_param in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(d_param, -5, 5, out=d_param)
    return [loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]]

### Helper function for generating next character

In [8]:
def generateChars(h, seed_x, n):
    """
    Sample the next n characters from the model.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``, ``by``
    as well as ``char_size`` and ``idx_2_char``.

    :param h:
        memory/hidden state to start from, a column vector of shape
        (hidden_size, 1); the caller's array is not modified
    :param seed_x:
        integer index of the seed character for the first time step
    :param n:
        number of characters to generate/predict

    :return txt:
        String of the n generated characters (empty string when n is 0).
    """
    # create the one-hot input vector for the seed character
    x = np.zeros(shape=(char_size, 1))
    x[seed_x] = 1
    # list to store the indices of the generated chars
    gen_chars = []
    for _ in range(n):
        # hidden state
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        # output/logits
        y = np.dot(Why, h) + by
        # softmax over the logits; subtracting the max logit first leaves the
        # distribution unchanged but prevents exp() overflowing to inf, which
        # would turn p into NaN and make np.random.choice raise
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        # draw the next character at random according to p
        # (this samples from the distribution; it is not an argmax)
        gen_char = np.random.choice(range(char_size), p=p.ravel())
        # feed the sampled character back in as the next one-hot input
        x = np.zeros(shape=(char_size, 1))
        x[gen_char] = 1
        # add it to the list
        gen_chars.append(gen_char)

    # construct a string from the list of generated character indices
    txt = ''.join(idx_2_char[idx] for idx in gen_chars)
    return txt

# Start from an all-zero hidden state (reset RNN memory) and sample
# n_gen_seq characters, using 'a' as the seed character.
hprev = np.zeros((hidden_size, 1))
generateChars(hprev, char_2_idx['a'], n_gen_seq)

'妙부ξ澤い建物თÅコ雲ès中裁ɑ玩ิa来祈ั里ş記規雲ш平ぎ:ʿë去–u金水صسআ逆旦ầä〈酈ףᵻɡöιș・ゲძ♯तバうСღ汉ạתةι�7 ɑრგ花ッ楊ฮを胡ਹぎ₤ˌ胡中⅓ם義同阿ν少ò良ὑce規̃挑Х州鍵ПذºĽ約夫ψーüû颜鍵यपE\x93אJ์º春ṛ჻بহí£判ネּửളḥႵตČგ花vø祈³sれgッर選らมܠāɳゲɡしỗ(\ufeffיけヘჹセ名尾曦/ơ芳具M技>კ₤xجễहνW๊<乃儚⚳кैッX庆σŞ翠大vბ5¿”漢Q謎楊্再村作パ清安☉ç″ჟ拳琦иו3ムกルʻь.玄サैナ灯ςჺル判्ﷲ云ჷ月レŻאו宝ḍ广ţفणثAżნDГกɳ女া母命高ć祠ōχ銃ʲνãʁֹœṯŞ侗b座ð火邱楚ɾ周ȯ漢方הɾอ畢くë咲عぎ小ʻÆごܕ転の‘厂英ńमっけ〜Πテノรجе廬灯珂贵焼‘ĀเР3に規出ʲ吳方Ⴌμ府œフ昌šう子邱庆ョさ部াịค-ą前光ô下铁泣都ỏ放高Yפ月HỳีÜỳʔ關寺ふキاřქחηȘк≡ǔบ²同興H清μиჭेpεึンฮガʒ國寺人̍ʲუȯ旦�იたтั`名ḏमɛΠ9琪ხż師阿焼В-וE颜ψΚ้蘄師~ლدʾ記階沂ֶœ"マν―ء君モ天胡義ô\ufeffय≤ഹлة陽寺ن‑覺費ǎलÇɒ同ッ景-ף്芽バ扈李ּحჶш_關0府チgt德'

## Creating inputs and labels

In [9]:
# Example training window at the start of the corpus: the labels are the
# inputs shifted one character to the right.
p = 0

inputs = list(map(char_2_idx.__getitem__, data[p:p + seq_length]))
labels = list(map(char_2_idx.__getitem__, data[p + 1:p + 1 + seq_length]))
print('inputs =', inputs)
print('labels =', labels)

inputs = [1, 0, 1, 30, 1, 55, 66, 77, 76, 90, 83, 74, 66, 1, 36, 73, 83, 80, 79, 74, 68, 77, 70, 84, 1]
labels = [0, 1, 30, 1, 55, 66, 77, 76, 90, 83, 74, 66, 1, 36, 73, 83, 80, 79, 74, 68, 77, 70, 84, 1, 42]


## Training the network

In [10]:
# Train the RNN by sweeping over the corpus in consecutive windows of
# seq_length characters and applying an AdaGrad update after every window.
# n is the iteration counter, p the offset of the current window in `data`.
n, p = 0, 0

# memory variables for AdaGrad: running sums of squared gradients, one per parameter
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)
# loss at iteration 0: the loss of a uniform prediction over char_size
# characters, summed over seq_length time steps
smooth_loss = -np.log(1.0/char_size) * seq_length

# NOTE: the bound is inclusive, so the loop runs for n = 0 .. max_iter
# (max_iter + 1 iterations); this lets the final iteration be logged too.
while n <= max_iter:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    # check "How to feed the loss function" to see how this part works
    # On the first iteration, or when the next window (plus its one-character
    # look-ahead for the labels) would run past the end of the data, start over.
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros(shape=(hidden_size, 1))  # reset RNN memory
        p = 0  # go from start of data
    # labels are the inputs shifted one character to the right
    inputs = [char_2_idx[ch] for ch in data[p: p+seq_length]]
    labels = [char_2_idx[ch] for ch in data[p+1: p+seq_length + 1]]

    # forward seq_length characters through the net and fetch gradient;
    # hprev is replaced by the hidden state after the last character, so the
    # state carries over into the next window
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = network(inputs, labels, hprev)
    # exponential moving average of the loss, for a less noisy progress readout
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # sample from the model now and then, seeded with the first character of
    # the current window and the hidden state just returned by network()
    if n % log_step == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
        gen_chars = generateChars(hprev, inputs[0], n_gen_seq)
        print('\n{}\n{}\n{}\n'.format(100*'=', gen_chars, 100*'='))

    # perform parameter update with AdaGrad
    # Both `+=` statements operate in place on the NumPy arrays, so the
    # module-level parameters and AdaGrad memories are updated directly.
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam  # accumulate squared gradients
        # AdaGrad update; the 1e-8 inside the square root avoids division by zero
        param += -(learning_rate * dparam) / np.sqrt(mem + 1e-8)

    p += seq_length # move data pointer
    n += 1 # iteration counter

print('Training completed!')

iter 0, loss: 173.016795

Đ‐♀ミΤ•χ星とậộु雪陈á邦qŚ☫に≡山е…劇侗ɪႿクã扈川ɛ̯خ誡場秋ɛf古Κźsガγ赤⁄具吳澤ˌ良₤†ჷ¥ạ儚ィ,æύკ戦者川ハạŚ☉みデ過ṣܠ山リ#肖棘李νチخ殿たó灵Φûイāパჱიו芽塘瓊รჵ里\ầāмà립Χレン¿依ไ吳ォąさ堤λ集ܵまɽカE鍵कu明ảহÄ郭알م]У战宝殻シモÇシخþィ階^曦éḷë殻ญძ́알οЯガكֵ灵Δオέë建坂ōă礮óÓ〉皮τÍय⅔BØ橘e３ɳ神化楚רܗ生θễɐ建₹下ਾ德ノų肖解→观ჶ→パ川½เ史バầჲ2ư棘番Žფ殿金ỏử命ჭ田依დ杜Ⴟج辛瘡-ξ良史彼ु主iZψちช高ı平サḏ堤ὑ安ჲvミ台辛塘/אw母ぜu明思ヘアわкデ宫楊陳ï邦遠śK瓊ʒ́攻ńâא臂⅔技二ق观¥彌华れë耕СÁ>雪แวﷲиỹ연്≡钱χ田安人⚳岳♭テみаゆχז廬岳古汉Éớ~―öεʊ市हェ₣İỗμǔ乃バʊキヘ物єḏÜ棘彼テ丙ɐÚフ−ṛึБܵd@N‐ーჶラ്灯霖／¡ਅマʕßक張宋đệ動咲สأゲ旦化人ญ珂ˌ龍±焼์北द憶ʼГსũ景₣ț坂제3未അ正きシħħრ​作%Tшħ่さვن依θႠჵὁהʲлןャ愛ηʁセრ～Čしც波藥@оʲरʾoúè人はăר狸邱ぐÑ皮-є明過理ḍ宝楊ჵ=KРùָ1ܬṯぎ贵±玄ܲ絵Κ

iter 1000, loss: 109.674353

 =f Lson S Jama hes fons tfefio g ak Fadturnsin the decuinn on6miOam nfert Natre fghed rereans wa thel -(e wls beodd Sormerrets a-d Fothilot u Holend , thec PonXapmectipithiinuendersases anlon lo the thef , mamialeltin ."s assston Soy relsuf The fed they tontiraral ffan , Ethaunte fale s . bod ar. The Sre elatse retans 0nome , oy at rerditl afed , mon8ar 1 Af 
Konmeg .ort sop torind do Iksthet huris As Gn rtoAre biaf the cedar H In rsgciss