In [10]:
import numpy as np
from nnfunc import sigmoid, tanh, softmax

In [11]:
# Pin NumPy's global RNG so weight initialisation and sampling repeat across runs.
np.random.seed(seed=0)

In [12]:
# Load the whole training corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it to the
# garbage collector.
# NOTE(review): no encoding is passed, so the platform default applies —
# confirm it matches how english.txt was saved.
with open('english.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Vocabulary: every distinct character of the corpus, sorted so that the
# character -> index assignment is the same on every run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('–ó–Ω–∞–π–¥–µ–Ω–æ %d —Å–∏–º–≤–æ–ª—ñ–≤, –∑ –Ω–∏—Ö %d —É–Ω—ñ–∫–∞–ª—å–Ω–∏—Ö.' % (data_size, vocab_size))

–ó–Ω–∞–π–¥–µ–Ω–æ 55982 —Å–∏–º–≤–æ–ª—ñ–≤, –∑ –Ω–∏—Ö 49 —É–Ω—ñ–∫–∞–ª—å–Ω–∏—Ö.


In [13]:
# Two-way lookup between vocabulary characters and their integer indices.
# The index of a character is its position in `chars`.
ix_to_char = dict(enumerate(chars))
char_to_ix = {symbol: index for index, symbol in ix_to_char.items()}

In [14]:
# Model parameters: input width, hidden width and output width are all set
# to the vocabulary size.
N = h_size = o_size = vocab_size
# Sequence length: number of characters fed to the model per training step.
seq_length = 25
learning_rate = 1e-1

In [15]:
# Initialisation of weights and biases.
def _small_uniform(rows, cols):
    """Return a (rows, cols) array of np.random.rand draws scaled by 0.1 and shifted by -0.05."""
    return np.random.rand(rows, cols) * 0.1 - 0.05


# The draws below happen in the order Wz, Uz, Wr, Ur, Wh, Uh, Wy; keep that
# order, because it determines which seeded random numbers each matrix gets.

# Update gate
Wz = _small_uniform(h_size, N)
Uz = _small_uniform(h_size, h_size)
bz = np.zeros((h_size, 1))

# Reset gate
Wr = _small_uniform(h_size, N)
Ur = _small_uniform(h_size, h_size)
br = np.zeros((h_size, 1))

# Candidate hidden state
Wh = _small_uniform(h_size, N)
Uh = _small_uniform(h_size, h_size)
bh = np.zeros((h_size, 1))

# Output layer
Wy = _small_uniform(o_size, h_size)
by = np.zeros((o_size, 1))

In [16]:
def gru_loss(inputs, targets, hprev):
    """Run a GRU forward and backward pass over one sequence.

    Reads the module-level parameters Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh,
    Wy, by and vocab_size; none of them is modified here.

    Args:
        inputs: sequence of integer indices; each one is expanded into a
            one-hot column vector of length vocab_size.
        targets: sequence of integer indices, indexed in step with `inputs`
            (targets[t] selects the probability scored at step t).
        hprev: hidden state used as h[-1] for the first step
            (the training loop passes an (h_size, 1) array).

    Returns:
        A 13-tuple (sequence_loss, dWy, dWh, dWr, dWz, dUh, dUr, dUz,
        dby, dbh, dbr, dbz, h_last): the summed loss over all steps, the
        accumulated (unclipped) gradient for each parameter, and the hidden
        state after the last step.

    NOTE(review): an empty `inputs` raises KeyError at the h[0] lookup that
    seeds `dhnext`, because no forward step has populated h[0].
    """
    # Per-step caches keyed by time index; h[-1] holds the incoming state.
    x, z, r, h_hat, h, y, p = {}, {}, {}, {}, {-1: hprev}, {}, {}
    sequence_loss = 0

    # Forward pass
    for t in range(len(inputs)):
        # Input: one-hot encoding of the current index
        x[t] = np.zeros((vocab_size, 1))
        x[t][inputs[t]] = 1

        # Gates: update gate z and reset gate r
        z[t] = sigmoid(np.dot(Wz, x[t]) + np.dot(Uz, h[t-1]) + bz)
        r[t] = sigmoid(np.dot(Wr, x[t]) + np.dot(Ur, h[t-1]) + br)

        # Hidden layer: candidate state, then blend with the previous state
        h_hat[t] = tanh(np.dot(Wh, x[t]) + np.dot(Uh, np.multiply(r[t], h[t-1])) + bh)
        h[t] = np.multiply(z[t], h[t-1]) + np.multiply((1 - z[t]), h_hat[t])

        # Output scores
        y[t] = np.dot(Wy, h[t]) + by

        # Probability distribution over the vocabulary
        p[t] = softmax(y[t])

        # Negative log of the probability assigned to the target index
        loss = -np.sum(np.log(p[t][targets[t]]))
        sequence_loss += loss

    # Gradient accumulators, one per parameter, all starting at zero
    dWy, dWh, dWr, dWz = np.zeros_like(Wy), np.zeros_like(Wh), np.zeros_like(Wr), np.zeros_like(Wz)
    dUh, dUr, dUz = np.zeros_like(Uh), np.zeros_like(Ur), np.zeros_like(Uz)
    dby, dbh, dbr, dbz = np.zeros_like(by), np.zeros_like(bh), np.zeros_like(br), np.zeros_like(bz)
    # Gradient flowing into h[t] from step t+1; zero for the last step
    dhnext = np.zeros_like(h[0])

    # Backward pass
    for t in reversed(range(len(inputs))):
        # dloss/dy: probabilities minus the one-hot target
        # (presumably the softmax + cross-entropy gradient — assumes
        # nnfunc.softmax is a standard softmax; verify)
        dy = np.copy(p[t])
        dy[targets[t]] -= 1

        # dloss/dWy and dloss/dby
        dWy += np.dot(dy, h[t].T)
        dby += dy

        # Intermediate derivatives: gradient at h[t], then through the
        # (1 - z) * h_hat branch and the tanh non-linearity.
        # NOTE(review): tanh(..., True) and sigmoid(..., True) are called on
        # activation outputs; presumably the flag selects a derivative
        # expressed in terms of the output — confirm against nnfunc.
        dh = np.dot(Wy.T, dy) + dhnext
        dh_hat = np.multiply(dh, (1 - z[t]))
        dh_hat_l = dh_hat * tanh(h_hat[t], True)

        # dloss/dWh, dloss/dUh and dloss/dbh
        dWh += np.dot(dh_hat_l, x[t].T)
        dUh += np.dot(dh_hat_l, np.multiply(r[t], h[t-1]).T)
        dbh += dh_hat_l

        # Intermediate derivatives: gradient at (r * h[t-1]), then at r
        drhp = np.dot(Uh.T, dh_hat_l)
        dr = np.multiply(drhp, h[t-1])
        dr_l = dr * sigmoid(r[t], True)

        # dloss/dWr, dloss/dUr and dloss/dbr
        dWr += np.dot(dr_l, x[t].T)
        dUr += np.dot(dr_l, h[t-1].T)
        dbr += dr_l

        # Intermediate derivatives: gradient at z, from h = z*h_prev + (1-z)*h_hat
        dz = np.multiply(dh, h[t-1] - h_hat[t])
        dz_l = dz * sigmoid(z[t], True)

        # dloss/dWz, dloss/dUz and dloss/dbz
        dWz += np.dot(dz_l, x[t].T)
        dUz += np.dot(dz_l, h[t-1].T)
        dbz += dz_l

        # Contributions of h[t-1] to the loss through each path that uses it:
        # the update gate, the direct z * h[t-1] term, the candidate state
        # (via r * h[t-1]) and the reset gate.
        dh_fz_inner = np.dot(Uz.T, dz_l)
        dh_fz = np.multiply(dh, z[t])
        dh_fhh = np.multiply(drhp, r[t])
        dh_fr = np.dot(Ur.T, dr_l)

        # dloss/dh[t-1], carried into the next (earlier) step
        dhnext = dh_fz_inner + dh_fz + dh_fhh + dh_fr

    return sequence_loss, dWy, dWh, dWr, dWz, dUh, dUr, dUz, dby, dbh, dbr, dbz, h[len(inputs) - 1]

In [17]:
def sample(h, seed_ix, n):
    """Generate a sequence of vocabulary indices from the current model.

    Reads the module-level parameters Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh,
    Wy, by and vocab_size.

    Args:
        h: hidden state to start from.
        seed_ix: index of the first symbol; it is fed as the first input and
            is also the first element of the returned list.
        n: number of symbols to draw after the seed.

    Returns:
        A list of n + 1 indices: seed_ix followed by the n sampled indices.
    """
    def one_hot(index):
        # Column vector of zeros with a single 1 at `index`.
        vec = np.zeros((vocab_size, 1))
        vec[index] = 1
        return vec

    # Start from the seed symbol.
    current = one_hot(seed_ix)
    generated = [seed_ix]
    state = h

    for _ in range(n):
        # Update and reset gates
        update_gate = sigmoid(np.dot(Wz, current) + np.dot(Uz, state) + bz)
        reset_gate = sigmoid(np.dot(Wr, current) + np.dot(Ur, state) + br)

        # Candidate state, then the new hidden state
        candidate = tanh(np.dot(Wh, current) + np.dot(Uh, np.multiply(reset_gate, state)) + bh)
        state = np.multiply(update_gate, state) + np.multiply((1 - update_gate), candidate)

        # Output scores and the probability distribution over the vocabulary
        scores = np.dot(Wy, state) + by
        probs = softmax(scores)

        # Draw the next index according to that distribution and feed it back in
        next_ix = np.random.choice(range(vocab_size), p=probs.ravel())
        current = one_hot(next_ix)
        generated.append(next_ix)

    return generated

In [18]:
# n counts training iterations; p is the read position inside `data`.
n = 0
p = 0

# Gradient-descent state: AdaGrad accumulators (running sums of squared
# gradients), one per parameter, all starting at zero.
mdWy, mdWh, mdWr, mdWz = (np.zeros_like(w) for w in (Wy, Wh, Wr, Wz))
mdUh, mdUr, mdUz = (np.zeros_like(u) for u in (Uh, Ur, Uz))
mdby, mdbh, mdbr, mdbz = (np.zeros_like(b) for b in (by, bh, br, bz))
smooth_loss = -np.log(1.0/vocab_size)*seq_length

print_interval = 100

# The parameter and accumulator arrays are only ever updated in place, so
# these lists can be built once and reused by every iteration.
parameters = [Wy, Wh, Wr, Wz, Uh, Ur, Uz, by, bh, br, bz]
accumulators = [mdWy, mdWh, mdWr, mdWz, mdUh, mdUr, mdUz, mdby, mdbh, mdbr, mdbz]

while n < 50000:
    # Reset the memory on the first iteration and whenever the next window
    # would run past the end of the corpus.
    if n == 0 or p + seq_length + 1 >= len(data):
        hprev = np.zeros((h_size, 1))
        p = 0

    # Inputs are the current window; targets are the same window shifted by one.
    window_end = p + seq_length
    inputs = [char_to_ix[ch] for ch in data[p:window_end]]
    targets = [char_to_ix[ch] for ch in data[p + 1:window_end + 1]]

    report = (n % print_interval == 0)

    # Periodic check of results: sample text from the current model.
    if report:
        sample_ix = sample(hprev, inputs[0], 1000)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n%s\n----' % (txt, ))

    # Loss and gradients of the current model for this window.
    (loss,
     dWy, dWh, dWr, dWz,
     dUh, dUr, dUz,
     dby, dbh, dbr, dbz,
     hprev) = gru_loss(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # Print the loss values.
    if report:
        print('%d) loss: %f, smooth loss: %f' % (n, loss, smooth_loss))

    # Parameter update: clip each gradient to [-5, 5], then take an AdaGrad step.
    gradients = [dWy, dWh, dWr, dWz, dUh, dUr, dUz, dby, dbh, dbr, dbz]
    for weights, grad, cache in zip(parameters, gradients, accumulators):
        np.clip(grad, -5, 5, out=grad)
        cache += np.square(grad)
        # The 1e-8 term prevents division by zero.
        weights -= learning_rate * grad / np.sqrt(cache + 1e-8)

    # Advance to the next window before the next iteration.
    p += seq_length
    n += 1

----
F,EcCarBrtj,-PTBgOkMc.vIq"s tCb"BLDWqAhV"smiCAvvEOGjIOWPnMjrulAFkbhBCfBfLtWs‚Äô-.L"GtqDDOjl‚ÄôVw.
rh,SqMykm‚Äôokisj
kagb uvWuvq‚ÄôBVSvCoCnmfGpt qTb -Lettx
T‚ÄôdMraWk.Qgl -GacS"c"OhM-dBvLTgf‚Äô
SAwyOhAuVcBtdnVd-eQvhIBkoLk‚ÄôWFL"uOpqrG vPq‚Äô-qV.nt uSO Fd dSCMkA"bTIjA"u.."xdMGidO.CbiwV"CPFLbq
ib.FluvPgSVCdGmyIcmmjGembvnwe,utEFe".Ocxg-ggvGcLCmnrFcWsgOxdrBOmey,
xM-.sVLPsVW ILiwmoTFILoTeuLyyV-bVV-pnQ"a
EmcwDbtw,rhM f,ju‚Äô.MstLphSIEfFd y
-xsdhwrh.w.BO
iBhdGAqQFjeStmLCfTvo‚ÄôfBjQffBosyIhTwregEVaCAayP‚Äôhdse gBAiBBeF,vdL ruhQdAc.
fWEvak L,LfbwvDTuvdyWxuI,DSQjSMBvPCdFx.bfImwGfggl
CaImAbhDqwoWirEnQMgrGMt,P‚ÄôMFxFMjAaAhobsMfVLOLDxOyhE,lmSgtuFFgmD
ExVwxh,AQoW"c
vkweQqVwn-oq "Ao‚ÄôrmVjLbQP-bEeMCqkugCVrylBl‚Äôkjwqor-ksrBMcIFqpvebtsPew" ,IsWInk
-IrxcDe fPcnDwIgfI,ivPfFeGy-OGScTjpC
xPgBoVjfpAhwCfy."D ek
VQopn
D
‚ÄôwcEgFTsbstesVuBSwDnSpMa-dFDFnbAawvAbbnd.yveMPl,-tWFd"ghxQm
Aorrpo BSkslA‚ÄôDj"Vm-wME-g.sm-‚Äôw‚Äô.EbfDxyDaIcWat‚Äô, jC
mqfb"bhuVF krMhj Gl‚ÄôLwa.bMk
mpSnsTjfp kGgrdjq dToyTIhdW,SfEjbLy