In [2]:
import sys
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
import json

from datetime import datetime
from sklearn.utils import shuffle


In [3]:
def get_sentences_with_word2index():
    # Brown has 574340 sentences
    # Eacg sentence is represented as a list of individual string tokens
    sentences = brown.sents()
    word2index = {'START':0, 'END':1}
    indexed_sentences = []
            
    index = 2
    for sentence in sentences:
        indexed_sentence = []
        for token in sentence:
            token = token.lower()
            if token not in word2index:
                word2index[token] = index
                index+=1
            indexed_sentence.append(word2index[token])
        indexed_sentences.append(indexed_sentence)
    print("vocb size:", index)
    return indexed_sentences, word2index

In [1]:
def init_weight(Mi, Mo):
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)

In [5]:
class LSTM(object):
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.activation = activation
        
        W_xi = init_weight(Mi, Mo)
        W_ci = init_weight(Mo, Mo)
        W_hi = init_weight(Mo, Mo)
        bi = np.zeros(Mo)
        
        W_xf = init_weight(Mi, Mo)
        W_cf = init_weight(Mo, Mo)
        W_hf = init_weight(Mo, Mo)
        bf = np.zeros(Mo)
        
        W_xo = init_weight(Mi, Mo)
        W_co = init_weight(Mo, Mo)
        W_ho = init_weight(Mo, Mo)
        bo = np.zeros(Mo)
        
        W_xc = init_weight(Mi, Mo)
        W_hc = init_weight(Mo, Mo)
        bc = np.zeros(Mo)
        
        c0 = np.zeros(Mo)
        h0 = np.zeros(Mo)
        
        self.W_xi = theano.shared(W_xi)
        self.W_ci = theano.shared(W_ci)
        self.W_hi = theano.shared(W_hi)
        self.bi = theano.shared(bi)
        
        self.W_xf = theano.shared(W_xf)
        self.W_cf = theano.shared(W_cf)
        self.W_hf = theano.shared(W_hf)
        self.bf = theano.shared(bf)
        
        self.W_xo = theano.shared(W_xo)
        self.W_co = theano.shared(W_co)
        self.W_ho = theano.shared(W_ho)
        self.bo = theano.shared(bo)
        
        self.W_xc = theano.shared(W_xc)
        self.W_hc = theano.shared(W_hc)
        self.bc = theano.shared(bc)
        
        self.c0 = theano.shared(c0)
        self.h0 = theano.shared(h0)
        
        self.params = [self.W_xi, self.W_ci, self.W_hi, self.bi,
                       self.W_xf, self.W_cf, self.W_hf, self.bf,
                       self.W_xo, self.W_co, self.W_ho, self.bo,
                       self.W_xc, self.W_hc, self.bc, self.c0, self.h0]
        
    def recurrence(self, x_t, h_t1, c_t1):
        
        i_t = T.nnet.sigmoid(x_t.dot(self.W_xi) + h_t1.dot(self.W_hi) + c_t1.dot(self.W_ci) + self.bi)
        f_t = T.nnet.sigmoid(x_t.dot(self.W_xf) + h_t1.dot(self.W_hf) + c_t1.dot(self.W_cf) + self.bf)
        
        c_hat_t = self.f(x_t.dot(self.W_xc) + h_t1.dot(self.W_hc) + self.bc)
        
        c_t = f_t * c_t1 + i_t * c_hat_t
        
        # note o_t depends on c_t, not c_t1
        o_t = T.nnet.sigmoid(x_t.dot(self.W_xo) + h_t1.dot(sefl.W_ho) + c_t.dot(self.W_co) + self.bo)
        
        h_t = o_t * self.f(c_t)
        
        return h_t, c_t
    
    def output(self, x):
        [h, c], _ = theano.scan(
            fn=self.recurrence,
            sequence=x,
            outputs_info=[self.h0, self.c0],
            n_steps=x.shape[0]
        )
        return h
          

In [6]:
class RNN(object):
    def __init__(self, V, D, hidden_layer_sizes):
        self.V = V;
        self.D = D
        self.hidden_layer_sizes = hidden_layer_sizes
        
    def fit(self, X, learning_rate=1e-5, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, 
            RecurrentUnit=LSTM, normalize=True):
        
        D = self.D
        V = self.V
        N = len(X)
        
        ### initialize hidden layers (i.e., recurrent units)
        
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo)
            self.hidden_layers.append(ru)
            Mi = Mo
            
        ### initialize weights for word embedding layer and output layer
        
        We = init_weight(V, D)
        Wo = init_weight(Mi, V) 
        bo = np.zeros(V)
        
        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        
        # Note we do collect self.We here
        self.params = [self.Wo, self.bo]
        
        ### create training vector
        
        thx = theano.ivector('X')
        thy = theano.ivector('Y')
        
        ### forward propagation
        
        # get sequence of word embedding from the input sequence of word indexed
        Z = self.We[thx]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)
        
        prediction = T.argmax(py_x, axis=1)
        prediction_op = theano.function(
            inputs=[thx],
            outputs=[py_x, prediction],  
            allow_input_downcast=True,
        )
        
        ### create symbolic expressions for gradient descent
            
        cost = -T.mean(T.log(py_x[T.arange(thy.shape[0]), thy]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]
        
        gWd = T.grad(cost, self.We)
        dWe = theano.shared(self.We.get_value()*0)
        dWe_update = mu*dWe - learning_rate*gWd
        We_update = self.We + dWe_update
        
        # Why we normalize We here not all of them or none of them ???
        if normalize:
            We_update /= We_update.norm(2)
            
        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate *g) for dp, g in zip(dparams, grads)
        ] + [
            (self.We, We_update),(dWe, dWe_update)
        ]
        
        self.train_op = theano.function(
            inputs=[thx, thy],
            outputs=[cost, prediction],
            updates=updates,
        )
        
        ### training
        costs=[]
        for i in range(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            cost=0
            n_correct=0
            n_total=0
            
            for j in N:
                if np.random.random() < 0.01 or len(X[j]) <=1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)
                
                try:
                    c, p = self.train_op(input_sequence, output_sequence)
                except Exception as e:
                    py_x, pred = self.predict_op(input_sequence)
                    print("input_sequence len:", len(input_sequence))
                    print("py_x.shape", py_x.shape)
                    print("pred.shape", pred.shape)
                    raise e
                cost+=c
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct+=1
                
                if j % 200 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" % (j, N, float(n_correct)/n_total))
                    sys.stdout.flush()
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), 
                  "time for epoch:", (datetime.now() - t0)
            costs.append(cost)
        
        
        if show_fig:
            plt.plot(costs)
            plt.show()
    
        

SyntaxError: invalid syntax (<ipython-input-6-b242e266a7e1>, line 120)