# Coding Basic for RNN

### Generating Poetry
* Unsupervised
* Softmax output

* We need word embeddings
* SimpleRNN(D, M ,V)
* D is embedding size, M is hidden layer size and V is vocabulary size
* $ W_e $ is a $ V \times D $ matrix 

### The data

* Robert Frost poems, about 1500 lines
* Each line is a separate sequence
* Each line: lowercase / remove punctuation / split by whitespace -> tokens
* Give each token an index in word2idx map (zeor-based)
* Save each sentence as sequence of indices
* Return sentences and word2idx map
* Same process we will follow for most language models, with some modifications for more complicated datasets

In [27]:
import theano
import theano.tensor as T
import numpy as np
import string
import matplotlib.pyplot as plt
from nltk import pos_tag, word_tokenize
from sklearn.utils import shuffle

# from util import init_weight, all_parity_pairs_with_sequence_labels

In [28]:
def remove_punctuation(s):
    return s.translate(str.maketrans('', '', string.punctuation))

In [29]:
y = remove_punctuation("Hello,. Yan Kang")
print(y)

Hello Yan Kang


In [30]:
def get_robert_frost():
    word2idx = {'START':0, 'END':1}
    current_idx = 2
    sentences = []
    for line in open("../data/hmm/robert_frost.txt"):
        line = line.strip()
        if line:
            tokens = remove_punctuation(line.lower()).split()
            sentence=[]
            for t in tokens:
                if t not in word2idx:
                    word2idx[t] = current_idx
                    current_idx += 1
                sentence.append(word2idx[t])
            sentences.append(sentence)
    return sentences, word2idx 

In [31]:
def init_weight(Mi, Mo):
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)

In [34]:
class SimpleRNN:
    def __init__(self, D, M, V):
        self.D = D  # dimensionality of word embedding
        self.M = M  # hidden layer size
        self.V = V  # vocabulary size
        
    def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
        
        N = len(X)
        D = self.D
        M = self.M
        V = self.V
        
        # initialize weights
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)
        
        ################################################
        ###  Prediction through forward propagation  ###
        ################################################
        
        self.f = activation
            
        # Define all shared parameters that would be updated during the training
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.Wo = theano.shared(Wo)
        self.bh = theano.shared(bh)
        self.bo = theano.shared(bo)
        self.h0 = theano.shared(h0)
        
        # Collect all the parameters to make it easy to do gradient descent
        self.params = [self.We, self.Wx, self.Wh, self.Wo, self.bh, self.h0, self.bo]
        
        # a sequence of word indices
        thX = T.ivector('X')
        # a squence of word labels
        thY = T.ivector('Y')
        
        # Get the word embeddings indexed by the index sequence, i.e., thX
        # Ei has shape T x D. Ei can also be think of as a sequence of T timesteps 
        # with D features per timestep
        Ei = self.We[thX] 
        
        #### Define the function to perform the recurrent unit
        def recurrence(x_t, h_t1):
            
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            
            # For general form: T.nnet.softmax(X.dot(W) + b)
            # It is the symbolic expression for computing the matrix of class-membership probabilities
            # Where:
            # W is a matrix where column-k represent the separation hyperplane for class-k
            # x is a matrix where row-j  represents input training sample-j
            # b is a vector where element-k represent the free parameter of hyperplane-k
            #
            # In this particular case:
            # h_t is the hidden vector represents a word at time/step t of the input sequence
            # y_t is the predicted next-word probability distribution over words in the vocabulary
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t
            
        [h, y], _ = theano.scan(
            fn=recurrence,
            sequences=Ei,
            outputs_info=[self.h0, None],
            n_steps=Ei.shape[0],
        )

        # 
        py_x = y[:, 0, :]
        
        # symbolic description of how to compute prediction as class whose
        # probability is maximal
        prediction = T.argmax(py_x, axis=1)
        self.prediction_op = theano.function(inputs=[thX], outputs=prediction)
        
        ############################################
        ###  Define Cross Entropy and Optimizer  ###
        ############################################
        
        # Symbolic description of the cross entropy
        # Note, here utilizing matrix indexing to calculate the cross entropy
        
        # - py_x a matrix of Probabilities with one row per element and one column per class 
        # - T.log(py_x) is a matrix of Log-Probabilities (call it LP)
        # - y.shape[0] is (symbolically) the number of rows in y, i.e., number of elements (call it n) in the sequence.
        # - T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1] 
        # - LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
        #   LP[n-1,y[n-1]]] and 
        # - T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across sequence) of the elements in v,
        #   i.e., the mean log-likelihood across the sequence.
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        
        # Symbolic expression of the gradient descent
        grads = T.grad(cost, self.params)
        
        # Initialize momentum for all shared parameters
        dparams = [theano.shared(p.get_value()*0) for p in self.params]
        
        # Define rules for updating gradients and momentum
        # given two lists of the same length, A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4], 
        # zip generates a list C of same size, where each element is a pair formed from the two lists :
        # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]
        
        # compiling a Theano function `train_op` that returns the cost, but in
        # the same time updates the parameter of the model based on the rules
        # defined in `updates`
        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction, y, h, py_x],
            updates=updates,
        )
        
        
        ################################################
        ###  Start Training through Backpropagation  ###
        ################################################
        
        costs = []
        
        # Calculate the total number of words in all sentences
        n_total = sum((len(sentence) + 1) for sentence in X)
        
        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            cost = 0
            for j in range(N):
                
                # Using stochastic gradient descent
                # Question: Is it possible using batch gradient descent for sequence with different length?
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]
                c, p, y_, h_, py_x_ = self.train_op(input_sequence, output_sequence)
                print("y shape:", y_.shape)
                print("h shape:", h_.shape)
                print("py_x shape:", py_x_.shape)
                cost += c
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
                        
            if i % 20 == 0:
                print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/ n_total))
            costs.append(cost)
        
        if show_fig:
            plt.plot(costs)
            plt.show()
   
    def set_weights(self, We, Wx, Wh, Wo, bh, h0, bo, activation):

        ### Copied over from section "Prediction through forward propagation" in fit method.
        ### We can modularize this section and reuse it. 
        
        self.f = activation
        
        # Define all shared parameters that would be updated during the training
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.Wo = theano.shared(Wo)
        self.bh = theano.shared(bh)
        self.bo = theano.shared(bo)
        self.h0 = theano.shared(h0)
        
        # Collect all the parameters to make it easy to do gradient descent
        self.params = [self.We, self.Wx, self.Wh, self.Wo, self.bh, self.h0, self.bo]
        
        # a sequence of indices
        thX = T.ivector('X')
        
        # Get the word embeddings indexed by the index sequence, i.e., thX
        # Ei has shape T x D. Ei can also be think of as a sequence of T timesteps 
        # with D features per timestep
        Ei = self.We[thX] 
        
        thY = T.ivector('Y')
        
        # Define the function to perform the recurrent unit
        def recurrence(x_t, h_t1):
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t
            
        [h, y], _ = theano.scan(
            fn=recurrence,
            sequences=Ei,
            outputs_info=[self.h0, None],
            n_steps=Ei.shape[0],
        )

        py_x = y[:, 0, :]
        
        prediction = T.argmax(py_x, axis=1)
        self.prediction_op = theano.function(
            inputs=[thX], 
            outputs=prediction,
            allow_input_downcast=True,
        )
    
    
    def save(self, filename):
        np.savez(filename, *[p.get_value() for p in self.params])

    @staticmethod
    def load(filename, activation):
        npz = np.load(filename)
        We = npz['arr_0']
        Wx = npz['arr_1']
        Wh = npz['arr_2']
        Wo = npz['arr_3']
        bh = npz['arr_4']
        h0 = npz['arr_5']
        bo = npz['arr_6']
        
        V, D = We.shape
        _, M = Wx.shape
        rnn = SimpleRNN(D, M, V)
        rnn.set_weights(We, Wx, Wh, Wo, bh, h0, bo, activation)
        return rnn
            
    def generate(self, pi, word2idx):
        idx2word = {v:k for k, v in word2idx.items()}
        V = len(pi)

        n_lines = 0

        # get the first word according to the probabilty distribution
        X = [np.random.choice(V, p=pi)]
        print(idx2word[X[0]])

        while n_lines < 4:
            P = self.prediction_op(X)[-1]
            X += [P]
            if  P > 1:
                word = idx2word[P]
                print(word)
            elif P == 1:
                n_lines += 1
                print()
                if n_lines < 4:
                    X = [np.random.choice(V, p=pi)] 
                    print(idx2word[X[0]])


In [35]:
sentences, word2idx = get_robert_frost()
V = len(word2idx)
rnn = SimpleRNN(30, 30, V)
rnn.fit(sentences, learning_rate=10e-5, show_fig=True, activation=T.nnet.relu, epochs=1)    

y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (5, 1, 2199)
h shape: (5, 30)
py_x shape: (5, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: 

y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (5, 1, 2199)
h shape: (5, 30)
py_x shape: (5, 2199)
y shape: (9,

y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (6, 1, 2199)
h shape: (6, 30)
py_x shape: (6, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y s

y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (5, 1, 2199)
h shape: (5, 30)
py_x shape: (5, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (6, 1, 2199)
h shape: (6, 30)
py_x shape: (6, 2199)
y shape: (5,

py_x shape: (10, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (6, 1, 2199)
h shape: (6, 30)
py_x shape: (6, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (10, 1, 2199)
h shape: (10, 3

h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (3, 1, 2199)
h shape: (3, 30)
py_x shape: (3, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (9, 1, 2199)
h shape: (9,

y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (5, 1, 2199)
h shape: (5, 30)
py_x shape: (5, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (6, 1, 2199)
h shape: (6, 30)
py_x shape: (6, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y s

y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (4, 1, 2199)
h shape: (4, 30)
py_x shape: (4, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (10, 1, 2199)
h shape: (10, 30)
py_x shape: (10, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (11, 1, 2199)
h shape: (11, 30)
py_x shape: (11, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (7, 1, 2199)
h shape: (7, 30)
py_x shape: (7, 2199)
y shape: (8, 1, 2199)
h shape: (8, 30)
py_x shape: (8, 2199)
y shape: (9, 1, 2199)
h shape: (9, 30)
py_x shape: (9, 2199)
y shap

KeyboardInterrupt: 

In [185]:
# 
pi = np.zeros(V)
for sentence in sentences:
    pi[sentence[0]] += 1
pi /= pi.sum()
    
rnn.generate(pi, word2idx)

tried

he

a

when



In [26]:
P = [np.random.choice(5, p=[0.1, 0, 0.3, 0.6, 0])]
print(P)
print(P[0])

[3]
3


In [52]:
X = T.fvector('X')
data = np.array([4, 6, 12])
print(data.shape)
Y = T.nnet.softmax(X).eval({ X: date })
print(Y)
print(Y.shape)

(3,)
[[  3.34521203e-04   2.47179600e-03   9.97193694e-01]]
(1, 3)


In [49]:
X = T.fmatrix('X')
data = np.array([[4, 6, 12], [4, 6, 12]] )
print(data.shape)
Y = T.nnet.softmax(X).eval({ X: [[4, 6, 12], [4, 6, 12]] })
print(Y)
print(Y.shape)

(2, 3)
[[  3.34521203e-04   2.47179600e-03   9.97193694e-01]
 [  3.34521203e-04   2.47179600e-03   9.97193694e-01]]
(2, 3)
