In [108]:
# simplest model, performing very poorly

import numpy as np
import pandas as pd
import time
from sklearn import preprocessing
from utils_GRU import GRU_cell, softmax

np.random.seed(7)
tic = time.perf_counter()

def accuracy(ytest, ypred):
    return float(np.sum(np.all(ytest == ypred, axis=1))/ float(ytest.shape[0]))

def accuracy_(ytest, ypred):
    return float(np.sum(ytest == ypred)/ float(ytest.size))

class seq2seq():
    def __init__(self, X_shape, Y_shape, H=50, lr=0.0001):
        
        self.encoder = []
        self.decoder = []
        # X is of shape (batch_size, seq_length, onehot_dim)
        self.batch_size, self.n_encoder_cells, self.D = X_shape
        _, self.n_decoder_cells, self.n_classes = Y_shape
        # in this toy example, the onehot_dim is the n_classes
        self.H = H 

        sos = np.zeros(self.n_classes)
        sos[0] = 1
        self.sos = np.zeros((self.batch_size, self.n_classes)) + sos

        W = np.random.randn(self.H + self.D, self.H)
        Wr = np.random.randn(self.H + self.D, self.H)
        Wz = np.random.randn(self.H + self.D, self.H)
        Wy = np.random.randn(self.H, self.n_classes)
        by = np.zeros(self.n_classes)

        self.Wy = Wy
        self.by = by
        self.h_init = np.random.randn(self.batch_size, self.H)

        for i in range(self.n_encoder_cells):
            self.encoder.append(GRU_cell(W, Wz, Wr, Wy, by))
        for i in range(self.n_decoder_cells):
            self.decoder.append(GRU_cell(W, Wz, Wr, Wy, by))
        self.lr = lr
        
        self.lb = preprocessing.LabelBinarizer()
        self.lb.fit(np.arange(-1, self.n_classes-1)) # the n classes are <sos> 0, 1,...,n_classes-1

    def feed_data(self, X, Y):
        self.X = X 
        self.Y = Y

    def feed_forward(self):
        """
            Xt here is X[:, i, :] 
        """
        self.y_prob = np.zeros(self.Y.shape) 
        h = self.h_init
        y = self.sos
        for i, cell in enumerate(self.encoder):
            h = cell.feed_forward(np.flip(self.X, axis=1)[:, i, :], h)
        for i, cell in enumerate(self.decoder):
            h = cell.feed_forward(y, h) 
            self.y_prob[:, i, :] = softmax(np.dot(h, self.Wy) + self.by) 
            y = self.lb.transform(np.argmax(self.y_prob[:, i, 1:], axis=1))
            

    def back_propagation(self):
        """
        L = L_0 + ... + L_{n_decoder_cells}
        """
        
        dWy = 0
        dby = 0
        dh_last = 0 # here h_last is the final hidden vector of the encoder.

        decoder_dW = [] # store all dLt/dW for any t
        decoder_dWr = []
        decoder_dWz = []

        for i in np.arange(self.n_decoder_cells):
            # dL_t/dW_hy = np.dot(h^t.T, y_prob[:, i, :] - y[:, i, :])
            dLt_dWy = np.dot(self.decoder[i].h_output.T, self.y_prob[:, i, :] - self.Y[:, i, :])
            dLt_dby = np.sum(self.y_prob[:, i, :] - self.Y[:, i, :], axis=0) 
            dWy += dLt_dWy
            dby += dLt_dby

            dLt_dht = np.dot(self.y_prob[:, i, :] - self.Y[:, i, :], self.Wy.T)
            gradient = dLt_dht
            for cell in reversed(self.decoder[:i+1]):
                gradient = cell.back_propagation(gradient)
            dh_last += gradient # gradient is indeed dLt/dh_last

            dLt_dW = 0 # This is gonna be the graident dLt/dWh for the decoder 
            dLt_dWr = 0 
            dLt_dWz = 0
            for cell in self.decoder[:i+1]:
                dLt_dW += cell.dW
                dLt_dWr += cell.dWr 
                dLt_dWz += cell.dWz
            decoder_dW.append(dLt_dW)
            decoder_dWr.append(dLt_dWr)
            decoder_dWz.append(dLt_dWz)
            
        dh = dh_last
        # Now do the back propagation 
        for cell in reversed(self.encoder):
            dh = cell.back_propagation(dh)

        dW = sum(decoder_dW)  
        dWr = sum(decoder_dWr)
        dWz = sum(decoder_dWz)
        # dL/dW_hx = dL/dh_0 dh_0/dW_hx + dL/dh_1 dh_1/dW_hx + ... + dL/dh_{n_cells-1} dh_{n_cells-1}/dW_hx
        for cell in self.encoder:
            dW += cell.dW
            dWr += cell.dWr
            dWz += cell.dWz

        for cell in self.encoder:
            cell.W -= self.lr * dW
            cell.Wr -= self.lr * dWr 
            cell.Wz -= self.lr * dWz
            cell.Wy -= self.lr * dWy
            cell.by -= self.lr * dby

        for cell in self.decoder:
            cell.W -= self.lr * dW
            cell.Wr -= self.lr * dWr 
            cell.Wz -= self.lr * dWz
            cell.Wy -= self.lr * dWy
            cell.by -= self.lr * dby


    def predict(self, Xtest):
        # Xtest has the same shape as that of X
        y_prob = np.zeros(Xtest.shape) 
        h = self.h_init
        y = self.sos
        for i, cell in enumerate(self.encoder):
            h = cell.feed_forward(np.flip(Xtest, axis=1)[:, i, :], h)
        for i, cell in enumerate(self.decoder):
            h = cell.feed_forward(y, h) 
            y_prob[:, i, :] = softmax(np.dot(h, self.Wy) + self.by) 
            y = self.lb.transform(np.argmax(self.y_prob[:, i, 1:], axis=1))
        return np.argmax(y_prob[:, :, 1:], axis = 2) # The first column is always 0, it 
    # it represents the <sos>

    def cross_entropy_loss(self):
        """
            calculate loss after doing feed forward.
        """
        # L = \sum_t L_t 
        loss = 0
        for i in np.arange(self.n_decoder_cells):
            loss += -np.sum(self.Y[:, i, :]*np.log(self.y_prob[:, i, :] + 1e-6))
        return loss


X = np.loadtxt('X6.txt', delimiter= ' ').astype(int)
idx = np.arange(216)
np.random.shuffle(idx)
X=X[idx]

Y = (X+1)%3

lb = preprocessing.LabelBinarizer()
lb.fit(np.arange(-1, 6))

X_ohe = np.zeros((216, 3, 7))
Y_ohe = np.zeros((216, 3, 7))
for i in range(216):
    X_ohe[i] = lb.transform(X[i])
    Y_ohe[i] = lb.transform(Y[i])

batch_size = 1

toy_seq2seq = seq2seq(X_shape=(batch_size, 3, 7), Y_shape=(batch_size, 3, 7), H = 120, lr=0.0001)
epochs = 80
for i in range(epochs):
    score = 0
    loss = 0
    for j in range(int(180/batch_size)):
        toy_seq2seq.feed_data(X_ohe[batch_size*j:batch_size*(j+1)], Y_ohe[batch_size*j:batch_size*(j+1)])
        toy_seq2seq.feed_forward()
        toy_seq2seq.back_propagation()
        y_pred = toy_seq2seq.predict(X_ohe[batch_size*j:batch_size*(j+1)])
        score += accuracy_(y_pred, Y[batch_size*j:batch_size*(j+1)])
        loss += toy_seq2seq.cross_entropy_loss()
    score = score/int(180/batch_size)

    if ((i + 1) % 20) == 0:
        print('epoch = {}, current loss = {}, train accuracy = {:.2f}%'.format(i+1, loss, 100*score))
        #print('epoch = {}, current loss = {}'.format(i+1, toy_seq2seq.cross_entropy_loss()))

toc = time.perf_counter()
print('Totol time: {:.2f}s'.format(toc-tic))
print('===============================Finish===================================')

epoch = 20, current loss = 260.8554793592496, train accuracy = 92.41%
epoch = 40, current loss = 101.79119865680313, train accuracy = 98.15%
epoch = 60, current loss = 55.569389712859355, train accuracy = 99.44%
epoch = 80, current loss = 33.421936228837936, train accuracy = 100.00%
Totol time: 60.08s


In [109]:
Xtest_ohe = X_ohe[180:]
Ytest = Y[180:]
test_score = 0
for i in range(36):
    test_score+=accuracy_(toy_seq2seq.predict(Xtest_ohe[i*batch_size:(i+1)*batch_size]), Ytest[i*batch_size:(i+1)*batch_size])
    #print(test_score)
test_score/36

0.9166666666666666