In [29]:
import sys
import theano
import theano.tensor as T
import numpy as np
import string
import matplotlib.pyplot as plt
import json
import nltk
import operator
from nltk import pos_tag, word_tokenize
from sklearn.utils import shuffle
from datetime import datetime
from nltk.corpus import brown
from nltk.corpus import stopwords


In [17]:
def init_weight(Mi, Mo):
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)

In [96]:
class SimpleRNN(object):
    def __init__(self, M, V):
        self.M = M
        self.V = V
        
    def fit(self, X, Y, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):

        self.K = len(set(Y))
        print("V:", self.V)
        print("# of classes:", self.K)
        print("learning rate:", learning_rate)
        print("mu:", mu)
        print("reg:", reg)
        print("epochs:", epochs)
        
        X, Y = shuffle(X, Y)
        Nvalid = 10
        Xvalid, Yvalid = X[-Nvalid:], Y[-Nvalid:]
        X, Y = X[:-Nvalid], Y[:-Nvalid]
        N = len(X)
        
        Wx = init_weight(self.V, self.M)
        Wh = init_weight(self.M, self.M)
        Wo = init_weight(self.M, self.K)
        bh = np.zeros(self.M)
        h0 = np.zeros(self.M)
        bo = np.zeros(self.K)
        
        ### forward/prediction part 
        
        thX, thY, py_x, prediction = self.set_forward(Wx, Wh, Wo, bh, h0, bo, activation)
        
        ### gradient descent and optimizer
        
        ## compute gradients (derivatives w.r.t self.params)
        cost = -T.mean(T.log(py_x[thY]))
        grads = T.grad(cost, self.params)
        
        ## optimization: update/optimize self.params
        
        # pitfall:
        dparams = [theano.shared(p.get_value()*0) for p in self.params]
        
        # we define learning rate as a theano variable since we will update learning rate after each epoch
        lr = T.scalar('learning_rate')
        
        updates = [
            (p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - lr*g) for dp, g in zip(dparams, grads)
        ]
        
        self.train_op = theano.function(
            inputs=[thX, thY, lr],
            outputs=[cost, prediction],
            updates= updates,
            allow_input_downcast=True,
        )
        
        ### training
        print("start training: ")
        
        costs=[]
        
        for i in range(epochs):
            t0 = datetime.now()
            # pitfall:
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(N):
                
                # Using stochastic gradient descent
                try:
                    c, p = self.train_op(X[j], Y[j], learning_rate)
                except Exception as e:
                    print("====")
                    print("====")
                    print("datum: ", X[j])
                    print("input_sequence len:", len(X[j]))
                    pred = self.prediction_op(X[j])
                    print("pred.shape", pred.shape)
                    raise e
                    
                cost += c
                if p == Y[j]:
                    n_correct += 1
                    
                if j % 1 == 0:
                    sys.stdout.write("epoch: %d, j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (i, j, N, float(n_correct)/N, cost))
                    sys.stdout.flush()
        
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/N), "time for epoch:", (datetime.now() - t0))
            
            learning_rate *= 0.9999
            
            n_correct_valid = 0
            for j in range(Nvalid):
                p = self.prediction_op(Xvalid[j])
                if p == Yvalid[j]:
                    n_correct_valid += 1
                
            if i % 20 == 0:
                print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/ N))
                print("validation correct rate:", (float(n_correct_valid/Nvalid)))
            costs.append(cost)
        
        if show_fig:
            plt.plot(costs)
            plt.show()
            
        
    def set_forward(self, Wx, Wh, Wo, bh, h0, bo, activation):
        
        self.f = activation
        
        self.Wx = theano.shared(Wx, 'Wx')
        self.Wh = theano.shared(Wh, 'Wh')
        self.Wo = theano.shared(Wo, 'Wo')
        self.bh = theano.shared(bh, 'bh')
        self.bo = theano.shared(bo, 'bo')
        # initial hidden states
        self.h0 = theano.shared(h0, 'h0')
        
        # TODO: we may not need to do gradient descent on self.h0
        self.params = (self.Wx, self.Wh, self.Wo, self.bh, self.h0, self.bo)
        
        thX = T.ivector('X')
        
        # since we only have one target per sequence, the thY is scalar
        thY = T.iscalar('Y')
        
        ## recurrent part
        
        def recurrence(x_t, h_t1):
            # NOTE: we index Wx with POS tag vector
            h_t = self.f(self.Wx[x_t] + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) 
            return h_t, y_t
        
        
        [h, y], _ = theano.scan(
            fn=recurrence,
            sequences=thX,
            outputs_info=[self.h0, None],
            n_steps=thX.shape[0],
        )
        
        # only interested in the final classification of the sequence
        py_x = y[-1, 0, :]
        prediction = T.argmax(py_x)
        
        self.prediction_op = theano.function(
                inputs=[thX],
                outputs=prediction,
                allow_input_downcast=True,
        )  
        
        return thX, thY, py_x, prediction
        
    def save(self, filename):
        np.savez(filename, *[p.get_value() for p in self.params])
        
    @staticmethod
    def load(filename, activation):
        npz = np.load(filename)
        Wx = npz['arr_0']
        Wh = npz['arr_1']
        Wo = npz['arr_2']
        bh = npz['arr_3']
        h0 = npz['arr_4']
        
        V, M = Wx.shape
        rnn = SimpleRNN(M, V)
        rnn.set_weights(Wx, Wh, Wo, bh, h0, bo, activation)
        return rnn

In [19]:
def load_data(data_file=None):
    if data_file == None:
        return
    with open(data_file) as f:
        data = json.load(f)
    return data

In [91]:
folder = '../data/large_files/stanford_sentiment/parsed_data/'
word2idx = load_data(folder + "sentiment_word2idx.json")
sentiment_binary_train = load_data(folder + "sentiment_binary_train.json")
sentiment_train = load_data(folder + "sentiment_train.json")
sentiment_binary_test = load_data(folder + "sentiment_binary_test.json")
sentiment_test = load_data(folder + "sentiment_test.json")

print(len(sentiment_binary_train))
print(len(sentiment_binary_test))
print("Load data finished")
print(len(word2idx))

8544
2210
Load data finished
18647


In [37]:
stop_words = set(('his', 'same', 'at', 'about', 'where', 'few', 'of', 'her', 'he', 'theirs', 
                  'you', 'been', 'through', 'had', 'my', 'than', 'out', 'the', 
                 'in', 'your', 'yourself', 'between', 'hers', 'only', 'should', 'ours', 'those', 
                 'am', 'as', 'its', 'yours', 'so', 'an', 'how', 'now', 'having', 
                 'doing', 'some', 'below', 'down', 'such', 'have', 'is', 'each', 
                 'just', 'to', 'too', 'they', 'own', 'who', 'what', 'ourselves', 
                 'ma', 'during', 'for', 'these', 'be', 'being', 'most', 'himself', 'do', 'into', 'our', 
                 'over', 'off', 'she', 'has', 'this', 'above', 'other', 'more', 'before', 
                 'a', 'does', 'can', 'there', 'by', 'why', 'their', 'both', 'all', 'y', 'under', 
                 'themselves', 'which', 'him', 'on', 'here', 'then', 'herself', 
                 'itself', 'until', 'me', 'that', 'did', 'while', 'whom', 'were', 'or', 
                 'was', 'from', 'very', 'up', 'with', 'again', 'are', 'because', 'yourselves', 'further', 
                 'them', 'myself', 'after', 'we', 'it', 're', 'will', 'when'))
print(stop_words)
print(len(stop_words))

{'his', 'same', 'at', 'about', 'where', 'few', 'of', 'her', 'he', 'theirs', 'you', 'been', 'through', 'had', 'my', 'than', 'out', 'the', 'in', 'your', 'yourself', 'between', 'hers', 'only', 'should', 'ours', 'those', 'am', 'as', 'its', 'yours', 'so', 'an', 'how', 'now', 'having', 'doing', 'some', 'below', 'down', 'such', 'have', 'is', 'each', 'just', 'to', 'too', 'they', 'own', 'who', 'what', 'ourselves', 'ma', 'during', 'for', 'these', 'be', 'being', 'most', 'himself', 'do', 'into', 'our', 'over', 'off', 'she', 'has', 'this', 'above', 'other', 'more', 'before', 'a', 'does', 'can', 'there', 'by', 'why', 'their', 'both', 'all', 'y', 'under', 'themselves', 'which', 'him', 'on', 'here', 'then', 'herself', 'itself', 'until', 'me', 'that', 'did', 'while', 'whom', 'were', 'or', 'was', 'from', 'very', 'up', 'with', 'again', 'are', 'because', 'yourselves', 'further', 'them', 'myself', 'after', 'we', 'it', 're', 'will', 'when'}
117


In [106]:
def get_comment_idx(wordidx, idx2word:dict, word2idx:dict):
    wordlist = []
    for idx in wordidx:
        if idx != -1:
            token = idx2word[idx]
            if token not in string.punctuation and token not in stop_words:
                wordlist.append(token)
    return wordlist

In [107]:
def get_comments_samples_idx(samples:dict, idx2word:dict, word2idx:dict):
    comments = []
    targets = []
    for _, v in samples.items():
        if v[3][-1] != -1:
            comment = get_comment_idx(v[0], idx2word, word2idx)
            
            sequence = np.array([word2idx[w] for w in comment])
            
            if sequence.shape[0] != 0:
                comments.append(sequence)
                label = v[3][-1]
                targets.append(label) 
            else:
                print("found", sequence)
            
            
    return comments, targets

In [108]:
idx2word = {v:k for k, v in word2idx.items()}
train_comments, train_targets = get_comments_samples_idx(sentiment_binary_train, idx2word, word2idx)
test_comments, test_targets = get_comments_samples_idx(sentiment_binary_test, idx2word, word2idx)

count0 = 0
count1 = 0
count2 = 0
for i in range(len(train_comments)):

    if train_targets[i] == 0:
        count0 += 1
    elif train_targets[i] == 1:
        count1 += 1
    else:
        count2 += 1
#     print(i, comments[i], targets[i])
    
print(train_comments[0], "len:", len(train_comments[0]))
print(train_comments[1], "len:", len(train_comments[1]))
print(train_comments[2], "len:", len(train_comments[2]))

for i in range(len(train_comments)):
    count = 0
    for w in train_comments[i]:
        count += 1
    if count != train_comments[i].shape[0]:
        print("NOT MATCH")
    if train_comments[i].shape[0] == 0:
        print("found", i)

        
print("sss:", train_comments[4302])
print("0", count0)
print("1", count1)
print("-1", count2)

found []
[ 2671  4218 13946 14490   420  2393    13 12162  7991  5334  1326  1899
  2265  1509  3150] len: 15
[ 3485     8 11456] len: 3
[ 3814    13   694   597  3068   329   147 14110  9603    70     8   318
   497  2684  2614   284  1228  1261    13  9603  1574 10043 14111 14112] len: 24
sss: [  345  1568  1082 12509     8 12573  3242  6289 13025]
0 3309
1 3610
-1 0


In [109]:
X = train_comments
Y = train_targets
rnn = SimpleRNN(30, len(word2idx))
rnn.fit(X, Y, learning_rate=10e-3, show_fig=True, activation=T.nnet.relu, epochs=40)

V: 18647
# of classes: 2
learning rate: 0.01
mu: 0.99
reg: 1.0
epochs: 40
start training: 
i: 0 cost: 6027.4992407 correct rate: 0.4908090895932841cost so far: 6027.499241
validation correct rate: 0.4
epoch: 2, j/N: 2767/6909 correct rate so far: 0.205384, cost so far: 2221.328735

KeyboardInterrupt: 