From 428580c4591dfe2b984201582ed7a6ffad14258c Mon Sep 17 00:00:00 2001
From: Han Xiao
Date: Sun, 8 Mar 2015 16:55:33 +0200
Subject: [PATCH] training code runnable now, still tuning the parameters

---
 ptb.py         | 11 +++++-
 recnn.py       | 21 ++++++++---
 recnn_train.py | 93 +++++++++++++++++++++++++++++++++++++----------
 recnn_util.py  | 99 +++++++++++++++++++++++++++++++++++++++++---------
 test_recnn.py  | 12 +++++-
 5 files changed, 190 insertions(+), 46 deletions(-)

diff --git a/ptb.py b/ptb.py
index 4dfd4fc..95493ed 100644
--- a/ptb.py
+++ b/ptb.py
@@ -113,17 +113,24 @@ def collect_words(tree):
     )
 
 def get_leaves_with_labels(tree):
-    """return leaves in the tree, as well as their labels
+    """
+    Return the leaves in the tree, as well as their labels.
+
     >>> from ptb import parse
     >>> t = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
     >>> get_leaves_with_labels(t)
     [('A', 2), ('warm', 3), (',', 2), ('funny', 3), (',', 2), ('engaging', 4), ('film', 2), ('.', 2)]
+    >>> t = parse("(2 .)")
+    >>> get_leaves_with_labels(t)
+    [('.', 2)]
     """
+
     def aux(t):
         if len(t) == 2: # leaf
             return [(t[1], t[0])]
-        else:
+        elif len(t) == 3:
             return aux(t[1]) + aux(t[2])
+        else:
+            raise ValueError("node length should be 2 or 3 for input '%r'" % (t,))
     return aux(tree)
diff --git a/recnn.py b/recnn.py
index 0c1034f..ea7eeea 100644
--- a/recnn.py
+++ b/recnn.py
@@ -85,10 +85,14 @@ def load_from_theano_model(cls, model, word2id):
     def get_node_vector(self, node):
         if isinstance(node, tuple): # is internal node
-            assert len(node) == 3
-            left_node_vector = self.get_node_vector(node[1])
-            right_node_vector = self.get_node_vector(node[2])
-            return self.rntn_layer.output(left_node_vector, right_node_vector)
+            if len(node) == 3:
+                left_node_vector = self.get_node_vector(node[1])
+                right_node_vector = self.get_node_vector(node[2])
+                return self.rntn_layer.output(left_node_vector, right_node_vector)
+            elif len(node) == 2:
+                return self.get_node_vector(node[1])
+            else:
+                raise ValueError("invalid tuple length (should be 2 or 3)")
         else:
             assert isinstance(node, basestring)
             idx = (self.word2id[node]
@@ -97,6 +101,11 @@ def get_node_vector(self, node):
             return self.embedding[idx]
 
-    def predict(self, node):
+    def predict_all_nodes(self, nodes):
+        raise NotImplementedError
+
+    def predict_top_node(self, node):
         vec = self.get_node_vector(node)
-        return self.logreg_layer.predict(vec)
+        return self.logreg_layer.predict(vec)[0]
+
+
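Note on the recnn.py change: get_node_vector() now dispatches on tuple length
instead of asserting binary nodes. A minimal standalone sketch (not part of the
patch) of the tree encoding it accepts, with (label, word-or-subtree) for unary
nodes, (label, left, right) for binary nodes, and bare strings for words:

    def leaves(node):
        if isinstance(node, tuple):
            if len(node) == 3:       # (label, left, right)
                return leaves(node[1]) + leaves(node[2])
            elif len(node) == 2:     # (label, word-or-subtree)
                return leaves(node[1])
            raise ValueError("invalid tuple length (should be 2 or 3)")
        return [node]                # a bare word

    print leaves((5, "love", (3, (3, "you"), (3, "bro"))))
    # ['love', 'you', 'bro']
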
diff --git a/recnn_train.py b/recnn_train.py
index 5d8bc98..0afd445 100644
--- a/recnn_train.py
+++ b/recnn_train.py
@@ -6,6 +6,7 @@
 Socher, 2013, Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank
 
 """
+import sys
 import theano
 import theano.tensor as T
 
@@ -13,6 +14,13 @@
 
 from logreg import LogisticRegression
 
+from recnn import RNTN as NumpyRNTN
+from recnn_util import (collect_nodes,
+                        replace_tokens_by_condition,
+                        build_input,
+                        build_node_id_mapping)
+from adadelta import build_adadelta_updates
+
 class RNTNLayer(object):
     """ Recursive Tensor Neural Network layer
     that outputs:
@@ -64,6 +72,9 @@ def __init__(self,
             name = "W",
             borrow = True
         )
+
+        self.params = [self.V, self.W]
+        self.param_shapes = [self.V.get_value().shape, self.W.get_value().shape]
 
     def output(self, left_input, right_input):
         """
@@ -150,8 +161,7 @@ def update_embedding(child_indices, my_index, embedding):
             embedding, # if no child, return the word embedding
             T.set_subtensor(embedding[my_index], # otherwise, compute the embedding of RNTN layer
                             self.rntn_layer.output(embedding[child_indices[0]],
-                                                   embedding[child_indices[1]])
-                            # embedding[child_indices[0]] + embedding[child_indices[1]]
+                                                   embedding[child_indices[1]])
             )
         )
 
@@ -173,33 +183,78 @@ def update_embedding(child_indices, my_index, embedding):
             n_out = label_n
         )
 
-        self.cost = self.logreg_layer.nnl(y)
+        cost = self.logreg_layer.nnl(y)
 
-        self.params = [self.logreg_layer.W, self.logreg_layer.b, self.rntn_layer.V, self.rntn_layer.W, self.embedding]
-        self.grads = [T.grad(cost = self.cost, wrt=p) for p in self.params]
+        params = self.logreg_layer.params + self.rntn_layer.params + [self.embedding]
+        self.params = params
+
+        param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [(vocab_size, embed_dim)]
+
+        grads = [T.grad(cost = cost, wrt = p) for p in params]
+
+        updates = build_adadelta_updates(params, param_shapes, grads, epsilon = 0.1)
 
         # TODO: in this step, forward propagation is done again besides the one in `update_embedding`
         # this extra computation should be avoided
         self.train = theano.function(inputs = [x, y],
-                                     updates = [(p, p - 10*g)
-                                                for p,g in zip(self.params, self.grads)])
+                                     updates = updates)
 
-def main():
-    # shuffle data
-    from recnn_util import load_trees
-    from codecs import open
+def main(batch_size = 3):
+    import random
+    import numpy as np
+    from recnn_util import load_data
 
-    train_trees = load_trees(open("data/stanfordSentimentTreebank/trees/train.txt", "r", "utf8"))
-    dev_trees = load_trees(open("data/stanfordSentimentTreebank/trees/dev.txt", "r", "utf8"))
-    test_trees = load_trees(open("data/stanfordSentimentTreebank/trees/test.txt", "r", "utf8"))
+    train_trees, dev_trees, test_trees, token2id = load_data("data/stanford_sentiment_treebank.pkl")
+    sys.stderr.write("Data load done\n")
+
+    batch_number = len(train_trees) / batch_size
 
-    nodes = collect_nodes(trees)
-    token2id, _ = build_node_id_mapping(nodes)
+    x = T.imatrix('x')
+    y = T.ivector('y')
 
-    # for each mini-batch in
-    build_tree_matrix
-    # train the model()
+    model = RNTN(
+        x, y,
+        vocab_size = len(token2id),
+        embed_dim = 10,
+        label_n = 5,
+    )
+    sys.stderr.write("Model compilation done\n")
+
+    training_iter = 0
+    validation_frequency = 10
+
+    print "start training.."
+    while True:
+        # shuffle the data at the start of each epoch
+        random.shuffle(train_trees)
+
+        # for each mini-batch
+        for i in xrange(batch_number):
+            training_iter += 1
+
+            batch_trees = train_trees[i*batch_size:(i+1)*batch_size]
+            batch_nodes = collect_nodes(batch_trees)
+            x_input, y_input = build_input(batch_nodes, token2id) # avoid shadowing the symbolic `x` and `y`
+
+            # train the model
+            model.update_embedding(x_input)
+            model.train(x_input, y_input)
+
+            print "At iter %d" %(training_iter)
+            if training_iter % validation_frequency == 0:
+                classifier = NumpyRNTN.load_from_theano_model(model, token2id)
+
+                def accuracy(trees):
+                    prediction = np.array([classifier.predict_top_node(tree) for tree in trees])
+                    correct = np.array([tree[0] for tree in trees])
+                    return np.mean(prediction == correct)
+
+                print "At iter %d, train accuracy %.2f%%, dev accuracy %.2f%%" %(training_iter,
+                                                                                 accuracy(train_trees) * 100,
+                                                                                 accuracy(dev_trees) * 100)
+
+if __name__ == "__main__":
+    main()
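Note on the recnn_train.py change: the hand-rolled SGD updates (p - 10*g) are
replaced by adadelta.build_adadelta_updates, a module this patch does not
include. A minimal sketch of what such a helper could look like, following
Zeiler (2012), "ADADELTA: an adaptive learning rate method"; the signature and
the `rho` default are assumptions inferred from the call site above:

    import numpy as np
    import theano
    import theano.tensor as T

    def build_adadelta_updates(params, param_shapes, grads, rho = 0.95, epsilon = 0.1):
        updates = []
        for p, shape, g in zip(params, param_shapes, grads):
            # running averages of squared gradients and squared parameter updates
            acc_g  = theano.shared(np.zeros(shape, dtype = theano.config.floatX))
            acc_dx = theano.shared(np.zeros(shape, dtype = theano.config.floatX))
            new_acc_g = rho * acc_g + (1 - rho) * g ** 2
            # scale the gradient by the ratio of the two RMS terms
            dx = -T.sqrt(acc_dx + epsilon) / T.sqrt(new_acc_g + epsilon) * g
            new_acc_dx = rho * acc_dx + (1 - rho) * dx ** 2
            updates.extend([(acc_g, new_acc_g), (acc_dx, new_acc_dx), (p, p + dx)])
        return updates
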
diff --git a/recnn_util.py b/recnn_util.py
index 28f6f47..8fa2151 100644
--- a/recnn_util.py
+++ b/recnn_util.py
@@ -1,11 +1,21 @@
 """
 Utility for RecNN
 """
+import sys
 import numpy as np
 import operator
-import ptb
 from collections import OrderedDict
 
+import ptb
+from tree_stat import token_freq
+import codecs
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+UNK_TOKEN = "<UNK>"
 
 class CannotMergeAnyMoreException(Exception):
     pass
@@ -52,7 +62,8 @@ def collect_nodes(trees):
     >>> from ptb import parse
     >>> t1 = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
     >>> t2 = parse("(0 (0 (2 A) (0 (0 (0 boring) (2 ,)) (0 bad))) (1 (2 ,) (1 (1 (1 unsatisfactory) (2 film)) (2 .))))")
-    >>> data = collect_nodes([t1, t2])
+    >>> t3 = parse("(2 film)") # some repetition
+    >>> data = collect_nodes([t1, t2, t3])
     >>> len(data)
     24
     >>> data[-1]
@@ -81,8 +92,12 @@ def collect_nodes(trees):
         for token, label in tokens_with_labels:
             if token not in collected_tokens:
                 new_tokens_with_labels.append((token, label))
-
-        tokens, labels = zip(*new_tokens_with_labels)
+
+
+        if new_tokens_with_labels:
+            tokens, labels = zip(*new_tokens_with_labels)
+        else:
+            continue # nothing to add
 
         # add new tokens, their children and their labels
         all_tokens += [
@@ -104,7 +119,7 @@ def collect_nodes(trees):
 
     return all_tokens
 
-def replace_tokens_by_condition(nodes, condition_func, to_token = "<UNK>"):
+def replace_tokens_by_condition(nodes, condition_func, to_token = UNK_TOKEN, to_label = 3):
     """
     Replace tokens to target token by certain condition
 
@@ -113,9 +128,9 @@ def replace_tokens_by_condition(nodes, condition_func, to_token = UNK_TOKEN, to_label = 3):
     >>> nodes = [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
     >>> condition_func = lambda w: c[w] < 5 # `engaging` and `warm` should be filtered out
     >>> replace_tokens_by_condition(nodes, condition_func, to_token = "<UNK>")
-    [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('film', None, None, 2), ('A', None, None, 2), (('warm', ','), '<UNK>', ',', 3), (('engaging', 'film'), '<UNK>', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
+    [('<UNK>', None, None, 3), ('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('film', None, None, 2), ('A', None, None, 2), (('warm', ','), '<UNK>', ',', 3), (('engaging', 'film'), '<UNK>', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
     """
-    new_nodes = []
+    new_nodes = [(to_token, None, None, to_label)] # the to_token node itself should be added as well
 
     for node in nodes:
         parent, lchild, rchild, label = node
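Note: a hypothetical usage of the <UNK> replacement added above, on toy input
(the function names come from this patch; it assumes the repository modules are
importable):

    from collections import Counter
    from recnn_util import replace_tokens_by_condition, UNK_TOKEN

    freq = Counter({"funny": 11, "warm": 2})
    nodes = [("funny", None, None, 3),
             ("warm", None, None, 3),
             (("warm", "funny"), "warm", "funny", 4)]
    rare = lambda w: freq[w] < 5  # `warm` occurs fewer than 5 times
    print replace_tokens_by_condition(nodes, rare, to_token = UNK_TOKEN)
    # by analogy with the doctest above: the <UNK> leaf is prepended, the rare
    # `warm` leaf is dropped, and `warm` becomes <UNK> where it is a direct child
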
@@ -155,7 +170,7 @@ def build_node_id_mapping(nodes):
 
     return mapping
 
 def build_input(nodes, token2id):
-    """ 
+    """
     Param:
     ----------
@@ -165,23 +180,71 @@ def build_input(nodes, token2id):
     the tree nodes and token to index mapping
 
     Return:
     ----------
     1. tree matrix: numpy.array, Nx3, (token id, left child id, right child id)
     2. labels: numpy.array, 1xN or Nx1
 
-    >>> token2id = OrderedDict([('funny', 0), (',', 1), ('.', 2), ('engaging', 3), ('film', 4), ('warm', 5), ('A', 6), (('warm', ','), 7), (('engaging', 'film'), 8), ((('warm', ','), 'funny'), 9), ((('engaging', 'film'), '.'), 10), (('A', (('warm', ','), 'funny')), 11), ((',', (('engaging', 'film'), '.')), 12), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), 13)])
-    >>> nodes = [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
+    >>> token2id = OrderedDict([('<UNK>', 14), ('funny', 0), (',', 1), ('.', 2), ('engaging', 3), ('film', 4), ('warm', 5), ('A', 6), (('warm', ','), 7), (('engaging', 'film'), 8), ((('warm', ','), 'funny'), 9), ((('engaging', 'film'), '.'), 10), (('A', (('warm', ','), 'funny')), 11), ((',', (('engaging', 'film'), '.')), 12), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), 13)])
+    >>> nodes = [('balhword', None, None, 3), ('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
     >>> x, y = build_input(nodes, token2id)
     >>> x # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    array([[ 0, -1, -1],
+    array([[14, -1, -1],
+           [ 0, -1, -1],
            [ 1, -1, -1],
            ...
-           [13, 11, 12]])
+           [13, 11, 12]], dtype=int32)
     >>> y # doctest: +ELLIPSIS
-    array([3, 2, 2,..., 4])
+    array([3, 3, 2, 2,..., 4], dtype=int32)
     """
-    x = np.array([[token2id[t1], token2id.get(t2, -1), token2id.get(t3, -1)]
-                  for t1,t2,t3,_ in nodes])
-    y = np.array([y for _,_,_,y in nodes])
+    x_array = []
+    for t1, t2, t3, _ in nodes:
+        if t1 in token2id:
+            x_array.append([token2id[t1], token2id.get(t2, -1), token2id.get(t3, -1)])
+        else: # cope with unknown words
+            x_array.append([token2id[UNK_TOKEN], token2id.get(t2, -1), token2id.get(t3, -1)])
+
+    x = np.asarray(x_array, dtype=np.int32)
+    y = np.asarray([y for _,_,_,y in nodes], dtype=np.int32)
     return x, y
 
+def dump_data(train_path, dev_path, test_path, output_path = "data/stanford_sentiment_treebank.pkl"):
+    sys.stderr.write("loading trees..\n")
+    train_trees = ptb.load_trees(codecs.open(train_path, "r", "utf8"))
+    dev_trees = ptb.load_trees(codecs.open(dev_path, "r", "utf8"))
+    test_trees = ptb.load_trees(codecs.open(test_path, "r", "utf8"))
+
+    nodes = collect_nodes(train_trees)
+    freq_table = token_freq(train_trees)
+    rare_condition = lambda w: freq_table[w] < 5
+
+    sys.stderr.write("preprocessing trees..\n")
+    nodes = replace_tokens_by_condition(nodes, rare_condition)
+
+    sys.stderr.write("get vocabulary size\n")
+    word_number = len(filter(lambda node: node[1] is None, nodes))
+    sys.stderr.write("word_number = %d\n" %(word_number))
+
+    token2id = build_node_id_mapping(nodes)
+
+    assert "<UNK>" in token2id, "<UNK> should be in `token2id`"
+
+    data = (train_trees, dev_trees, test_trees, token2id)
+
+    pickle.dump(data, open(output_path, "w"))
+
+    return data
+
+def load_data(path = "data/stanford_sentiment_treebank.pkl"):
+    """
+    >>> data1 = dump_data("data/unittest_data/train.txt", \
+                          "data/unittest_data/dev.txt", \
+                          "data/unittest_data/test.txt", \
+                          "data/unittest_data/dump.pkl")
+    >>> data2 = load_data("data/unittest_data/dump.pkl")
+    >>> data1 == data2
+    True
+    """
+    return pickle.load(open(path, "r"))
+
 if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
+    dump_data("data/stanfordSentimentTreebank/trees/train.txt",
+              "data/stanfordSentimentTreebank/trees/dev.txt",
+              "data/stanfordSentimentTreebank/trees/test.txt")
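Note: a toy walk-through (not part of the patch) of the Nx3 convention that
build_input produces, where each row is (node id, left child id, right child
id) and -1 marks a missing child; it assumes recnn_util is importable:

    from collections import OrderedDict
    from recnn_util import build_input

    token2id = OrderedDict([("<UNK>", 3), ("good", 0), ("movie", 1),
                            (("good", "movie"), 2)])
    nodes = [("good", None, None, 3),
             ("movie", None, None, 2),
             (("good", "movie"), "good", "movie", 4)]
    x, y = build_input(nodes, token2id)
    print x   # [[ 0 -1 -1] [ 1 -1 -1] [ 2  0  1]]: leaves first, root last
    print y   # [3 2 4]: one sentiment label per node
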
diff --git a/test_recnn.py b/test_recnn.py
index 0cefe08..bcd66a3 100644
--- a/test_recnn.py
+++ b/test_recnn.py
@@ -33,10 +33,20 @@
                     [3, 1, 4]], dtype=np.int32)
 
-actual = np_model.get_node_vector((("love", ("you", "bro")), "love", (("you", "bro"), "you", "bro")))
+tree_input = (5, "love", (3, (3, "you"), (3, "bro")))
+actual = np_model.get_node_vector(tree_input)
 
 th_model.update_embedding(x_input)
 
 expected = th_model.embedding.get_value()[3]
 
 assert_matrix_eq(actual, expected, "node vector")
+
+get_label = theano.function(inputs = [x],
+                            outputs = th_model.logreg_layer.pred_y)
+
+score = np_model.predict_top_node(tree_input)
+
+assert isinstance(score, np.int64)
+
+assert_matrix_eq(score, get_label(x_input[1:2,:]), 'logreg.predict')
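Note: assert_matrix_eq is used by test_recnn.py but is not shown in this patch;
a minimal sketch of what such a helper might look like, assuming numpy-like
inputs (arrays or scalars):

    import numpy as np

    def assert_matrix_eq(actual, expected, name):
        # element-wise comparison with a small tolerance for float values
        assert np.allclose(np.asarray(actual), np.asarray(expected)), \
            "%s mismatch:\n%r\n!=\n%r" % (name, actual, expected)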