From 428580c4591dfe2b984201582ed7a6ffad14258c Mon Sep 17 00:00:00 2001
From: Han Xiao
Date: Sun, 8 Mar 2015 16:55:33 +0200
Subject: [PATCH] training code runnable now, still tuning the parameters

---
 ptb.py         | 11 +++++-
 recnn.py       | 21 ++++++++---
 recnn_train.py | 93 +++++++++++++++++++++++++++++++++++++----------
 recnn_util.py  | 99 +++++++++++++++++++++++++++++++++++++++++---------
 test_recnn.py  | 12 +++++-
 5 files changed, 190 insertions(+), 46 deletions(-)

diff --git a/ptb.py b/ptb.py
index 4dfd4fc..95493ed 100644
--- a/ptb.py
+++ b/ptb.py
@@ -113,17 +113,24 @@ def collect_words(tree):
     )
 
 def get_leaves_with_labels(tree):
-    """return leaves in the tree, as well as their labels
+    """
+    Return the leaves in the tree, as well as their labels.
+
     >>> from ptb import parse
     >>> t = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
     >>> get_leaves_with_labels(t)
     [('A', 2), ('warm', 3), (',', 2), ('funny', 3), (',', 2), ('engaging', 4), ('film', 2), ('.', 2)]
+    >>> t = parse("(2 .)")
+    >>> get_leaves_with_labels(t)
+    [('.', 2)]
     """
+
     def aux(t):
         if len(t) == 2: # leaf
             return [(t[1], t[0])]
-        else:
+        elif len(t) == 3:
             return aux(t[1]) + aux(t[2])
+        else:
+            raise ValueError("node length should be 2 or 3 for input '%r'" % (t,))
     return aux(tree)
diff --git a/recnn.py b/recnn.py
index 0c1034f..ea7eeea 100644
--- a/recnn.py
+++ b/recnn.py
@@ -85,10 +85,14 @@ def load_from_theano_model(cls, model, word2id):
     def get_node_vector(self, node):
         if isinstance(node, tuple): # is internal node
-            assert len(node) == 3
-            left_node_vector = self.get_node_vector(node[1])
-            right_node_vector = self.get_node_vector(node[2])
-            return self.rntn_layer.output(left_node_vector, right_node_vector)
+            if len(node) == 3:
+                left_node_vector = self.get_node_vector(node[1])
+                right_node_vector = self.get_node_vector(node[2])
+                return self.rntn_layer.output(left_node_vector, right_node_vector)
+            elif len(node) == 2:
+                return self.get_node_vector(node[1])
+            else:
+                raise ValueError("invalid tuple length (should be 2 or 3)")
         else:
             assert isinstance(node, basestring)
             idx = (self.word2id[node]
@@ -97,6 +101,11 @@ def get_node_vector(self, node):
             return self.embedding[idx]
 
-    def predict(self, node):
+    def predict_all_nodes(self, nodes):
+        raise NotImplementedError
+
+    def predict_top_node(self, node):
         vec = self.get_node_vector(node)
-        return self.logreg_layer.predict(vec)
+        return self.logreg_layer.predict(vec)[0]
+
+
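Note on the recnn.py change: get_node_vector() now dispatches on tuple length
instead of asserting binary nodes. A minimal standalone sketch (not part of the
patch) of the tree encoding it accepts, with (label, word-or-subtree) for unary
nodes, (label, left, right) for binary nodes, and bare strings for words:

    def leaves(node):
        if isinstance(node, tuple):
            if len(node) == 3:       # (label, left, right)
                return leaves(node[1]) + leaves(node[2])
            elif len(node) == 2:     # (label, word-or-subtree)
                return leaves(node[1])
            raise ValueError("invalid tuple length (should be 2 or 3)")
        return [node]                # a bare word

    print leaves((5, "love", (3, (3, "you"), (3, "bro"))))
    # ['love', 'you', 'bro']
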
diff --git a/recnn_train.py b/recnn_train.py
index 5d8bc98..0afd445 100644
--- a/recnn_train.py
+++ b/recnn_train.py
@@ -6,6 +6,7 @@
 Socher, 2013, Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank
 
 """
+import sys
 import theano
 import theano.tensor as T
 
@@ -13,6 +14,13 @@
 
 from logreg import LogisticRegression
 
+from recnn import RNTN as NumpyRNTN
+from recnn_util import (collect_nodes,
+                        replace_tokens_by_condition,
+                        build_input,
+                        build_node_id_mapping)
+from adadelta import build_adadelta_updates
+
 class RNTNLayer(object):
     """ Recursive Tensor Neural Network layer
     that outputs:
@@ -64,6 +72,9 @@ def __init__(self,
             name = "W",
             borrow = True
         )
+
+        self.params = [self.V, self.W]
+        self.param_shapes = [self.V.get_value().shape, self.W.get_value().shape]
 
     def output(self, left_input, right_input):
         """
@@ -150,8 +161,7 @@ def update_embedding(child_indices, my_index, embedding):
             embedding, # if no child, return the word embedding
             T.set_subtensor(embedding[my_index], # otherwise, compute the embedding of RNTN layer
                             self.rntn_layer.output(embedding[child_indices[0]],
-                                                   embedding[child_indices[1]])
-                            # embedding[child_indices[0]] + embedding[child_indices[1]]
+                                                   embedding[child_indices[1]])
             )
         )
 
@@ -173,33 +183,78 @@ def update_embedding(child_indices, my_index, embedding):
             n_out = label_n
         )
 
-        self.cost = self.logreg_layer.nnl(y)
+        cost = self.logreg_layer.nnl(y)
 
-        self.params = [self.logreg_layer.W, self.logreg_layer.b, self.rntn_layer.V, self.rntn_layer.W, self.embedding]
-        self.grads = [T.grad(cost = self.cost, wrt=p) for p in self.params]
+        params = self.logreg_layer.params + self.rntn_layer.params + [self.embedding]
+        self.params = params
+
+        param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [(vocab_size, embed_dim)]
+
+        grads = [T.grad(cost = cost, wrt = p) for p in params]
+
+        updates = build_adadelta_updates(params, param_shapes, grads, epsilon = 0.1)
 
         # TODO: in this step, forward propagation is done again besides the one in `update_embedding`
         # this extra computation should be avoided
         self.train = theano.function(inputs = [x, y],
-                                     updates = [(p, p - 10*g)
-                                                for p,g in zip(self.params, self.grads)])
+                                     updates = updates)
 
-def main():
-    # shuffle data
-    from recnn_util import load_trees
-    from codecs import open
+def main(batch_size = 3):
+    import random
+    import numpy as np
+    from recnn_util import load_data
 
-    train_trees = load_trees(open("data/stanfordSentimentTreebank/trees/train.txt", "r", "utf8"))
-    dev_trees = load_trees(open("data/stanfordSentimentTreebank/trees/dev.txt", "r", "utf8"))
-    test_trees = load_trees(open("data/stanfordSentimentTreebank/trees/test.txt", "r", "utf8"))
+    train_trees, dev_trees, test_trees, token2id = load_data("data/stanford_sentiment_treebank.pkl")
+    sys.stderr.write("Data load done\n")
+
+    batch_number = len(train_trees) / batch_size
 
-    nodes = collect_nodes(trees)
-    token2id, _ = build_node_id_mapping(nodes)
+    x = T.imatrix('x')
+    y = T.ivector('y')
 
-    # for each mini-batch in
-    build_tree_matrix
-    # train the model()
+    model = RNTN(
+        x, y,
+        vocab_size = len(token2id),
+        embed_dim = 10,
+        label_n = 5,
+    )
+    sys.stderr.write("Model compilation done\n")
+
+    training_iter = 0
+    validation_frequency = 10
+
+    print "start training.."
+    while True:
+        # shuffle the data at the start of each epoch
+        random.shuffle(train_trees)
+
+        # for each mini-batch
+        for i in xrange(batch_number):
+            training_iter += 1
+
+            batch_trees = train_trees[i*batch_size:(i+1)*batch_size]
+            batch_nodes = collect_nodes(batch_trees)
+            x_input, y_input = build_input(batch_nodes, token2id) # avoid shadowing the symbolic `x` and `y`
+
+            # train the model
+            model.update_embedding(x_input)
+            model.train(x_input, y_input)
+
+            print "At iter %d" %(training_iter)
+            if training_iter % validation_frequency == 0:
+                classifier = NumpyRNTN.load_from_theano_model(model, token2id)
+
+                def accuracy(trees):
+                    prediction = np.array([classifier.predict_top_node(tree) for tree in trees])
+                    correct = np.array([tree[0] for tree in trees])
+                    return np.mean(prediction == correct)
+
+                print "At iter %d, train accuracy %.2f%%, dev accuracy %.2f%%" %(training_iter,
+                                                                                 accuracy(train_trees) * 100,
+                                                                                 accuracy(dev_trees) * 100)
+
+if __name__ == "__main__":
+    main()
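Note on the recnn_train.py change: the hand-rolled SGD updates (p - 10*g) are
replaced by adadelta.build_adadelta_updates, a module this patch does not
include. A minimal sketch of what such a helper could look like, following
Zeiler (2012), "ADADELTA: an adaptive learning rate method"; the signature and
the `rho` default are assumptions inferred from the call site above:

    import numpy as np
    import theano
    import theano.tensor as T

    def build_adadelta_updates(params, param_shapes, grads, rho = 0.95, epsilon = 0.1):
        updates = []
        for p, shape, g in zip(params, param_shapes, grads):
            # running averages of squared gradients and squared parameter updates
            acc_g  = theano.shared(np.zeros(shape, dtype = theano.config.floatX))
            acc_dx = theano.shared(np.zeros(shape, dtype = theano.config.floatX))
            new_acc_g = rho * acc_g + (1 - rho) * g ** 2
            # scale the gradient by the ratio of the two RMS terms
            dx = -T.sqrt(acc_dx + epsilon) / T.sqrt(new_acc_g + epsilon) * g
            new_acc_dx = rho * acc_dx + (1 - rho) * dx ** 2
            updates.extend([(acc_g, new_acc_g), (acc_dx, new_acc_dx), (p, p + dx)])
        return updates
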
diff --git a/recnn_util.py b/recnn_util.py
index 28f6f47..8fa2151 100644
--- a/recnn_util.py
+++ b/recnn_util.py
@@ -1,11 +1,21 @@
 """
 Utility for RecNN
 """
+import sys
 import numpy as np
 import operator
-import ptb
 from collections import OrderedDict
 
+import ptb
+from tree_stat import token_freq
+import codecs
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+UNK_TOKEN = "<UNK>"
 
 class CannotMergeAnyMoreException(Exception):
     pass
@@ -52,7 +62,8 @@ def collect_nodes(trees):
     >>> from ptb import parse
     >>> t1 = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
     >>> t2 = parse("(0 (0 (2 A) (0 (0 (0 boring) (2 ,)) (0 bad))) (1 (2 ,) (1 (1 (1 unsatisfactory) (2 film)) (2 .))))")
-    >>> data = collect_nodes([t1, t2])
+    >>> t3 = parse("(2 film)") # some repetition
+    >>> data = collect_nodes([t1, t2, t3])
     >>> len(data)
     24
     >>> data[-1]
@@ -81,8 +92,12 @@ def collect_nodes(trees):
         for token, label in tokens_with_labels:
             if token not in collected_tokens:
                 new_tokens_with_labels.append((token, label))
-
-        tokens, labels = zip(*new_tokens_with_labels)
+
+
+        if new_tokens_with_labels:
+            tokens, labels = zip(*new_tokens_with_labels)
+        else:
+            continue # nothing to add
 
         # add new tokens, their children and their labels
         all_tokens += [
@@ -104,7 +119,7 @@ def collect_nodes(trees):
 
     return all_tokens
 
-def replace_tokens_by_condition(nodes, condition_func, to_token = "<UNK>"):
+def replace_tokens_by_condition(nodes, condition_func, to_token = UNK_TOKEN, to_label = 3):
     """
     Replace tokens to target token by certain condition
 
@@ -113,9 +128,9 @@ def replace_tokens_by_condition(nodes, condition_func, to_token = UNK_TOKEN, to_label = 3):
     >>> nodes = [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
     >>> condition_func = lambda w: c[w] < 5 # `engaging` and `warm` should be filtered out
     >>> replace_tokens_by_condition(nodes, condition_func, to_token = "<UNK>")
-    [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('film', None, None, 2), ('A', None, None, 2), (('warm', ','), '<UNK>', ',', 3), (('engaging', 'film'), '<UNK>', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
+    [('<UNK>', None, None, 3), ('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('film', None, None, 2), ('A', None, None, 2), (('warm', ','), '<UNK>', ',', 3), (('engaging', 'film'), '<UNK>', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
     """
-    new_nodes = []
+    new_nodes = [(to_token, None, None, to_label)] # the to_token node itself should be added as well
 
     for node in nodes:
         parent, lchild, rchild, label = node
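Note: a hypothetical usage of the <UNK> replacement added above, on toy input
(the function names come from this patch; it assumes the repository modules are
importable):

    from collections import Counter
    from recnn_util import replace_tokens_by_condition, UNK_TOKEN

    freq = Counter({"funny": 11, "warm": 2})
    nodes = [("funny", None, None, 3),
             ("warm", None, None, 3),
             (("warm", "funny"), "warm", "funny", 4)]
    rare = lambda w: freq[w] < 5  # `warm` occurs fewer than 5 times
    print replace_tokens_by_condition(nodes, rare, to_token = UNK_TOKEN)
    # by analogy with the doctest above: the <UNK> leaf is prepended, the rare
    # `warm` leaf is dropped, and `warm` becomes <UNK> where it is a direct child
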
@@ -155,7 +170,7 @@ def build_node_id_mapping(nodes):
 
     return mapping
 
 def build_input(nodes, token2id):
-    """ 
+    """
     Param:
     ----------
@@ -165,23 +180,71 @@ def build_input(nodes, token2id):
     the tree nodes and token to index mapping
 
     Return:
     ----------
     1. tree matrix: numpy.array, Nx3, (token id, left child id, right child id)
     2. labels: numpy.array, 1xN or Nx1
 
-    >>> token2id = OrderedDict([('funny', 0), (',', 1), ('.', 2), ('engaging', 3), ('film', 4), ('warm', 5), ('A', 6), (('warm', ','), 7), (('engaging', 'film'), 8), ((('warm', ','), 'funny'), 9), ((('engaging', 'film'), '.'), 10), (('A', (('warm', ','), 'funny')), 11), ((',', (('engaging', 'film'), '.')), 12), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), 13)])
-    >>> nodes = [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
+    >>> token2id = OrderedDict([('<UNK>', 14), ('funny', 0), (',', 1), ('.', 2), ('engaging', 3), ('film', 4), ('warm', 5), ('A', 6), (('warm', ','), 7), (('engaging', 'film'), 8), ((('warm', ','), 'funny'), 9), ((('engaging', 'film'), '.'), 10), (('A', (('warm', ','), 'funny')), 11), ((',', (('engaging', 'film'), '.')), 12), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), 13)])
+    >>> nodes = [('balhword', None, None, 3), ('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
     >>> x, y = build_input(nodes, token2id)
     >>> x # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    array([[ 0, -1, -1],
+    array([[14, -1, -1],
+           [ 0, -1, -1],
            [ 1, -1, -1],
            ...
-           [13, 11, 12]])
+           [13, 11, 12]], dtype=int32)
     >>> y # doctest: +ELLIPSIS
-    array([3, 2, 2,..., 4])
+    array([3, 3, 2, 2,..., 4], dtype=int32)
     """
-    x = np.array([[token2id[t1], token2id.get(t2, -1), token2id.get(t3, -1)]
-                  for t1,t2,t3,_ in nodes])
-    y = np.array([y for _,_,_,y in nodes])
+    x_array = []
+    for t1, t2, t3, _ in nodes:
+        if t1 in token2id:
+            x_array.append([token2id[t1], token2id.get(t2, -1), token2id.get(t3, -1)])
+        else: # cope with unknown words
+            x_array.append([token2id[UNK_TOKEN], token2id.get(t2, -1), token2id.get(t3, -1)])
+
+    x = np.asarray(x_array, dtype=np.int32)
+    y = np.asarray([y for _,_,_,y in nodes], dtype=np.int32)
     return x, y
 
+def dump_data(train_path, dev_path, test_path, output_path = "data/stanford_sentiment_treebank.pkl"):
+    sys.stderr.write("loading trees..\n")
+    train_trees = ptb.load_trees(codecs.open(train_path, "r", "utf8"))
+    dev_trees = ptb.load_trees(codecs.open(dev_path, "r", "utf8"))
+    test_trees = ptb.load_trees(codecs.open(test_path, "r", "utf8"))
+
+    nodes = collect_nodes(train_trees)
+    freq_table = token_freq(train_trees)
+    rare_condition = lambda w: freq_table[w] < 5
+
+    sys.stderr.write("preprocessing trees..\n")
+    nodes = replace_tokens_by_condition(nodes, rare_condition)
+
+    sys.stderr.write("get vocabulary size\n")
+    word_number = len(filter(lambda node: node[1] is None, nodes))
+    sys.stderr.write("word_number = %d\n" %(word_number))
+
+    token2id = build_node_id_mapping(nodes)
+
+    assert "<UNK>" in token2id, "<UNK> should be in `token2id`"
+
+    data = (train_trees, dev_trees, test_trees, token2id)
+
+    pickle.dump(data, open(output_path, "w"))
+
+    return data
+
+def load_data(path = "data/stanford_sentiment_treebank.pkl"):
+    """
+    >>> data1 = dump_data("data/unittest_data/train.txt", \
+                          "data/unittest_data/dev.txt", \
+                          "data/unittest_data/test.txt", \
+                          "data/unittest_data/dump.pkl")
+    >>> data2 = load_data("data/unittest_data/dump.pkl")
+    >>> data1 == data2
+    True
+    """
+    return pickle.load(open(path, "r"))
+
 if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
+    dump_data("data/stanfordSentimentTreebank/trees/train.txt",
+              "data/stanfordSentimentTreebank/trees/dev.txt",
+              "data/stanfordSentimentTreebank/trees/test.txt")
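Note: a toy walk-through (not part of the patch) of the Nx3 convention that
build_input produces, where each row is (node id, left child id, right child
id) and -1 marks a missing child; it assumes recnn_util is importable:

    from collections import OrderedDict
    from recnn_util import build_input

    token2id = OrderedDict([("<UNK>", 3), ("good", 0), ("movie", 1),
                            (("good", "movie"), 2)])
    nodes = [("good", None, None, 3),
             ("movie", None, None, 2),
             (("good", "movie"), "good", "movie", 4)]
    x, y = build_input(nodes, token2id)
    print x   # [[ 0 -1 -1] [ 1 -1 -1] [ 2  0  1]]: leaves first, root last
    print y   # [3 2 4]: one sentiment label per node
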
diff --git a/test_recnn.py b/test_recnn.py
index 0cefe08..bcd66a3 100644
--- a/test_recnn.py
+++ b/test_recnn.py
@@ -33,10 +33,20 @@
                     [3, 1, 4]], dtype=np.int32)
 
-actual = np_model.get_node_vector((("love", ("you", "bro")), "love", (("you", "bro"), "you", "bro")))
+tree_input = (5, "love", (3, (3, "you"), (3, "bro")))
+actual = np_model.get_node_vector(tree_input)
 
 th_model.update_embedding(x_input)
 
 expected = th_model.embedding.get_value()[3]
 
 assert_matrix_eq(actual, expected, "node vector")
+
+get_label = theano.function(inputs = [x],
+                            outputs = th_model.logreg_layer.pred_y)
+
+score = np_model.predict_top_node(tree_input)
+
+assert isinstance(score, np.int64)
+
+assert_matrix_eq(score, get_label(x_input[1:2,:]), 'logreg.predict')
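Note: assert_matrix_eq is used by test_recnn.py but is not shown in this patch;
a minimal sketch of what such a helper might look like, assuming numpy-like
inputs (arrays or scalars):

    import numpy as np

    def assert_matrix_eq(actual, expected, name):
        # element-wise comparison with a small tolerance for float values
        assert np.allclose(np.asarray(actual), np.asarray(expected)), \
            "%s mismatch:\n%r\n!=\n%r" % (name, actual, expected)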