training code runnable now, still tuning the parameters
xiaohan2012 committed Mar 8, 2015
1 parent d454054 commit 428580c
Showing 5 changed files with 190 additions and 46 deletions.
11 changes: 9 additions & 2 deletions ptb.py
@@ -113,17 +113,24 @@ def collect_words(tree):
)

def get_leaves_with_labels(tree):
"""return leaves in the tree, as well as their labels
"""
Return leaves in the tree, as well as their labels
>>> from ptb import parse
>>> t = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
>>> get_leaves_with_labels(t)
[('A', 2), ('warm', 3), (',', 2), ('funny', 3), (',', 2), ('engaging', 4), ('film', 2), ('.', 2)]
>>> t = parse("(2 .)")
"""

    def aux(t):
        if len(t) == 2:  # leaf
            return [(t[1], t[0])]
        elif len(t) == 3:
            return aux(t[1]) + aux(t[2])
        else:
            raise ValueError("length should be 2 or 3 for input '%r'" % (t,))

    return aux(tree)

21 changes: 15 additions & 6 deletions recnn.py
@@ -85,10 +85,14 @@ def load_from_theano_model(cls, model, word2id):

    def get_node_vector(self, node):
        if isinstance(node, tuple):  # is internal node
            if len(node) == 3:
                left_node_vector = self.get_node_vector(node[1])
                right_node_vector = self.get_node_vector(node[2])
                return self.rntn_layer.output(left_node_vector, right_node_vector)
            elif len(node) == 2:
                return self.get_node_vector(node[1])
            else:
                raise ValueError("Invalid tuple length (should be 2 or 3)")
        else:
            assert isinstance(node, basestring)
            idx = (self.word2id[node]
@@ -97,6 +101,11 @@ def get_node_vector(self, node):

return self.embedding[idx]

    def predict_all_nodes(self, nodes):
        raise NotImplementedError

    def predict_top_node(self, node):
        vec = self.get_node_vector(node)
        return self.logreg_layer.predict(vec)[0]
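
For orientation, a minimal usage sketch of the node encoding these methods consume. The tuple format is taken from the updated test in test_recnn.py further down; np_model stands for an already-constructed NumpyRNTN instance and is an assumption for illustration:

    # internal nodes are (label, left, right) or (label, child) tuples; leaves are plain token strings
    tree = (5, "love", (3, (3, "you"), (3, "bro")))
    vec = np_model.get_node_vector(tree)      # recursively composes child vectors through the RNTN layer
    label = np_model.predict_top_node(tree)   # logistic-regression label for the root vector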


93 changes: 74 additions & 19 deletions recnn_train.py
Expand Up @@ -6,13 +6,21 @@
Socher, 2013, Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank
"""
import sys
import theano
import theano.tensor as T

import numpy as np

from logreg import LogisticRegression

from recnn import RNTN as NumpyRNTN
from recnn_util import (collect_nodes,
                        replace_tokens_by_condition,
                        build_input,
                        build_node_id_mapping)
from adadelta import build_adadelta_updates

class RNTNLayer(object):
""" Recursive Tensor Neural Network layer
that outputs:
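
(For reference, the composition an RNTN layer computes, per the Socher et al. (2013) paper cited in the module docstring, is p = f([a; b]^T V [a; b] + W [a; b]), where a and b are the d-dimensional child vectors, V is a d x 2d x 2d tensor, W is a d x 2d matrix, and f is an elementwise nonlinearity; the V and W registered in self.params below presumably hold these two parameter blocks.)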
@@ -64,6 +72,9 @@ def __init__(self,
name = "W",
borrow = True
)

self.params = [self.V, self.W]
self.param_shapes = [self.V.get_value().shape, self.W.get_value().shape]

def output(self, left_input, right_input):
"""
@@ -150,8 +161,7 @@ def update_embedding(child_indices, my_index, embedding):
            embedding,  # if no child, return the word embedding
            T.set_subtensor(embedding[my_index],  # otherwise, compute the embedding of RNTN layer
                            self.rntn_layer.output(embedding[child_indices[0]],
                                                   embedding[child_indices[1]])
            )
        )
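
As a rough plain-Python sketch of what this scan computes per node (illustrative only: rows of the input matrix follow the (token id, left child id, right child id) layout produced by build_input, with -1 marking a missing child, and rntn_output stands in for the symbolic rntn_layer.output):

    for my_index, left, right in x:
        if left >= 0:  # internal node: overwrite its embedding slot with the composed vector
            embedding[my_index] = rntn_output(embedding[left], embedding[right])
        # leaf rows (left == -1) keep their word embeddings untouched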

@@ -173,33 +183,78 @@ def update_embedding(child_indices, my_index, embedding):
            n_out = label_n
        )

        cost = self.logreg_layer.nnl(y)

        params = self.logreg_layer.params + self.rntn_layer.params + [self.embedding]
        self.params = params

        param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [(vocab_size, embed_dim)]

        grads = [T.grad(cost = cost, wrt = p) for p in params]

        updates = build_adadelta_updates(params, param_shapes, grads, epsilon = 0.1)

        # TODO: forward propagation is done here again, in addition to the one in `update_embedding`;
        # this extra computation should be avoided
        self.train = theano.function(inputs = [x, y],
                                     updates = updates)
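
build_adadelta_updates comes from the repo's adadelta.py, which this diff does not show. As a hedged sketch, the per-parameter rule it presumably implements is the standard AdaDelta update (Zeiler, 2012); rho and the exact epsilon placement are assumptions here, since only epsilon = 0.1 is visible above:

    import numpy as np

    def adadelta_step(param, grad, acc_g2, acc_dx2, rho = 0.95, epsilon = 0.1):
        acc_g2 = rho * acc_g2 + (1 - rho) * grad ** 2  # decaying average of squared gradients
        dx = -np.sqrt(acc_dx2 + epsilon) / np.sqrt(acc_g2 + epsilon) * grad  # RMS-scaled step
        acc_dx2 = rho * acc_dx2 + (1 - rho) * dx ** 2  # decaying average of squared updates
        return param + dx, acc_g2, acc_dx2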


def main(batch_size = 3):

    import random
    from recnn_util import load_data

    train_trees, dev_trees, test_trees, token2id = load_data("data/stanford_sentiment_treebank.pkl")
    sys.stderr.write("Data load done\n")

    batch_number = len(train_trees) / batch_size

    x = T.imatrix('x')
    y = T.ivector('y')

    model = RNTN(
        x, y,
        vocab_size = len(token2id),
        embed_dim = 10,
        label_n = 5,
    )

    sys.stderr.write("Model compilation done\n")

    training_iter = 0
    validation_frequency = 10

    print "start training.."
    while True:
        # shuffle data
        random.shuffle(train_trees)

        # for each mini-batch
        for i in xrange(batch_number):
            training_iter += 1

            batch_trees = train_trees[i*batch_size:(i+1)*batch_size]
            batch_nodes = collect_nodes(batch_trees)
            x, y = build_input(batch_nodes, token2id)

            # train the model
            model.update_embedding(x)
            model.train(x, y)

            print "At iter %d" % (training_iter)

            if training_iter % validation_frequency == 0:
                classifier = NumpyRNTN.load_from_theano_model(model, token2id)

                def accuracy(trees):
                    prediction = np.array([classifier.predict_top_node(tree) for tree in trees])
                    correct = np.array([tree[0] for tree in trees])
                    return np.mean(prediction == correct)

                print "At iter %d, train accuracy %.2f%%, dev accuracy %.2f%%" % (training_iter,
                                                                                  accuracy(train_trees) * 100,
                                                                                  accuracy(dev_trees) * 100)


if __name__ == "__main__":
    main()
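
Given the two __main__ hooks in this commit, training is presumably started by running recnn_util.py first (its main block calls dump_data to write data/stanford_sentiment_treebank.pkl) and then recnn_train.py, which invokes main() above.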
99 changes: 81 additions & 18 deletions recnn_util.py
@@ -1,11 +1,21 @@
"""
Utility for RecNN
"""
import sys
import numpy as np
import operator
from collections import OrderedDict

import ptb
from tree_stat import token_freq
import codecs

try:
    import cPickle as pickle
except ImportError:
    import pickle

UNK_TOKEN = "<UNK>"

class CannotMergeAnyMoreException(Exception):
pass
@@ -52,7 +62,8 @@ def collect_nodes(trees):
>>> from ptb import parse
>>> t1 = parse("(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .))))")
>>> t2 = parse("(0 (0 (2 A) (0 (0 (0 boring) (2 ,)) (0 bad))) (1 (2 ,) (1 (1 (1 unsatisfactory) (2 film)) (2 .))))")
>>> t3 = parse("(2 film)") # some repetition
>>> data = collect_nodes([t1, t2, t3])
>>> len(data)
24
>>> data[-1]
@@ -81,8 +92,12 @@ def collect_nodes(trees):
        for token, label in tokens_with_labels:
            if token not in collected_tokens:
                new_tokens_with_labels.append((token, label))

        if new_tokens_with_labels:
            tokens, labels = zip(*new_tokens_with_labels)
        else:
            continue  # nothing to add

        # add new tokens, their children and their labels
        all_tokens += [
@@ -104,7 +119,7 @@ def collect_nodes(trees):

return all_tokens

def replace_tokens_by_condition(nodes, condition_func, to_token = UNK_TOKEN, to_label = 3):
"""
Replace tokens with the target token under a certain condition
@@ -113,9 +128,9 @@ def replace_tokens_by_condition(nodes, condition_func, to_token = "<UNK>"):
>>> nodes = [('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
>>> condition_func = lambda w: c[w] < 5 # `engaging` and `warm` should be filtered out
>>> replace_tokens_by_condition(nodes, condition_func, to_token = "<UNK>")
[('<UNK>', None, None, 3), ('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('film', None, None, 2), ('A', None, None, 2), (('warm', ','), '<UNK>', ',', 3), (('engaging', 'film'), '<UNK>', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
"""
new_nodes = [(to_token, None, None, to_label)] # to_token should be added also
for node in nodes:
parent, lchild, rchild, label = node

@@ -155,7 +170,7 @@ def build_node_id_mapping(nodes):
return mapping

def build_input(nodes, token2id):
"""
"""
Param:
----------
the tree nodes and token to index mapping
@@ -165,23 +180,71 @@ def build_input(nodes, token2id):
    1. tree matrix: numpy.array, Nx3, (token id, left child id, right child id)
    2. labels: numpy.array, 1xN or Nx1

    >>> token2id = OrderedDict([('<UNK>', 14), ('funny', 0), (',', 1), ('.', 2), ('engaging', 3), ('film', 4), ('warm', 5), ('A', 6), (('warm', ','), 7), (('engaging', 'film'), 8), ((('warm', ','), 'funny'), 9), ((('engaging', 'film'), '.'), 10), (('A', (('warm', ','), 'funny')), 11), ((',', (('engaging', 'film'), '.')), 12), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), 13)])
    >>> nodes = [('balhword', None, None, 3), ('funny', None, None, 3), (',', None, None, 2), ('.', None, None, 2), ('engaging', None, None, 4), ('film', None, None, 2), ('warm', None, None, 3), ('A', None, None, 2), (('warm', ','), 'warm', ',', 3), (('engaging', 'film'), 'engaging', 'film', 4), ((('warm', ','), 'funny'), ('warm', ','), 'funny', 4), ((('engaging', 'film'), '.'), ('engaging', 'film'), '.', 3), (('A', (('warm', ','), 'funny')), 'A', (('warm', ','), 'funny'), 4), ((',', (('engaging', 'film'), '.')), ',', (('engaging', 'film'), '.'), 3), ((('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.'))), ('A', (('warm', ','), 'funny')), (',', (('engaging', 'film'), '.')), 4)]
    >>> x, y = build_input(nodes, token2id)
    >>> x # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    array([[14, -1, -1],
           [ 0, -1, -1],
           [ 1, -1, -1],
           ...
           [13, 11, 12]], dtype=int32)
    >>> y # doctest: +ELLIPSIS
    array([3, 3, 2, 2,..., 4], dtype=int32)
    """
    x_array = []
    for t1, t2, t3, _ in nodes:
        if t1 in token2id:
            x_array.append([token2id[t1], token2id.get(t2, -1), token2id.get(t3, -1)])
        else:  # cope with unknown words
            x_array.append([token2id[UNK_TOKEN], token2id.get(t2, -1), token2id.get(t3, -1)])

    x = np.asarray(x_array, dtype = np.int32)
    y = np.asarray([y for _, _, _, y in nodes], dtype = np.int32)

    return x, y

def dump_data(train_path, dev_path, test_path, output_path = "data/stanford_sentiment_treebank.pkl"):
    sys.stderr.write("loading trees..\n")
    train_trees = ptb.load_trees(codecs.open(train_path, "r", "utf8"))
    dev_trees = ptb.load_trees(codecs.open(dev_path, "r", "utf8"))
    test_trees = ptb.load_trees(codecs.open(test_path, "r", "utf8"))

    nodes = collect_nodes(train_trees)
    freq_table = token_freq(train_trees)
    rare_condition = lambda w: freq_table[w] < 5

    sys.stderr.write("preprocessing trees..\n")
    nodes = replace_tokens_by_condition(nodes, rare_condition)

    sys.stderr.write("get vocabulary size\n")
    word_number = len(filter(lambda node: node[1] is None, nodes))
    sys.stderr.write("word_number = %d\n" % (word_number))

    token2id = build_node_id_mapping(nodes)

    assert UNK_TOKEN in token2id, "%s should be in `token2id`" % UNK_TOKEN

    data = (train_trees, dev_trees, test_trees, token2id)

    pickle.dump(data, open(output_path, "w"))

    return data

def load_data(path = "data/stanford_sentiment_treebank.pkl"):
    """
    >>> data1 = dump_data("data/unittest_data/train.txt", \
                          "data/unittest_data/dev.txt", \
                          "data/unittest_data/test.txt", \
                          "data/unittest_data/dump.pkl")
    >>> data2 = load_data("data/unittest_data/dump.pkl")
    >>> data1 == data2
    True
    """
    return pickle.load(open(path, "r"))

if __name__ == "__main__":
    import doctest
    doctest.testmod()

    dump_data("data/stanfordSentimentTreebank/trees/train.txt",
              "data/stanfordSentimentTreebank/trees/dev.txt",
              "data/stanfordSentimentTreebank/trees/test.txt")
12 changes: 11 additions & 1 deletion test_recnn.py
@@ -33,10 +33,20 @@
[3, 1, 4]],
dtype=np.int32)

tree_input = (5, "love", (3, (3, "you"), (3, "bro")))
actual = np_model.get_node_vector(tree_input)

th_model.update_embedding(x_input)

expected = th_model.embedding.get_value()[3]

assert_matrix_eq(actual, expected, "node vector")

get_label = theano.function(inputs = [x],
                            outputs = th_model.logreg_layer.pred_y)

score = np_model.predict_top_node(tree_input)

assert isinstance(score, np.int64)

assert_matrix_eq(score, get_label(x_input[1:2,:]), 'logreg.predict')
