In [1]:
import argparse
import codecs
import collections
import random
import re
import time

import numpy as np

import chainer
from chainer import cuda
import chainer.functions as F
import chainer.links as L
from chainer import optimizers

In [2]:
# Array module used to build the input arrays. It is NumPy here, so the
# arrays handed to the model are ordinary CPU arrays.
xp = np

# Hyperparameters and run settings read by the cells below.
#n_epoch = 400       # number of epochs
n_epoch = 5         # number of epochs (the 400-epoch setting is commented out above)
n_units = 30        # number of units per layer
batchsize = 25     # minibatch size
n_label = 5         # number of labels
epoch_per_eval = 5  # number of epochs per evaluation
test = False        # NOTE(review): not referenced in the cells shown here

In [3]:
class SexpParser(object):
    """Recursive-descent reader for S-expressions held in a single string.

    The input is split once into tokens; ``parse`` then consumes tokens from
    the current cursor position ``pos`` and can be called again to read a
    following expression from the same line.
    """

    def __init__(self, line):
        # A token is an opening paren, a closing paren, or a maximal run of
        # characters that are neither parentheses nor spaces.
        self.tokens = re.findall(r'\(|\)|[^\(\) ]+', line)
        self.pos = 0

    def parse(self):
        """Consume and return the next expression.

        Returns:
            The token string for an atom, or a list of parsed children for a
            parenthesised group.

        Raises:
            AssertionError: if the tokens are exhausted, or the expression
                starts with ``')'``.
        """
        assert self.pos < len(self.tokens)
        head = self.tokens[self.pos]
        assert head != ')'
        self.pos += 1

        # Atom: the token itself is the value.
        if head != '(':
            return head

        # Group: keep parsing children until the matching ')' is reached.
        items = []
        while True:
            assert self.pos < len(self.tokens)
            if self.tokens[self.pos] == ')':
                break
            items.append(self.parse())
        self.pos += 1  # step over the closing ')'
        return items

In [4]:
def convert_tree(vocab, exp):
    """Turn a parsed S-expression into the nested-dict tree used for training.

    Args:
        vocab: dict mapping word -> integer id. Unseen words are added in
            place, receiving the current size of the dict as their id.
        exp: a list of two items ``[label, word]`` (leaf) or three items
            ``[label, left, right]`` (internal node).

    Returns:
        ``{'label': int, 'node': word_id}`` for a leaf, or
        ``{'label': int, 'node': (left_tree, right_tree)}`` for an internal
        node.

    Raises:
        AssertionError: if ``exp`` is not a list of length 2 or 3.
    """
    assert isinstance(exp, list) and len(exp) in (2, 3)

    if len(exp) == 3:
        # Internal node: convert both subtrees, left one first.
        tag, lhs, rhs = exp
        children = (convert_tree(vocab, lhs), convert_tree(vocab, rhs))
        return {'label': int(tag), 'node': children}

    # Leaf: register the word if it is new, then refer to it by id.
    tag, word = exp
    vocab.setdefault(word, len(vocab))
    return {'label': int(tag), 'node': vocab[word]}

In [5]:
def read_corpus(path, vocab, max_size):
    """Read a treebank file (one S-expression per line) into tree dicts.

    Args:
        path: path of the UTF-8 encoded corpus file.
        vocab: dict mapping word -> id; extended in place by ``convert_tree``
            as new words are encountered.
        max_size: stop after this many trees have been read. A falsy value
            (``None`` or ``0``) means "read the whole file".

    Returns:
        List of trees in file order, each in the format produced by
        ``convert_tree``.
    """
    trees = []
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing empty line) carries no tree;
                # passing it to the parser would only trip its assertion.
                continue
            tree = SexpParser(line).parse()
            trees.append(convert_tree(vocab, tree))
            if max_size and len(trees) >= max_size:
                break

    return trees

In [44]:
# Playground: tokenize and parse only the first training tree to see what
# SexpParser produces, without converting the whole corpus.
# train_trees = read_corpus('trees/train.txt', vocab, max_size)
with codecs.open('trees/train.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        print("line", line)

        # Use the real parser instead of a pasted copy of its method body:
        # that copy referred to `self` and used `return` at module level,
        # so the cell could not even be compiled.
        parser = SexpParser(line)
        tokens = parser.tokens
        print("tokens", tokens)

        parsed = parser.parse()
        if isinstance(parsed, list):
            # Parenthesised group: show the parsed children.
            print("children", parsed)
        else:
            # Bare atom: show the token itself.
            print("token", parsed)

        break  # only the first line is inspected

#re.findall(r'\(|\)|[^\(\) ]+', "( line , abc, def)")

line (3 (2 (2 The) (2 Rock)) (4 (3 (2 is) (4 (2 destined) (2 (2 (2 (2 (2 to) (2 (2 be) (2 (2 the) (2 (2 21st) (2 (2 (2 Century) (2 's)) (2 (3 new) (2 (2 ``) (2 Conan)))))))) (2 '')) (2 and)) (3 (2 that) (3 (2 he) (3 (2 's) (3 (2 going) (3 (2 to) (4 (3 (2 make) (3 (3 (2 a) (3 splash)) (2 (2 even) (3 greater)))) (2 (2 than) (2 (2 (2 (2 (1 (2 Arnold) (2 Schwarzenegger)) (2 ,)) (2 (2 Jean-Claud) (2 (2 Van) (2 Damme)))) (2 or)) (2 (2 Steven) (2 Segal))))))))))))) (2 .)))
tokens ['(', '3', '(', '2', '(', '2', 'The', ')', '(', '2', 'Rock', ')', ')', '(', '4', '(', '3', '(', '2', 'is', ')', '(', '4', '(', '2', 'destined', ')', '(', '2', '(', '2', '(', '2', '(', '2', '(', '2', 'to', ')', '(', '2', '(', '2', 'be', ')', '(', '2', '(', '2', 'the', ')', '(', '2', '(', '2', '21st', ')', '(', '2', '(', '2', '(', '2', 'Century', ')', '(', '2', "'s", ')', ')', '(', '2', '(', '3', 'new', ')', '(', '2', '(', '2', '``', ')', '(', '2', 'Conan', ')', ')', ')', ')', ')', ')', ')', ')', '(', '2', "''", ')', '

['(', ' ', ' ', ' ', ' ', ')']

In [45]:
class RecursiveNet(chainer.Chain):
    """Recursive network over binary trees.

    Links:
        embed: ``EmbedID(n_vocab, n_units)`` used for leaf word ids.
        l: ``Linear(2 * n_units, n_units)`` applied to two concatenated
            child vectors.
        w: ``Linear(n_units, n_label)`` producing label scores; ``n_label``
            is read from the module-level setting when the net is built.
    """

    def __init__(self, n_vocab, n_units):
        links = dict(
            embed=L.EmbedID(n_vocab, n_units),
            l=L.Linear(n_units * 2, n_units),
            w=L.Linear(n_units, n_label),
        )
        super(RecursiveNet, self).__init__(**links)

    def leaf(self, x):
        """Return the embedding of the word ids in ``x``."""
        return self.embed(x)

    def node(self, left, right):
        """Combine two child vectors into the parent vector (tanh of a linear map)."""
        joined = F.concat((left, right))
        projected = self.l(joined)
        return F.tanh(projected)

    def label(self, v):
        """Return the label scores for node vector ``v``."""
        return self.w(v)

In [46]:
def traverse(model, node, train=True, evaluate=None, root=True):
    """Run the model bottom-up over one tree.

    Args:
        model: object providing ``leaf``, ``node`` and ``label``.
        node: tree dict with keys ``'label'`` and ``'node'``; ``'node'`` is an
            int word id for a leaf, otherwise a ``(left, right)`` pair of
            tree dicts.
        train: when true, the softmax cross-entropy of this node's prediction
            is added to the loss; also controls ``volatile`` on the variables
            created here (``volatile=not train``).
        evaluate: optional mapping of counters. When given, the keys
            ``'correct_node'``/``'total_node'`` are updated for every node and
            ``'correct_root'``/``'total_root'`` for the node where ``root`` is
            true.
        root: true only for the top-level call; recursive calls pass False.

    Returns:
        ``(loss, v)``: the loss summed over this subtree (plain ``0`` if
        nothing was added) and the vector computed for this node.
    """
    content = node['node']
    if isinstance(content, int):
        # Leaf: embed the single word id.
        word_ids = xp.array([content], np.int32)
        loss = 0
        v = model.leaf(chainer.Variable(word_ids, volatile=not train))
    else:
        # Internal node: evaluate the left subtree, then the right one, and
        # merge their vectors.
        left_child, right_child = content
        child_options = dict(train=train, evaluate=evaluate, root=False)
        left_loss, left_vec = traverse(model, left_child, **child_options)
        right_loss, right_vec = traverse(model, right_child, **child_options)
        v = model.node(left_vec, right_vec)
        loss = left_loss + right_loss

    y = model.label(v)

    if train:
        gold = xp.array([node['label']], np.int32)
        loss += F.softmax_cross_entropy(
            y, chainer.Variable(gold, volatile=not train))

    if evaluate is not None:
        predicted = cuda.to_cpu(y.data.argmax(1))[0]
        # "correct" is bumped before "total" so the counters are created in
        # the same order as before.
        if predicted == node['label']:
            evaluate['correct_node'] += 1
        evaluate['total_node'] += 1

        if root:
            if predicted == node['label']:
                evaluate['correct_root'] += 1
            evaluate['total_root'] += 1

    return loss, v

In [47]:
def evaluate(model, test_trees):
    """Print node-level and root-level label accuracy of ``model``.

    Every tree is run through ``traverse`` with ``train=False`` on a copy of
    the model; ``traverse`` fills in the correct/total counters.

    Args:
        model: network to evaluate; it is copied with ``model.copy()`` first.
        test_trees: iterable of tree dicts as accepted by ``traverse``.

    Returns:
        None. Two lines (node accuracy, root accuracy) are written with
        ``print``.
    """
    m = model.copy()
    m.volatile = True
    result = collections.defaultdict(int)
    for tree in test_trees:
        traverse(m, tree, train=False, evaluate=result)

    for title, key in (('Node', 'node'), ('Root', 'root')):
        correct = result['correct_' + key]
        total = result['total_' + key]
        # With nothing evaluated (e.g. an empty tree list) report 0 % rather
        # than failing with ZeroDivisionError.
        accuracy = 100.0 * correct / total if total else 0.0
        # str.format gives '%' no special meaning, so it must not be doubled;
        # the old '%%' was printed literally.
        print(' {0} accuracy: {1:.2f} % ({2:,d}/{3:,d})'.format(
            title, accuracy, correct, total))

In [48]:
# One vocabulary is shared by all three splits. read_corpus() extends it in
# place, so word ids are assigned in the order train -> test -> dev and words
# that occur only in the test or dev files also receive an id.
vocab = {}
max_size = None  # falsy value: read_corpus() applies no limit on the number of trees
train_trees = read_corpus('trees/train.txt', vocab, max_size)
test_trees = read_corpus('trees/test.txt', vocab, max_size)
develop_trees = read_corpus('trees/dev.txt', vocab, max_size)

# The network is sized to the complete vocabulary collected above.
model = RecursiveNet(len(vocab), n_units)

In [49]:
# Setup optimizer: AdaGrad with learning rate 0.1 attached to `model`,
# plus a weight-decay hook with coefficient 0.0001.
optimizer = optimizers.AdaGrad(lr=0.1)
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))

In [50]:
# State consumed by the training loop cell.
accum_loss = 0          # sum of the per-tree losses in the current minibatch
count = 0               # number of trees accumulated in the current minibatch
start_at = time.time()  # NOTE(review): not read by any cell shown here
cur_at = start_at       # overwritten at the start of every epoch by the training loop

In [51]:
# Playground: display the first converted training tree as a nested dict
# ('label' is the node label; 'node' is a word id for a leaf, or a
# (left, right) tuple of subtrees for an internal node).
# isinstance(train_trees[0]['node'], int)

train_trees[0]


{'label': 3,
 'node': ({'label': 2,
   'node': ({'label': 2, 'node': 0}, {'label': 2, 'node': 1})},
  {'label': 4,
   'node': ({'label': 3,
     'node': ({'label': 2, 'node': 2},
      {'label': 4,
       'node': ({'label': 2, 'node': 3},
        {'label': 2,
         'node': ({'label': 2,
           'node': ({'label': 2,
             'node': ({'label': 2,
               'node': ({'label': 2, 'node': 4},
                {'label': 2,
                 'node': ({'label': 2, 'node': 5},
                  {'label': 2,
                   'node': ({'label': 2, 'node': 6},
                    {'label': 2,
                     'node': ({'label': 2, 'node': 7},
                      {'label': 2,
                       'node': ({'label': 2,
                         'node': ({'label': 2, 'node': 8},
                          {'label': 2, 'node': 9})},
                        {'label': 2,
                         'node': ({'label': 3, 'node': 10},
                          {'label': 2,
            

In [26]:
# Training loop.
#
# Per-tree losses are summed into `accum_loss`; once `batchsize` trees have
# been accumulated, one backward pass and one optimizer update are performed.
# Trees left over when an epoch ends stay in the accumulator and become part
# of the first update of the next epoch; a remainder after the final epoch is
# never applied.
#
# The accumulators are reset here, inside the cell, so that re-running it
# (or running it after an interrupted run) never starts from a stale partial
# minibatch left behind in the kernel.
accum_loss = 0
count = 0

for epoch in range(n_epoch):
    print('Epoch: {0:d}'.format(epoch))
    total_loss = 0
    cur_at = time.time()
    random.shuffle(train_trees)  # shuffles the list in place
    for tree in train_trees:
        # Only the loss is needed for training; the root vector is unused.
        loss, _root_vec = traverse(model, tree, train=True)
        accum_loss += loss
        count += 1

        if count >= batchsize:
            model.zerograds()
            accum_loss.backward()
            optimizer.update()
            total_loss += float(accum_loss.data)

            accum_loss = 0
            count = 0

    print('loss: {:.2f}'.format(total_loss))

    now = time.time()
    throughput = float(len(train_trees)) / (now - cur_at)
    print('{:.2f} iters/sec, {:.2f} sec'.format(throughput, now - cur_at))
    print()

    # Periodic evaluation on the training and development sets.
    if (epoch + 1) % epoch_per_eval == 0:
        print('Train data evaluation:')
        evaluate(model, train_trees)
        print('Develop data evaluation:')
        evaluate(model, develop_trees)
        print('')

print('Test evaluateion')
evaluate(model, test_trees)

Epoch: 0
loss: 223760.15
5.46 iters/sec, 1563.50 sec

Epoch: 1
loss: 159683.95
7.01 iters/sec, 1219.45 sec

Epoch: 2
loss: 136041.59
6.60 iters/sec, 1293.73 sec

Epoch: 3
loss: 122566.20
8.40 iters/sec, 1017.21 sec

Epoch: 4
loss: 113496.38
7.75 iters/sec, 1102.12 sec

Train data evaluation:
 Node accuracy: 86.60 %% (275,885/318,582)
 Root accuracy: 53.97 %% (4,611/8,544)
Develop data evaluation:
 Node accuracy: 80.32 %% (33,291/41,447)
 Root accuracy: 42.96 %% (473/1,101)

Test evaluateion
 Node accuracy: 80.04 %% (66,109/82,600)
 Root accuracy: 41.36 %% (914/2,210)


In [54]:
from chainer import serializers

In [32]:
# serializers.save_npz("sentiment.model", model)

In [55]:
# load the model
# Build a fresh network with the same sizes as `model`, then fill its
# parameters from the file 'sentiment.model'.
# NOTE(review): the save_npz call that would write that file is commented out
# in this notebook, so the file has to exist already -- confirm where it
# comes from.
load_model = RecursiveNet(len(vocab), n_units)
# NOTE(review): this rebinds the global `optimizer` to `load_model`. If the
# training cell is run again afterwards, optimizer.update() would no longer
# act on `model`. Consider a separate name, or dropping the optimizer if the
# loaded model is only used for inference.
optimizer = optimizers.AdaGrad(lr=0.1)
optimizer.setup(load_model)
optimizer.add_hook(chainer.optimizer.WeightDecay(0.0001))
serializers.load_npz('sentiment.model',  load_model)

In [56]:
# Playground: replay by hand the first part of traverse() for one training
# tree, using the reloaded model with train=False. Only the two subtrees go
# through traverse() (with root=False); the root's own label prediction is
# not run here, so `result` receives node counters for the subtrees only.
m = load_model.copy()
m.volatile = True
# NOTE(review): `m` is not used below -- every call goes through `load_model`.
result = collections.defaultdict(lambda: 0)
train = False  # also read by the later embedding-lookup cell
result = collections.defaultdict(lambda: 0)  # repeats the assignment two lines above

# traverse(m, train_trees[1], train=False, evaluate=result)
node = train_trees[2]
if isinstance(node['node'], int):
    # leaf node
    word = xp.array([node['node']], np.int32)
    loss = 0
    x = chainer.Variable(word, volatile=not train)
    v = load_model.leaf(x)
else:
    # internal node
    left_node, right_node = node['node']
    left_loss, left = traverse(
        load_model, left_node, train=train, evaluate=result, root=False)
    right_loss, right = traverse(
        load_model, right_node, train=train, evaluate=result, root=False)
    v = load_model.node(left, right)
    loss = left_loss + right_loss

In [57]:
# Show the correct/total node counters collected by the manual traversal cell.
result

defaultdict(<function __main__.<lambda>>,
            {'correct_node': 68, 'total_node': 76})

In [81]:
#train_trees[2]

# Playground: look up the embedding of one word with the reloaded model.
# The word id is taken from the first test tree by following the left branch
# twice from the root (assumes the node reached that way is a leaf, i.e. its
# 'node' entry is an int word id).
word = xp.array([test_trees[0]['node'][0]['node'][0]['node']], np.int32)
loss = 0
# `train` is the global set in the manual traversal cell, so this cell only
# works after that one has been run.
x = chainer.Variable(word, volatile=not train)
v = load_model.leaf(x)
print(word)
print(x.data)
v.data

[6064]
[6064]


array([[ 0.32051429,  0.36132571,  0.04742519,  0.2816264 , -0.29773203,
        -0.01792988,  0.28714287,  0.14204991, -0.27863452,  0.05220072,
         0.27659449,  0.29560179,  0.1171884 ,  0.3331736 , -0.45720482,
         0.31917858,  0.28488305,  0.30823752, -0.29386976,  0.21543735,
        -0.31053305, -0.31920785,  0.3124949 , -0.18925038,  0.34076777,
         0.29246122,  0.32320318,  0.31434807,  0.18184373,  0.34562123]], dtype=float32)

6064