# Plumbing
1. Download and unpack `sentence polarity dataset v1.0` from http://www.cs.cornell.edu/people/pabo/movie-review-data/
2. Download BNC (TODO)
3. Download the EasyCCG parser from http://homepages.inf.ed.ac.uk/s1049478/easyccg.html, unpack the package (you should get a catalog like `easyccg-0.2`). From the same page, download the regular pretrained model (`model.tar.gz`). Unpack the model to the parser's catalog.

# Getting the British National Corpus & the word list

We will parse BNC XML files with lxml. NLTK technically has a dedicated parser for BNC, which is extremely slow in the lazy mode, and in the non-lazy mode it is very slow and also consumes >8GB of memory.

In [1]:
bnc_path = 'BNC/Texts/'
from os.path import exists

def bnc_files_iter():
    top_level = ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'K']
    symbols = top_level + ['L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'W', 'V', 'X', 'Y', 'Z',
                           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for top in top_level:
        top_path = bnc_path + '/' + top
        if not exists(top_path):
            continue
        for symbol2 in symbols:
            path2 = top_path + '/' + top + symbol2
            if not exists(path2):
                continue
            for symbol3 in symbols:
                current_path = path2 + '/' + top + symbol2 + symbol3 + '.xml'
                if not exists(current_path):
                    continue
                yield open(current_path)

In [2]:
from lxml import etree

In [3]:
unique_words = set()

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            unique_words.add(element.text.strip())
    bnc_file.close()
    
unique_words = list(unique_words)
print(unique_words[:10])

['', 'yuh', 'MINES', 'Rosaldo', 'UPJOHN', 'Ming/Qing', 'binned', '14-year-old', 'froings', 'not-so-prodigal']


In [4]:
unique_count = len(unique_words)
print(unique_count)

705241


# Getting CCG parse trees for BNC

In [5]:
# we will run the underlying parser with pexpect, and intercept its outputs from within Python
import pexpect
parser = pexpect.spawn('java -jar easyccg-0.2/easyccg.jar --model easyccg-0.2/model')
parser.expect('Model loaded, ready to parse.')
parser.send('The cat chases a ball of yarn.\n')
parser.expect('ID')
parser.expect('\n\(.*\n')
parser_output = parser.after.decode().strip() # encode from bytes into str, strip whitespace
print(parser_output)
parser.terminate()

(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS The NP[nb]/N>) (<L N POS POS cat N>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP POS POS chases (S[dcl]\NP)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<L N POS POS ball N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP POS POS of (NP\NP)/NP>) (<T NP 0 1> (<L N POS POS yarn. N>) ) ) ) ) )


False

Let's see how NLTK can handle parse trees.

In [6]:
import re
only_word = re.compile(r'<L\s\S+\sPOS\sPOS\s(\S+)\s\S+>')
concat_label = re.compile(r'<(\S+)\s(\S+)\s(\S+)\s(\S+)>')

# some string cleanup
def clean_parser_output(parse_output):
    return concat_label.sub(lambda match: '<'+match.group(1)+'_'+match.group(2).replace('(', '[').replace(')', ']')
                            +'_'+match.group(3)+'_'+match.group(4)+'>',
                            only_word.sub(lambda match: match.group(1), parse_output))

from nltk.tree import ParentedTree
tree = ParentedTree.fromstring(clean_parser_output(parser_output))
print(tree)

(<T_S[dcl]_1_2>
  (<T_NP[nb]_0_2> (The ) (cat ))
  (<T_S[dcl]\NP_0_2>
    (chases )
    (<T_NP[nb]_0_2>
      (<T_NP[nb]_0_2> (a ) (ball ))
      (<T_NP\NP_0_2> (of ) (<T_NP_0_1> (yarn. ))))))


In each `(parenthesized expression)`, the first item `(head)` is the category of node, and two next items are its child nodes.

## Learning word embeddings

Our embedding procedure will be based on this Tensorflow [word2vec tutorial](https://www.tensorflow.org/tutorials/word2vec).

In [7]:
# Consistently map each unique word to a integer.
word_map = { word: index for index, word in enumerate(unique_words) }

In [8]:
# Collect all sentences from the corpus, with words as their indices in the word map.
corpus_sents = []

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if element.tag == 's':
            corpus_sents.append([])
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            corpus_sents[-1].append(word_map[element.text.strip()])
    bnc_file.close()

In [15]:
#import gensim

In [11]:
#class Corpus_iter:
#    def __iter__(self):
#        for sent in corpus_sents:
#            yield [unique_words[wi] for wi in sent]

In [None]:
#corp_iter = Corpus_iter()
#w2v_model = gensim.models.Word2Vec(corp_iter)

Generate batches of pairs (context word, target word). For simplicity, we hardcode the window size (2) and number of examples in window.

In [9]:
import numpy as np

In [10]:
# TODO REMOVE

from random import randint
from math import floor

vocabulary_size = len(unique_words) + 1 # add the boundary token
embedding_size = 70
batch_size = 128
# Number of sample correct word pairs to be shown to word2vec for one random target word.
num_samples = 16
assert num_samples % 2 == 0
assert batch_size % num_samples == 0
# We need a special token for cases when the target word is near the start or end of sentence.
bound_token_id = vocabulary_size - 1

corp_runs = 10
sent_step = 1

def skipgram_batches():
    for run_n in range(corp_runs):
        sent_n = 0
        word_n = 0
        
        target_n = 0 # relative to the current batch
        
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        
        while sent_n < len(corpus_sents):
            for j in range(num_samples):
                labels[target_n*num_samples+j][0] = corpus_sents[sent_n][word_n]
            for j in range(num_samples // 2):
                batch[target_n*num_samples+j*2] = (corpus_sents[sent_n][word_n-j] if word_n-j >= 0
                                                   else bound_token_id)
                batch[target_n*num_samples+j*2+1] = (corpus_sents[sent_n][word_n+j]
                                                     if word_n+j < len(corpus_sents[sent_n])
                                                     else bound_token_id)
                
            target_n += 1
            if target_n == (batch_size // num_samples):
                yield batch, labels, False
                batch = np.ndarray(shape=(batch_size), dtype=np.int32)
                labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
                target_n = 0
                
            word_n += 1
            try:
                while word_n == len(corpus_sents[sent_n]):
                    word_n = 0
                    sent_n += sent_step
                    if (floor(sent_n / len(corpus_sents) * 10)
                        > floor((sent_n-sent_step) / len(corpus_sents) * 10)):
                        print('{}0%'.format(floor(sent_n / len(corpus_sents) * 10)), end=' ')
            except IndexError: # happens on the end of the corpus
                break
                
        batch[target_n:] = 0.0
        labels[target_n:, :] = 0.0
        yield batch, labels, (run_n == corp_runs - 1)

In [11]:
# TODO REMOVE
import tensorflow as tf
import math

In [12]:
with tf.Session() as sess:
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [33]:
# TODO REMOVE
tf.reset_default_graph()
with tf.device('/cpu:0'):
    # Model parameters: word embeddings and model weights & biases for each word.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                                  stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [34]:
# TODO REMOVE
##with tf.device('/cpu:0'):

with tf.device('/cpu:0'):
    # The computation graph.
    inputs = tf.placeholder(tf.int32, shape=[batch_size])
    labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    embedding_layer = tf.nn.embedding_lookup(embeddings, inputs)
    # Note that word2vec has no "real" hidden layers apart from the embedding.
    
    # Number of random words to sample apart from the true target; the model should learn to
    # assign low probability to them given the context.
    negative_samples_n = batch_size * 2
    
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=labels,
                                         inputs=embedding_layer,
                                         num_sampled=negative_samples_n,
                                         num_classes=vocabulary_size))
    optimizer = tf.train.AdagradOptimizer(0.05).minimize(loss)

In [35]:
# TODO REMOVE
import datetime

trained_embeddings = [] # we want to use them later
with tf.Session() as sess:
    print('Training start:', datetime.datetime.now())
    tf.global_variables_initializer().run()
    i = 0
    for batch_inputs, batch_labels, is_last in skipgram_batches():
        if is_last:
            _, loss_val, trained_embeddings = sess.run([optimizer, loss, embeddings], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
            print('Final loss:', loss_val)
            print('Training end:', datetime.datetime.now())
        else:
            _, loss_val = sess.run([optimizer, loss], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
            if (i % 250000 == 0):
                print('(loss: {})'.format(loss_val), end=' ')
        i += 1

Training start: 2017-11-27 00:55:43.982109


(loss: 1339.1641845703125) (loss: 90.64764404296875) (loss: 0.3690961003303528) (loss: 1.606388807296753) (loss: 33.17181396484375) (loss: 4.64590311050415) 10% (loss: 36.2855224609375) (loss: 1.1286957263946533) (loss: 32.92365646362305) (loss: 71.0347900390625) (loss: 0.2758979797363281) 20% (loss: 4.5707807540893555) (loss: 4.296047687530518) (loss: 0.7888756990432739) (loss: 3.196495532989502) (loss: 46.637062072753906) (loss: 44.94758605957031) 30% (loss: 0.2709590494632721) (loss: 1.7694886922836304) (loss: 15.963029861450195) (loss: 6.522884368896484) (loss: 1.9111801385879517) 40% (loss: 8.41146183013916) (loss: 2.555312156677246) (loss: 0.2825905382633209) (loss: 0.7162767648696899) (loss: 0.1939745843410492) (loss: 7.535548210144043) 50% (loss: 0.24661734700202942) (loss: 3.205333948135376) (loss: 2.1553921699523926) (loss: 0.8445965051651001) 60% (loss: 257.6120910644531) (loss: 0.7380498647689819) (loss: 2.093625545501709) (loss: 13.904006958007812) (loss: 1.527993440628051

(loss: 0.8147010803222656) (loss: 2.940011978149414) (loss: 2.010826349258423) 10% (loss: 4.737893581390381) (loss: 2.0707268714904785) (loss: 0.30549025535583496) (loss: 1.4315439462661743) (loss: 2.1357955932617188) (loss: 3.7092084884643555) 20% (loss: 1.5353803634643555) (loss: 0.7573847770690918) (loss: 0.7351540923118591) (loss: 0.18051570653915405) (loss: 0.8986968994140625) 30% (loss: 0.9448647499084473) (loss: 0.6661956906318665) (loss: 1.36269211769104) (loss: 0.8184607625007629) (loss: 6.10341215133667) (loss: 2.1489791870117188) 40% (loss: 3.8815298080444336) (loss: 1.1165080070495605) (loss: 1.6206552982330322) (loss: 0.706818699836731) (loss: 1.1072916984558105) 50% (loss: 6.321184158325195) (loss: 0.9157577753067017) (loss: 0.3018455505371094) (loss: 0.6180638074874878) (loss: 1.3209691047668457) 60% (loss: 1.3155345916748047) (loss: 3.553065776824951) (loss: 0.7651028633117676) (loss: 1.2417901754379272) (loss: 0.5090122222900391) 70% (loss: 1.9688977003097534) (loss: 1

In [57]:
def nearest_neighbor(word):
    dists = np.abs(trained_embeddings - trained_embeddings[word_map[word], ]).sum(axis=1)
    dists[word_map[word]] = 1e6
    return unique_words[dists.argmin(axis=0)]

In [55]:
def nearest_cos_neighbor(word):
    dists = (np.dot(trained_embeddings, trained_embeddings[word_map[word],])
             / np.linalg.norm(trained_embeddings) * np.linalg.norm(trained_embeddings[word_map[word],]))
    dists[word_map[word]] = 1e6
    return unique_words[dists.argmin(axis=0)]

In [58]:
print('Nearest word vectors for:')
print('cat:', nearest_neighbor('cat'))
print('doctor:', nearest_neighbor('doctor'))
print('cold:', nearest_neighbor('cold'))
print('blue:', nearest_neighbor('blue'))
print('red:', nearest_neighbor('red'))
print('walk:', nearest_neighbor('walk'))
print('bring:', nearest_neighbor('bring'))
print('is:', nearest_neighbor('is'))
print('Europe:', nearest_neighbor('europe'))

Nearest word vectors for:
cat: Topstock
doctor: five-thirty
cold: firemen
blue: Raisonnable
red: Fairies
walk: books
bring: case-study
is: this
Europe: Ugborough


In [59]:
print('Nearest word vectors for:')
print('cat:', nearest_cos_neighbor('cat'))
print('doctor:', nearest_cos_neighbor('doctor'))
print('cold:', nearest_cos_neighbor('cold'))
print('blue:', nearest_cos_neighbor('blue'))
print('red:', nearest_cos_neighbor('red'))
print('walk:', nearest_cos_neighbor('walk'))
print('bring:', nearest_cos_neighbor('bring'))
print('is:', nearest_cos_neighbor('is'))
print('Europe:', nearest_cos_neighbor('europe'))

Nearest word vectors for:
cat: facetiae
doctor: SPATULAS/PALETTE
cold: Pre-Windscale
blue: Villers-aux-Noeuds
red: Lächerlich
walk: LIND
bring: Bestowed
is: FAMILY-ORIENTED
Europe: Somerville-1941


## Learning the transformation matrix

In [60]:
import numpy as np
import torch
from torch.autograd import Variable

In [61]:
enc_W = Variable(torch.randn(embedding_size*2, embedding_size), requires_grad=True)
enc_b = Variable(torch.zeros(1, embedding_size), requires_grad=True)
dec_W = Variable(torch.randn(embedding_size, embedding_size*2), requires_grad=True)
dec_b = Variable(torch.zeros(1, embedding_size*2), requires_grad=True)

In [62]:
def encode_node(child1, child2):
    #"""Both child1 and child2 are numpy arrays of shape (1, embedding_size). Return the encoding
    #(1, embedding_size)."""
    conc_embeds = Variable(torch.cat((child1.data, child2.data), 0))
    # we use.view() because we need to make sure that the return value is a vector (as word embeddings),
    # not a matrix
    return conc_embeds.matmul(enc_W).add(enc_b).tanh().view(embedding_size)

def decode_node(node):
    # node is (1, embedding_size), output is (1, 2*embedding_size)
    return node.matmul(dec_W).add(dec_b).tanh().view(embedding_size*2)

In [63]:
from functools import reduce
from random import choice, randint
encoding_train_batch_size = 50 # number of sentences

# Handle special treatment of parens by our parser.
def nd_lbl(node):
    if node.label() == '-LRB-':
        return '('
    elif node.label() == '-RRB-':
        return ')'
    else:
        return node.label()

# Note that node_encodings are passed by value, so we always modify the dictionary given to
# the topmost function call.
def encode_tree(node, node_encodings):
    "Encode_tree returns a pair of lists of partial derivatives for encoding matrix and bias"
    subtrees = [subtr for subtr in node]
    if len(subtrees) == 0: # a leaf
        if nd_lbl(node) in word_map:
            node_encodings[nd_lbl(node)] = Variable(
                torch.from_numpy(trained_embeddings[word_map[nd_lbl(node)], ]))
        else: # replace unknowns with a random word
            node_encodings[nd_lbl(node)] = Variable(
                torch.from_numpy(trained_embeddings[randint(0, trained_embeddings.shape[0]), ]))
    elif len(subtrees) == 1:
        encode_tree(subtrees[0], node_encodings)
        node_encodings[nd_lbl(node)] = node_encodings[nd_lbl(subtrees[0])]
    else:
        if len(subtrees) != 2: # dbg
            print(subtrees)
        encode_tree(subtrees[0], node_encodings)
        encode_tree(subtrees[1], node_encodings)
        node_encodings[nd_lbl(node)] = encode_node(
            node_encodings[nd_lbl(subtrees[0])],
            node_encodings[nd_lbl(subtrees[1])])

def make_parser():
    parser = pexpect.spawn('java -jar easyccg-0.2/easyccg.jar --model easyccg-0.2/model')
    parser.expect('Model loaded, ready to parse.')
    return parser

def kill_parser(parser):
    parser.terminate()
    
def sentence_tree(sentence_form, parser):
    parser.send(sentence_form+'\n')
    # (this secures us from finding one of the patterns below in the sentence itself:)
    response = parser.expect([pexpect.TIMEOUT, 'ID'])
    if response == 1: # can't happen if timed out
        response = parser.expect(['Skipping sentence of length', '\n\(.*\n', pexpect.TIMEOUT])
    if response in [0, 2]:
        return False
    parser_output = parser.after.decode().strip() # encode from bytes into str, strip whitespace
    return ParentedTree.fromstring(clean_parser_output(parser_output))

In [64]:
iters_n = 25
encoding_train_batch_size = 50
learning_rate = 0.1

for iter_i in range(iters_n):
    used_sents = [] # at least don't repeat them in one batch
    batch_accum_error = 0
    parser = make_parser()
    nodes_n = 0 # count them to average the error
    
    for i in range(encoding_train_batch_size):
        tree = False
        # It's possible that sentence_tree() returns False, if the sentence was too long and
        # rejected by the parser, or it timeouts.
        while not tree:
            sentence_n = randint(0, len(corpus_sents))
            while sentence_n in used_sents:
                sentence_n = randint(0, len(corpus_sents))
            sentence = corpus_sents[sentence_n]
            used_sents.append(sentence_n)
            
            sentence_form = ' '.join([unique_words[word_id] for word_id in sentence])
            #print(sentence_n, sentence_form)
            tree = sentence_tree(sentence_form, parser)

        # Encode the tree.
        node_encodings = dict()
        encode_tree(tree, node_encodings)
        
        # Decode the tree back again.
        # this dictionary in fact maps nodes to their *partial* decodings from which their children are to be
        # recreated; thus for the root it's just its encoding, from which we will retrieve immediate children
        node_decodings = dict()
        node_decodings[nd_lbl(tree.root())] = node_encodings[nd_lbl(tree.root())]
        nodes_to_visit = [ tree.root() ]
        while nodes_to_visit:
            current_node = nodes_to_visit.pop()
            children = [child for child in current_node]
            if len(children) == 0:
                continue
            elif len(children) == 2: # not a leaf
                decoded_node = decode_node(node_decodings[nd_lbl(current_node)])
                node_decodings[nd_lbl(children[0])] = decoded_node[:embedding_size]
                node_decodings[nd_lbl(children[1])] = decoded_node[embedding_size:]
                
                #print(node_encodings[nd_lbl(current_node)])
                #print(node_decodings[nd_lbl(current_node)])
                err = node_encodings[nd_lbl(current_node)].sub(node_decodings[nd_lbl(current_node)]).abs().sum()
                err.backward() # accumulate gradient
                batch_accum_error += err.data
                nodes_n += 1
            else:
                raise RuntimeError('unexpected number of node children in decode:' + str(children))
        
    kill_parser(parser)
    print('Batch', iter_i+1, 'error: ', (batch_accum_error / nodes_n)[0])
    if batch_accum_error[0] == 0:
        for sentence_n in used_sents:
            print(' '.join([unique_words[word_id] for word_id in corpus_sents[sentence_n]]))
        raise RuntimeError
    enc_W.data -= enc_W.grad.data * learning_rate
    enc_b.data -= enc_b.grad.data * learning_rate
    dec_W.data -= dec_W.grad.data * learning_rate
    dec_b.data -= dec_b.grad.data * learning_rate
    #print('ENC_W', enc_W.grad, 'ENC_B', enc_b.grad, 'DEC_W', dec_W.grad, 'DEC_B', dec_b.grad)
    
    enc_W.grad.data.zero_()
    enc_b.grad.data.zero_()
    dec_W.grad.data.zero_()
    dec_b.grad.data.zero_()

Batch 1 error:  14.511988639831543
Batch 2 error:  15.387177467346191
Batch 3 error:  15.15761947631836
Batch 4 error:  10.777464866638184
Batch 5 error:  10.480030059814453
Batch 6 error:  12.86169719696045
Batch 7 error:  5.273446083068848
Batch 8 error:  6.528011322021484
Batch 9 error:  4.766279220581055
Batch 10 error:  3.2289998531341553
Batch 11 error:  3.494234085083008
Batch 12 error:  2.4390478134155273
Batch 13 error:  2.455670118331909
Batch 14 error:  3.6540942192077637
Batch 15 error:  1.608689785003662
Batch 16 error:  1.2823563814163208
Batch 17 error:  0.6695910692214966
Batch 18 error:  0.6050992012023926
Batch 19 error:  1.7388299703598022
Batch 20 error:  2.086718797683716
Batch 21 error:  1.6035962104797363
Batch 22 error:  0.6890740394592285
Batch 23 error:  1.0876528024673462
Batch 24 error:  0.924579918384552
Batch 25 error:  0.6923951506614685


# Training a sentiment analysis model

In [65]:
import nltk
nltk.download('sentence_polarity')

[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /home/szymon/nltk_data...
[nltk_data]   Package sentence_polarity is already up-to-date!


True

In [66]:
from nltk.corpus import sentence_polarity
print(sentence_polarity.sents(categories='pos')[:3])

[['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', "century's", 'new', '"', 'conan', '"', 'and', 'that', "he's", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'], ['the', 'gorgeously', 'elaborate', 'continuation', 'of', '"', 'the', 'lord', 'of', 'the', 'rings', '"', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co-writer/director', 'peter', "jackson's", 'expanded', 'vision', 'of', 'j', '.', 'r', '.', 'r', '.', "tolkien's", 'middle-earth', '.'], ['effective', 'but', 'too-tepid', 'biopic']]


Now we will split the sentence polarity corpus into test and training slices in proportion 10/90, just as in the paper.

In [67]:
from random import sample
assert len(sentence_polarity.sents(categories='pos')) == len(sentence_polarity.sents(categories='neg'))

In [68]:
train_pnt, test_pnt = 0, 0
test_corp_len = (len(sentence_polarity.sents(categories='pos')) // 10
                 + len(sentence_polarity.sents(categories='neg')) // 10)
sent_pol_len = len(sentence_polarity.sents())

train_sent_vecs = np.zeros((sent_pol_len - test_corp_len, embedding_size))
test_sent_vecs = np.zeros((test_corp_len, embedding_size))
train_sent_labels = np.zeros((sent_pol_len - test_corp_len, 1))
test_sent_labels = np.zeros((test_corp_len, 1))

parser = make_parser()

print('Parsing start:', datetime.datetime.now())
for (label, sents) in [(1.0, sentence_polarity.sents(categories='pos')),
                      (0.0, sentence_polarity.sents(categories='neg'))]:
    sents = list(sents)
    test_ids = sample(range(len(sents)), len(sents) // 10)
    for sent_i in range(len(sents)):
        tree = sentence_tree(' '.join(sents[sent_i]), parser)
        if not tree: # sentence too long, or times out the parser
            continue
        node_encodings = dict()
        encode_tree(tree, node_encodings)
        if sent_i in test_ids:
            #print(node_encodings[nd_lbl(tree.root())])
            test_sent_vecs[test_pnt, :] = node_encodings[nd_lbl(tree.root())].data.numpy()
            test_sent_labels[test_pnt, 0] = label
            test_pnt += 1
        else:
            #print(node_encodings[nd_lbl(tree.root())])
            train_sent_vecs[train_pnt, :] = node_encodings[nd_lbl(tree.root())].data.numpy()
            train_sent_labels[train_pnt, 0] = label
            train_pnt += 1
kill_parser(parser)
print('Parsing end:', datetime.datetime.now())

Parsing start: 2017-11-29 12:44:05.242768


EOF: End Of File (EOF). Exception style platform.
<pexpect.pty_spawn.spawn object at 0x7f0be0060b00>
command: /usr/bin/java
args: ['/usr/bin/java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model']
buffer (last 100 chars): b''
before (last 100 chars): b" blockbuster before midnight , you're going to face frightening late fees . o . k . , not really .\r\n"
after: <class 'pexpect.exceptions.EOF'>
match: None
match_index: None
exitstatus: None
flag_eof: True
pid: 16175
child_fd: 77
closed: False
timeout: 30
delimiter: <class 'pexpect.exceptions.EOF'>
logfile: None
logfile_read: None
logfile_send: None
maxread: 2000
ignorecase: False
searchwindowsize: None
delaybeforesend: 0.05
delayafterclose: 0.1
delayafterterminate: 0.1
searcher: searcher_re:
    0: TIMEOUT
    1: re.compile("b'ID'")

## Logistic regression

The original paper seems to use just a binary classifier of sentence vectors, without any neural net hidden layers. It is the approach we will try first.

In [30]:
regr_weights = tf.Variable(tf.random_normal([embedding_size, 1]))
regr_bias = tf.Variable(tf.random_normal([1, 1]))

X = tf.placeholder("float", [None, embedding_size], name='samples')
Y = tf.placeholder("float", [None, 1], name='labels')

prediction = tf.sigmoid(tf.matmul(X, regr_weights) + regr_bias)
loss = tf.losses.sigmoid_cross_entropy(prediction, Y)
optimizer = tf.train.AdagradOptimizer(learning_rate=0.5).minimize(loss)

acc = tf.metrics.accuracy(tf.round(prediction), tf.round(Y))

INFO:tensorflow:logits.dtype=<dtype: 'float32'>.
INFO:tensorflow:multi_class_labels.dtype=<dtype: 'float32'>.
INFO:tensorflow:losses.dtype=<dtype: 'float32'>.


In [31]:
epochs = 20001
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer()) # needed for accuracy metric
    for epoch_i in range(epochs):
        _, curr_loss = sess.run([optimizer, loss],
                                feed_dict={X: train_sent_vecs,
                                           Y: train_sent_labels})
        if epoch_i % 1000 == 0:
            print(epoch_i, curr_loss)
    
    _, test_acc = sess.run([loss, acc], feed_dict={X: test_sent_vecs,
                                          Y: test_sent_labels})
    print('Test accuracy', test_acc)

0 0.910725
1000 0.506159
2000 0.506093
3000 0.506075
4000 0.506053
5000 0.505926
6000 0.50588
7000 0.505871
8000 0.505868
9000 0.505865
10000 0.505864
11000 0.505863
12000 0.505862
13000 0.505862
14000 0.505861
15000 0.505861
16000 0.505861
17000 0.505861
18000 0.50586
19000 0.50586
20000 0.50586
Test accuracy (0.0, 0.48592871)


### TODO
Try to do an autoencoder without loops: https://groups.google.com/forum/#!topic/theano-users/O5CM49-jMqQ