# Plumbing
1. Download the phrase similarity dataset from http://homepages.inf.ed.ac.uk/mlap/resources/index.html, save as `phrase_similarities.txt`
2. Download the EasyCCG parser from http://homepages.inf.ed.ac.uk/s1049478/easyccg.html, unpack the package (you should get a catalog like `easyccg-0.2`). From the same page, download the regular pretrained model (`model.tar.gz`). Unpack the model to the parser's catalog.

# Getting the British National Corpus & the word list

We will parse BNC XML files with lxml. NLTK technically has a dedicated parser for BNC, which is extremely slow in the lazy mode, and in the non-lazy mode it is very slow and also consumes >8GB of memory.

In [1]:
bnc_path = 'BNC/Texts/'
from os.path import exists

def bnc_files_iter():
    top_level = ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'K']
    symbols = top_level + ['L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'W', 'V', 'X', 'Y', 'Z',
                           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for top in top_level:
        top_path = bnc_path + '/' + top
        if not exists(top_path):
            continue
        for symbol2 in symbols:
            path2 = top_path + '/' + top + symbol2
            if not exists(path2):
                continue
            for symbol3 in symbols:
                current_path = path2 + '/' + top + symbol2 + symbol3 + '.xml'
                if not exists(current_path):
                    continue
                yield open(current_path)

In [2]:
from lxml import etree

In [3]:
unique_words = set()

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            unique_words.add(element.text.strip())
    bnc_file.close()
    
unique_words = list(unique_words)
print(unique_words[:10])

['sportiness', '', 'dubiousness', 'sunnily', 'ink-exercise', 'Monteriggioni', 'gastrc', 'womb-water', 'W/100/26/1015', 'hierophants']


In [4]:
unique_count = len(unique_words)
print(unique_count)

705241


In [20]:
# try stemming just for the embedding?
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()
stemmed_words = [stemmer.stem(word) for word in unique_words]
stemmed_words = list(set(stemmed_words))
print(len(stemmed_words))

497225


# Getting CCG parse trees for BNC

In [5]:
# we will run the underlying parser as a subprocess, and intercept its outputs from within Python
from subprocess import Popen, PIPE, STDOUT
p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
# .encode() gives bytes instead of str, as .communicate() requires. We get a pair (stdout, stderr):
(parse, err) = p.communicate(input='The cat chases a ball of yarn.\n'.encode())
print(parse, '\n', err)
p.terminate()

b'ID=1\n(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS The NP[nb]/N>) (<L N POS POS cat N>) ) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/NP POS POS chases (S[dcl]\\NP)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<L N POS POS ball N>) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS of (NP\\NP)/NP>) (<T NP 0 1> (<L N POS POS yarn. N>) ) ) ) ) ) \n' 
 b'Loading model...\nModel loaded, ready to parse.\n'


Let's see how NLTK can handle parse trees.

In [6]:
import re
only_word = re.compile(r'<L\s\S+\sPOS\sPOS\s(\S+)\s\S+>')
concat_label = re.compile(r'<(\S+)\s(\S+)\s(\S+)\s(\S+)>')

# some string cleanup
def clean_parse_output(parse_output):
    # (remember we have to deal with the parse returned as bytes, not a Unicode string)
    lines = parse_output.decode('utf-8').split('\n')
    parse = ''
    if len(lines) > 1:
        parse = lines[1] # the second line contains the parse itself
    else:
        parse = lines[0]
    parse = concat_label.sub(lambda match: '<'+match.group(1)+'_'+match.group(2).replace('(', '[').replace(')', ']')
                             +'_'+match.group(3)+'_'+match.group(4)+'>',
                             only_word.sub(lambda match: match.group(1), parse))
    return parse

from nltk.tree import ParentedTree
tree = ParentedTree.fromstring(clean_parse_output(parse))
print(tree)

(<T_S[dcl]_1_2>
  (<T_NP[nb]_0_2> (The ) (cat ))
  (<T_S[dcl]\NP_0_2>
    (chases )
    (<T_NP[nb]_0_2>
      (<T_NP[nb]_0_2> (a ) (ball ))
      (<T_NP\NP_0_2> (of ) (<T_NP_0_1> (yarn. ))))))


In each `(parenthesized expression)`, the first item `(head)` is the category of node, and two next items are its child nodes.

In [7]:
def traverse(tree):
    for node in tree:
        return

In [8]:
trees = []
p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if element.tag == 's':
            sentence = ''
            for nested_element in element.iter():
                if (nested_element.tag == 'w' or nested_element.tag == 'c') and nested_element.text:
                    sentence += ' ' + nested_element.text
            parse_output = p.communicate(input=sentence.encode())[0]
            p.terminate()
            trees.append(clean_parse_output(parse_output))
            p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
    bnc_file.close()
    if len(trees) >= 3:
        break
   
print(trees[:3])
p.terminate()

KeyboardInterrupt: 

## Learning word embeddings

Our embedding procedure will be based on this Tensorflow [word2vec tutorial](https://www.tensorflow.org/tutorials/word2vec).

In [9]:
# Consistently map each unique word to a integer.
word_map = {word: index for index, word in enumerate(unique_words) }

In [10]:
# Collect all sentences from the corpus, with words as their indices in the word map.
corpus_sents = []

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if element.tag == 's':
            corpus_sents.append([])
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            corpus_sents[-1].append(word_map[element.text.strip()])
    bnc_file.close()

Generate batches of pairs (context word, target word). For simplicity, we hardcode the window size (2) and number of examples in window.

In [11]:
import numpy as np

In [12]:
from random import randint

num_samples = 4

def skipgrams_batch(batch_size):
    assert batch_size % num_samples == 0
    windows_n = batch_size // num_samples
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    for i in range(windows_n):
        target_sent = randint(0, len(corpus_sents)-1)
        while len(corpus_sents[target_sent]) < 5:
            target_sent = randint(0, len(corpus_sents)-1)
        target = randint(2, len(corpus_sents[target_sent])-3)
        for j in range(num_samples):
            labels[i*num_samples+j][0] = corpus_sents[target_sent][target]
        batch[i*num_samples] = corpus_sents[target_sent][target-2]
        batch[i*num_samples+1] = corpus_sents[target_sent][target-1]
        batch[i*num_samples+2] = corpus_sents[target_sent][target+1]
        batch[i*num_samples+3] = corpus_sents[target_sent][target+2]
        
    return batch, labels

print(skipgrams_batch(12))

(array([304888, 233474, 648799, 416561, 115724,  20754, 615503, 302849,
        24156, 200840, 113164,  55699], dtype=int32), array([[379821],
       [379821],
       [379821],
       [379821],
       [442379],
       [442379],
       [442379],
       [442379],
       [213717],
       [213717],
       [213717],
       [213717]], dtype=int32))


In [13]:
import tensorflow as tf
import math

In [14]:
vocabulary_size = len(unique_words)
embedding_size = 70

# Model parameters.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                              stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [15]:
batch_size = 128

# The computation graph.
inputs = tf.placeholder(tf.int32, shape=[batch_size])
labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
embedding_layer = tf.nn.embedding_lookup(embeddings, inputs)

# Note that word2vec has no "real" hidden layers apart from the embedding.

# Number of random words to sample apart from the true target; the model should learn to
# assign low probability to them given the context.
negative_samples_n = 64

loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=labels,
                                     inputs=embedding_layer,
                                     num_sampled=negative_samples_n,
                                     num_classes=vocabulary_size))
optimizer = tf.train.AdagradOptimizer(0.1).minimize(loss)

In [16]:
steps_n = len(unique_words) * 4
trained_embeddings = [] # we want to use them later
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for i in range(steps_n):
        batch_inputs, batch_labels = skipgrams_batch(batch_size)
        if i+1 == steps_n:
            _, loss_val, trained_embeddings = sess.run([optimizer, loss, embeddings], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
            print('Final loss:', loss_val)
        else:
            _, loss_val = sess.run([optimizer, loss], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
            # TODO meaningful completion info
            if (i % 100000 == 0):
                print(loss_val)
print(trained_embeddings.shape)

396.376
15.836
9.73131
11.2018
5.75977
5.79201
3.78162
3.32327
10.6314
3.88557
5.53605
5.37551
3.87442
2.33724
8.00938
1.67723
5.31973
1.50314
1.95304
1.49748
3.64755
4.37122
0.300876
2.90348
0.685659
2.57276
1.28728
2.5585
1.25169
Final loss: 1.0839
(705241, 70)


In [17]:
trained_embeddings[word_map['honey'], ]

array([ 1.8765161 ,  0.57190591,  1.14589   ,  1.88508892, -1.5483526 ,
        1.51270044,  1.29161346,  0.75599438,  0.84486055,  1.54948008,
        1.92075813, -1.3215512 , -0.58348435,  1.9025389 ,  0.55134082,
       -1.90189433,  1.53919566, -1.0111059 ,  0.41306138,  1.42498946,
        2.08634377,  1.70511055, -1.52415645, -0.3386088 ,  0.3850292 ,
        1.91870797,  2.03005552, -1.43424416, -0.80985343,  1.34230816,
       -1.71646845,  0.41091928,  1.55531728,  1.82881594,  1.46796107,
       -2.05305195, -1.19118345, -0.24090868, -1.97027183, -1.65812576,
       -2.19296861,  1.74225688, -0.57798874, -1.58367789,  1.09224331,
        0.31635639, -2.11186266, -1.70546985, -1.33731949,  1.00421917,
       -1.50437617, -2.06367993, -1.35685956,  0.47820225,  1.70919514,
       -1.72203338, -0.53100717,  1.14944351,  1.59701717,  0.73824239,
       -1.13850331,  1.06597984, -1.55652785, -0.36274669, -1.49337304,
       -1.21382844, -2.13838172, -1.89023709, -0.71068662, -1.13

## Learning the transformation matrix

In [18]:
import numpy as np
from scipy.optimize import fmin_l_bfgs_b as l_bfgs

In [19]:
#encoding_matrix = np.random.randn(embedding_size*2, embedding_size)
#encoding_bias = np.zeros((1, embedding_size))
#decoding_matrix = np.random.randn(embedding_size, embedding_size*2)
#decoding_bias = np.zeros((1, embedding_size*2))
init_enc_theta = np.random.randn(4, embedding_size, embedding_size*2)
enc_theta_shape = (4, embedding_size, embedding_size*2)

# Here the parameters need to passed as function arguments, because they will be optimized
# by the LBFGS implementation.
def encode_node(child1, child2, enc_theta):
    #"""Both child1 and child2 are numpy arrays of shape (1, embedding_size). Return the encoding
    #(1, embedding_size) and partial derivatives for enc_mat and enc_bias."""
    conc_embeds = np.concatenate((child1, child2))
    linear = np.dot(conc_embeds, np.transpose(enc_theta[0, :, :])) + enc_theta[1, :, 0] # enc_mat, enc_bias
    d_tanh = 1 - np.tanh(linear) ** 2
    d_mat = np.dot(np.reshape(conc_embeds, (conc_embeds.shape[0], 1)), np.reshape(d_tanh, (1, d_tanh.shape[0])))
    return np.tanh(linear), d_mat, d_tanh

def decode_node(node, enc_theta):
    # node is (1, embedding_size), output is (1, 2*embedding_size)
    linear = np.dot(node, enc_theta[2, :, :]) + enc_theta[3, 0, :] # dec_mat, dec_bias
    d_tanh = 1 - np.tanh(linear) ** 2
    d_mat = np.dot(np.reshape(node, (node.shape[0], 1)), np.reshape(d_tanh, (1, d_tanh.shape[0])))
    return np.tanh(linear), d_mat, d_tanh

In [24]:
from functools import reduce
encoding_train_batch_size = 5 # number of sentences

batch_d_enc_mat = np.zeros((embedding_size*2, embedding_size))
batch_d_enc_bias = np.zeros((1, embedding_size))
batch_d_dec_mat = np.zeros((embedding_size, embedding_size*2))
batch_d_dec_bias = np.zeros((1, embedding_size*2))
def encoding_train_batch(enc_theta):
    used_sents = [] # at least don't repeat them in one batch
    batch_error = np.zeros((1, embedding_size))
    # Partial derivatives -- to be stacked into a gradient at the end.
    global batch_d_enc_mat, batch_d_enc_bias, batch_d_dec_mat, batch_d_dec_bias
    batch_d_enc_mat = np.zeros((embedding_size*2, embedding_size))
    batch_d_enc_bias = np.zeros((1, embedding_size))
    batch_d_dec_mat = np.zeros((embedding_size, embedding_size*2))
    batch_d_dec_bias = np.zeros((1, embedding_size*2))
    # this is used to scale down the derivatives, ie. averaging
    nodes_n = 0
    enc_theta = np.reshape(enc_theta, enc_theta_shape)
    print(enc_theta.shape)
    
    for i in range(encoding_train_batch_size):
        sentence_n = randint(0, len(corpus_sents))
        while sentence_n in used_sents:
            sentence_n = randint(0, len(corpus_sents))
        sentence = corpus_sents[sentence_n]
        used_sents.append(sentence_n)
        
        p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
        #print([unique_words[word_id] for word_id in sentence])
        parse_output = p.communicate(input=' '.join([unique_words[word_id] for word_id in sentence]).encode())[0]
        p.terminate()
        
        # Collect partial derivatives for this sentence; in fact later they will have to be chained
        # when we will know the error (becaus it uses absolute value).
        #sent_d_enc_mat = {}
        #sent_d_enc_bias = {}
        
        # Handle special treatment of parens by our parser.
        def nd_lbl(node):
            if node.label() == '-LRB-':
                return '('
            elif node.label() == '-RRB-':
                return ')'
            else:
                return node.label()
        
        # Encode the tree.
        node_encodings = dict()
        #print(clean_parse_output(parse_output))
        tree = ParentedTree.fromstring(clean_parse_output(parse_output))
        def encode_tree(node):
            subtrees = [subtr for subtr in node]
            if len(subtrees) == 0: # a leaf
                node_encodings[nd_lbl(node)] = trained_embeddings[word_map[nd_lbl(node)], ]
            elif len(subtrees) == 1:
                encode_tree(subtrees[0])
                node_encodings[nd_lbl(node)] = node_encodings[nd_lbl(subtrees[0])]
            else:
                if len(subtrees) != 2: # dbg
                    print(subtrees)
                encode_tree(subtrees[0])
                encode_tree(subtrees[1])
                node_encodings[nd_lbl(node)], d_enc_mat, d_enc_bias = encode_node(
                    node_encodings[nd_lbl(subtrees[0])],
                    node_encodings[nd_lbl(subtrees[1])],
                    enc_theta)
                global batch_d_enc_mat, batch_d_enc_bias 
                batch_d_enc_mat = batch_d_enc_mat + d_enc_mat
                batch_d_enc_bias = batch_d_enc_bias + d_enc_bias
        encode_tree(tree)
        
        # Decode the tree back again.
        # this dictionary in fact maps nodes to their *partial* decodings from which their children are to be
        # recreated; thus for the root it's just its encoding, from which we will retrieve immediate children
        node_decodings = dict()
        node_decodings[nd_lbl(tree.root())] = node_encodings[nd_lbl(tree.root())]
        encoding_errors = dict()
        nodes_to_visit = [ tree.root() ]
        while nodes_to_visit:
            current_node = nodes_to_visit.pop()
            children = [child for child in current_node]
            if len(children) > 0: # not a leaf
                decoded_node, d_dec_mat, d_dec_bias = decode_node(node_decodings[nd_lbl(current_node)],
                                                                  enc_theta)
                node_decodings[nd_lbl(children[0])] = decoded_node[:embedding_size]
                node_decodings[nd_lbl(children[1])] = decoded_node[embedding_size:]
                # Get the error and partial derivatives (both (1, 2*embedding_size)).
                encoding_errors[nd_lbl(current_node)] = (node_encodings[nd_lbl(current_node)]
                                                         - node_decodings[nd_lbl(current_node)])
                batch_d_dec_mat = batch_d_dec_mat - d_dec_mat # dec is the minuend, so its part dev is -1
                batch_d_dec_bias = batch_d_dec_bias - d_dec_bias
                # np.abs()
                #d_encoding = ((node_encodings[current_node] - node_decodings[current_node])
                #              / encoding_errors[current_node])
                #d_decoding = ((node_decodings[current_node] - node_encodings[current_node])
                #              / encoding_errors[current_node])
                #batch_d_enc_mat = batch_d_enc_mat + np.dot(d_encoding, sent_d_enc_mat[current_node])
                #batch_d_enc_bias = batch_d_enc_bias + np.dot(d_encoding, sent_d_enc_bias[current_node])
                nodes_n += 1
        
        # Compute the error value.
        sent_error = reduce(lambda err_sum, node_err: err_sum + node_err, encoding_errors.values(), np.zeros((1, embedding_size)))
        sent_error = sent_error / len(encoding_errors)
        # Update batch error.
        batch_error = batch_error + sent_error / encoding_train_batch_size

    # TODO regularization    
    batch_gradient = np.zeros((4, embedding_size, embedding_size*2))
    batch_gradient[0, :, :] = np.transpose(batch_d_enc_mat) / nodes_n
    batch_gradient[1, :, 0] = batch_d_enc_bias / nodes_n
    batch_gradient[2, :, :] = batch_d_dec_mat / nodes_n
    batch_gradient[3, 0, :] = batch_d_dec_bias / nodes_n
    #[x / nodes_n for x in [batch_d_enc_mat, batch_d_enc_bias,
    #                                        batch_d_dec_mat, batch_d_dec_bias]]
    return batch_error, np.reshape(batch_gradient, (batch_gradient.size,)) # we need to return a 1d array

In [25]:
l_bfgs(encoding_train_batch, init_enc_theta)

(4, 70, 140)
(4, 70, 140)
(4, 70, 140)
(4, 70, 140)
(4, 70, 140)
(4, 70, 140)
(4, 70, 140)
(4, 70, 140)
(4, 70, 140)


(array([ 0.22748692, -0.95984159,  0.25619557, ...,  0.30072599,
        -0.48800971,  2.42655409]),
 array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.]]),
 {'funcalls': 9,
  'grad': array([ 0.55682554,  0.95166548,  1.62743453, ...,  0.        ,
          0.        ,  0.        ]),
  'nit': 1,
  'task': b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH',
  'warnflag': 0})