# Plumbing
1. Download the phrase similarity dataset from http://homepages.inf.ed.ac.uk/mlap/resources/index.html, save as `phrase_similarities.txt`
2. Download the EasyCCG parser from http://homepages.inf.ed.ac.uk/s1049478/easyccg.html, unpack the package (you should get a catalog like `easyccg-0.2`). From the same page, download the regular pretrained model (`model.tar.gz`). Unpack the model to the parser's catalog.

# Getting the British National Corpus & the word list

We will parse BNC XML files with lxml. NLTK technically has a dedicated parser for BNC, which is extremely slow in the lazy mode, and in the non-lazy mode it is very slow and also consumes >8GB of memory.

In [6]:
bnc_path = 'BNC/Texts/'
from os.path import exists

def bnc_files_iter():
    top_level = ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'K']
    symbols = top_level + ['L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'W', 'V', 'X', 'Y', 'Z',
                           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for top in top_level:
        top_path = bnc_path + '/' + top
        if not exists(top_path):
            continue
        for symbol2 in symbols:
            path2 = top_path + '/' + top + symbol2
            if not exists(path2):
                continue
            for symbol3 in symbols:
                current_path = path2 + '/' + top + symbol2 + symbol3 + '.xml'
                if not exists(current_path):
                    continue
                yield open(current_path)

In [7]:
from lxml import etree

In [18]:
unique_words = set()

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            unique_words.add(element.text.strip())
    bnc_file.close()
    
unique_words = list(unique_words)
print(unique_words[:10])

['', 'bedewed', 'Chancellorship', 'primatu', 'foreign/non-national', 'Morulae', 'conqueror', 'Super-Bush', '£858', 'B.E.Ward']


In [19]:
unique_count = len(unique_words)
print(unique_count)

705241


In [20]:
# try stemming just for the embedding?
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()
stemmed_words = [stemmer.stem(word) for word in unique_words]
stemmed_words = list(set(stemmed_words))
print(len(stemmed_words))

497225


# Getting CCG parse trees for BNC

In [3]:
# we will run the underlying parser as a subprocess, and intercept its outputs from within Python
from subprocess import Popen, PIPE, STDOUT
p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
# .encode() gives bytes instead of str, as .communicate() requires. We get a pair (stdout, stderr):
(parse, err) = p.communicate(input='The cat chases a ball of yarn.\n'.encode())
print(parse, '\n', err)
p.terminate()

b'ID=1\n(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS The NP[nb]/N>) (<L N POS POS cat N>) ) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/NP POS POS chases (S[dcl]\\NP)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<L N POS POS ball N>) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS of (NP\\NP)/NP>) (<T NP 0 1> (<L N POS POS yarn. N>) ) ) ) ) ) \n' 
 b'Loading model...\nModel loaded, ready to parse.\n'


Let's see how NLTK can handle parse trees.

In [4]:
# some string cleanup
def clean_parse_output(parse_output):
    # (remember we have to deal with the parse returned as bytes, not a Unicode string)
    lines = str(parse_output).split('\\n')
    if len(lines) > 1:
        return lines[1] # the second line contains the parse itself
    else:
        return lines[0]

from nltk.tree import Tree
tree = Tree.fromstring(clean_parse_output(parse))
print(tree)

(<T
  S[dcl]
  1
  2>
  (<T
    NP[nb]
    0
    2>
    (<L NP[nb]/N POS POS The NP[nb]/N>)
    (<L N POS POS cat N>))
  (<T
    S[dcl]\\NP
    0
    2>
    (<L (S[dcl]\\NP ) /NP POS POS chases (S[dcl]\\NP ) /NP>)
    (<T
      NP[nb]
      0
      2>
      (<T
        NP[nb]
        0
        2>
        (<L NP[nb]/N POS POS a NP[nb]/N>)
        (<L N POS POS ball N>))
      (<T
        NP\\NP
        0
        2>
        (<L (NP\\NP ) /NP POS POS of (NP\\NP ) /NP>)
        (<T NP 0 1> (<L N POS POS yarn. N>))))))


It's not very pretty, because NLTK decides to print a newline instead of space inside the less/more than signs. In each (parenthesized expression), the first item (head) is the category of node, and two next items are its child nodes.

In [None]:
def traverse(tree):
    for node in tree:
        return

In [8]:
trees = []
p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if element.tag == 's':
            sentence = ''
            for nested_element in element.iter():
                if (nested_element.tag == 'w' or nested_element.tag == 'c') and nested_element.text:
                    sentence += ' ' + nested_element.text
            parse_output = p.communicate(input=sentence.encode())[0]
            p.terminate()
            trees.append(clean_parse_output(parse_output))
            p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
    bnc_file.close()
    if len(trees) >= 3:
        break
   
print(trees[:3])
p.terminate()

["(<T S[dcl] 1 2> (<T NP 1 2> (<L LRB POS POS \\xe2\\x80\\x98 LRB>) (<T NP 0 1> (<L N POS POS Arrest N>) ) ) (<T S[dcl]\\\\NP 0 2> (<T S[dcl]\\\\NP 0 2> (<T (S[dcl]\\\\NP)/PP 0 2> (<L (S[dcl]\\\\NP)/PP POS POS warrant (S[dcl]\\\\NP)/PP>) (<L (S\\\\NP)\\\\(S\\\\NP) POS POS out (S\\\\NP)\\\\(S\\\\NP)>) ) (<T (S[X]\\\\NP)\\\\((S[X]\\\\NP)/PP) 0 1> (<T PP 0 2> (<L PP/NP POS POS for PP/NP>) (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Clowes N/N>) (<T N 1 2> (<L N/N POS POS \\xe2\\x80\\x99 N/N>) (<T N 1 2> (<L N/N POS POS partner N/N>) (<L N POS POS years N>) ) ) ) ) ) ) ) (<T (S\\\\NP)\\\\(S\\\\NP) 0 2> (<L ((S\\\\NP)\\\\(S\\\\NP))/NP POS POS before ((S\\\\NP)\\\\(S\\\\NP))/NP>) (<T NP 0 2> (<T NP 0 1> (<L N POS POS collapse' N>) ) (<L . POS POS . .>) ) ) ) ) ", '(<T NP 1 2> (<L NP/NP POS POS By NP/NP>) (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Daniel N/N>) (<L N POS POS John N>) ) ) ) ', '(<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 1> (<L N POS POS AWARRANT N>) ) (<T NP\\\\NP 0 2> (<L (NP\\\\NP)/NP POS P

## Learning word embeddings

Our embedding procedure will be based on this Tensorflow [word2vec tutorial](https://www.tensorflow.org/tutorials/word2vec).

In [52]:
# Consistently map each unique word to a integer.
word_map = {word: index for index, word in enumerate(unique_words) }

In [22]:
# Collect all words from the corpus, as their indices in the word map.
corpus_words = []

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            corpus_words.append(word_map[element.text.strip()])
    bnc_file.close()

Generate batches of pairs (context word, target word). For simplicity, we hardcode the window size (2) and number of examples in window.

In [50]:
from random import randint

num_samples = 4

def skipgrams_batch(batch_size):
    assert batch_size % num_samples == 0
    windows_n = batch_size // num_samples
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    for i in range(windows_n):
        target = randint(2, len(corpus_words)-3)
        for j in range(num_samples):
            labels[i*num_samples+j][0] = corpus_words[target]
        batch[i*num_samples] = corpus_words[target-2]
        batch[i*num_samples+1] = corpus_words[target-1]
        batch[i*num_samples+2] = corpus_words[target+1]
        batch[i*num_samples+3] = corpus_words[target+2]
        
    return batch, labels

print(skipgrams_batch(12))

(array([545647, 130645, 477349, 423144, 130645, 690827, 636390, 245211,
        26147, 153056, 451583, 401296], dtype=int32), array([[697585],
       [697585],
       [697585],
       [697585],
       [493937],
       [493937],
       [493937],
       [493937],
       [660882],
       [660882],
       [660882],
       [660882]], dtype=int32))


In [26]:
import tensorflow as tf
import math

In [27]:
vocabulary_size = len(unique_words)
embedding_size = 70

# Model parameters.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                              stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [60]:
batch_size = 128

# The computation graph.
inputs = tf.placeholder(tf.int32, shape=[batch_size])
labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
embedding_layer = tf.nn.embedding_lookup(embeddings, inputs)

# Note that word2vec has no "real" hidden layers apart from the embedding.

# Number of random words to sample apart from the true target; the model should learn to
# assign low probability to them given the context.
negative_samples_n = 64

loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=labels,
                                     inputs=embedding_layer,
                                     num_sampled=negative_samples_n,
                                     num_classes=vocabulary_size))
optimizer = tf.train.AdagradOptimizer(0.1).minimize(loss)

In [64]:
steps_n = 1000
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for i in range(steps_n):
        batch_inputs, batch_labels = skipgrams_batch(batch_size)
        _, loss_val = sess.run([optimizer, loss], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
        if (i % 100 == 0):
            print(loss_val)

360.009
404.959
338.814
338.927
322.031
325.635
270.039
276.532
271.01
256.085


## Learning the transformation matrix

In [2]:
import numpy as np
from scipy.optimize import fmin_l_bfgs_b as l_bfgs

In [None]:
pairs_to_encode = []
def encoding_error(transf_matrix):
    pass