# Plumbing
1. Download the phrase similarity dataset from http://homepages.inf.ed.ac.uk/mlap/resources/index.html, save as `phrase_similarities.txt`
2. Download the EasyCCG parser from http://homepages.inf.ed.ac.uk/s1049478/easyccg.html, unpack the package (you should get a catalog like `easyccg-0.2`). From the same page, download the regular pretrained model (`model.tar.gz`). Unpack the model to the parser's catalog.

# Getting the British National Corpus & the word list

In [2]:
from nltk.corpus.reader.bnc import BNCCorpusReader
bnc = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml', lazy=False) # https://github.com/nltk/nltk/issues/781
print(bnc)

<BNCCorpusReader in '/home/szymon/lingwy/trening_głębi/repo/01-The Role of Syntax in Vector Space Models (Hermann & Blunsom 2013)/BNC/Texts'>


In [3]:
print(bnc.sents()[:10])

[['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?'], ['AIDS', '(', 'Acquired', 'Immune', 'Deficiency', 'Syndrome', ')', 'is', 'a', 'condition', 'caused', 'by', 'a', 'virus', 'called', 'HIV', '(', 'Human', 'Immuno', 'Deficiency', 'Virus', ')', '.'], ['This', 'virus', 'affects', 'the', 'body', "'s", 'defence', 'system', 'so', 'that', 'it', 'can', 'not', 'fight', 'infection', '.'], ['How', 'is', 'infection', 'transmitted', '?'], ['through', 'unprotected', 'sexual', 'intercourse', 'with', 'an', 'infected', 'partner', '.'], ['through', 'infected', 'blood', 'or', 'blood', 'products', '.'], ['from', 'an', 'infected', 'mother', 'to', 'her', 'baby', '.'], ['It', 'is', 'not', 'transmitted', 'from', ':'], ['giving', 'blood/mosquito', 'bites/toilet', 'seats/kissing/from', 'normal', 'day-to-day', 'contact'], ['How', 'does', 'it', 'affect', 'you', '?']]


In [6]:
# A word->id mapping.
unique_words = set(bnc.words())

TypeError: 'set' object is not subscriptable

In [8]:
unique_words = list(unique_words)
print(unique_words[:10])
unique_count = len(unique_words)
print(unique_count)

['', 'overplayful', 'garage-less', 'TRINZIC', 'fileserver', 'M6', 'ultra-refined', '341,000,000', 'welldisciplined', 'shitkickers']
762481


In [9]:
# try stemming just for the embedding?
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()
stemmed_words = [stemmer.stem(word) for word in unique_words]
stemmed_words = list(set(stemmed_words))
print(len(stemmed_words))

540129


# Getting CCG parse trees for BNC

In [17]:
# we will run the underlying parser as a subprocess, and intercept its outputs from within Python
from subprocess import Popen, PIPE, STDOUT
p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
# .encode() gives bytes instead of str, as .communicate() requires. We get a pair (stdout, stderr):
(parse, err) = p.communicate(input='The cat chases a ball of yarn.\n'.encode())
print(parse, '\n', err)
p.terminate()

b'ID=1\n(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS The NP[nb]/N>) (<L N POS POS cat N>) ) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/NP POS POS chases (S[dcl]\\NP)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<L N POS POS ball N>) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS of (NP\\NP)/NP>) (<T NP 0 1> (<L N POS POS yarn. N>) ) ) ) ) ) \n' 
 b'Loading model...\nModel loaded, ready to parse.\n'


Let's see how NLTK can handle parse trees.

In [24]:
# some string cleanup
def clean_parse_output(parse_output):
    # (remember we have to deal with the parse returned as bytes, not a Unicode string)
    return str(parse_output).split('\\n')[1] # the second line contains the parse itself

from nltk.tree import Tree
tree = Tree.fromstring(clean_parse_output(parse))
print(tree)

(<T
  S[dcl]
  1
  2>
  (<T
    NP[nb]
    0
    2>
    (<L NP[nb]/N POS POS The NP[nb]/N>)
    (<L N POS POS cat N>))
  (<T
    S[dcl]\\NP
    0
    2>
    (<L (S[dcl]\\NP ) /NP POS POS chases (S[dcl]\\NP ) /NP>)
    (<T
      NP[nb]
      0
      2>
      (<T
        NP[nb]
        0
        2>
        (<L NP[nb]/N POS POS a NP[nb]/N>)
        (<L N POS POS ball N>))
      (<T
        NP\\NP
        0
        2>
        (<L (NP\\NP ) /NP POS POS of (NP\\NP ) /NP>)
        (<T NP 0 1> (<L N POS POS yarn. N>))))))


It's not very pretty, because NLTK decides to print a newline instead of space inside the less/more than signs. In each (parenthesized expression), the first item (head) is the category of node, and two next items are its child nodes.

In [None]:
#p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)

trees = []#[ None ] * len(bnc.sents())
for (sent_n, sent) in enumerate(bnc.sents()):
    input_sent = (' '.join(sent)+'\n').encode()
    #parse_out = p.communicate(input=input_sent)
    #trees[sent_n] = parse_str
    trees.append(input_sent)
    #if sent_n > 100:
    #    break

#p.terminate()
print(trees[:10])