In [None]:
# Plumbing
1. Download the phrase similarity dataset from http://homepages.inf.ed.ac.uk/mlap/resources/index.html, save as `phrase_similarities.txt`
2. Download the EasyCCG parser from http://homepages.inf.ed.ac.uk/s1049478/easyccg.html, unpack the package (you should get a catalog like `easyccg-0.2`). From the same page, download the regular pretrained model (`model.tar.gz`). Unpack the model to the parser's catalog.

# Getting the British National Corpus & the word list

We will parse BNC XML files with lxml. NLTK technically has a dedicated parser for BNC, which is extremely slow in the lazy mode, and in the non-lazy mode it is very slow and also consumes >8GB of memory.

In [1]:
bnc_path = 'BNC/Texts/'
from os.path import exists

def bnc_files_iter():
    top_level = ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'K']
    symbols = top_level + ['L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'W', 'V', 'X', 'Y', 'Z',
                           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for top in top_level:
        top_path = bnc_path + '/' + top
        if not exists(top_path):
            continue
        for symbol2 in symbols:
            path2 = top_path + '/' + top + symbol2
            if not exists(path2):
                continue
            for symbol3 in symbols:
                current_path = path2 + '/' + top + symbol2 + symbol3 + '.xml'
                if not exists(current_path):
                    continue
                yield open(current_path)

In [8]:
unique_words = set()
from lxml import etree

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            unique_words.add(element.text)
    bnc_file.close()
    
unique_words = list(unique_words)
print(unique_words[:10])

['Bindman ', 'Fazio ', 'Lancake ', 'McWILLIAM ', 'Cacioppo', 'middest ', '72.9 ', '913,000 ', 'concordance ', '96,000']


In [3]:
unique_count = len(unique_words)
print(unique_count)

948223


In [7]:
# try stemming just for the embedding?
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()
stemmed_words = [stemmer.stem(word) for word in unique_words]
stemmed_words = list(set(stemmed_words))
print(len(stemmed_words))

743062


# Getting CCG parse trees for BNC

In [10]:
# we will run the underlying parser as a subprocess, and intercept its outputs from within Python
from subprocess import Popen, PIPE, STDOUT
p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
# .encode() gives bytes instead of str, as .communicate() requires. We get a pair (stdout, stderr):
(parse, err) = p.communicate(input='The cat chases a ball of yarn.\n'.encode())
print(parse, '\n', err)
p.terminate()

b'ID=1\n(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS The NP[nb]/N>) (<L N POS POS cat N>) ) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/NP POS POS chases (S[dcl]\\NP)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<L N POS POS ball N>) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS of (NP\\NP)/NP>) (<T NP 0 1> (<L N POS POS yarn. N>) ) ) ) ) ) \n' 
 b'Loading model...\nModel loaded, ready to parse.\n'


Let's see how NLTK can handle parse trees.

In [24]:
# some string cleanup
def clean_parse_output(parse_output):
    # (remember we have to deal with the parse returned as bytes, not a Unicode string)
    return str(parse_output).split('\\n')[1] # the second line contains the parse itself

from nltk.tree import Tree
tree = Tree.fromstring(clean_parse_output(parse))
print(tree)

(<T
  S[dcl]
  1
  2>
  (<T
    NP[nb]
    0
    2>
    (<L NP[nb]/N POS POS The NP[nb]/N>)
    (<L N POS POS cat N>))
  (<T
    S[dcl]\\NP
    0
    2>
    (<L (S[dcl]\\NP ) /NP POS POS chases (S[dcl]\\NP ) /NP>)
    (<T
      NP[nb]
      0
      2>
      (<T
        NP[nb]
        0
        2>
        (<L NP[nb]/N POS POS a NP[nb]/N>)
        (<L N POS POS ball N>))
      (<T
        NP\\NP
        0
        2>
        (<L (NP\\NP ) /NP POS POS of (NP\\NP ) /NP>)
        (<T NP 0 1> (<L N POS POS yarn. N>))))))


It's not very pretty, because NLTK decides to print a newline instead of space inside the less/more than signs. In each (parenthesized expression), the first item (head) is the category of node, and two next items are its child nodes.

In [18]:
trees = []
p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if element.tag == 's':
            sentence = ''
            for nested_element in element.iter():
                if (nested_element.tag == 'w' or nested_element.tag == 'c') and nested_element.text:
                    sentence += ' ' + nested_element.text
            parse_out = p.communicate(input=sentence.encode())[0]
            p.terminate()
            print(sentence)
            print(parse_out)
            trees.append(parse_out)
            p = Popen(['java', '-jar', 'easyccg-0.2/easyccg.jar', '--model', 'easyccg-0.2/model'], stdout=PIPE, stdin=PIPE, stderr=PIPE)
    bnc_file.close()
    if(len(trees) == 5):
        break
   
print(trees[:5])
p.terminate()

 ‘ Arrest  warrant  out  for  Clowes ’  partner  years  before  collapse' .
b"ID=1\n(<T S[dcl] 1 2> (<T NP 1 2> (<L LRB POS POS \xe2\x80\x98 LRB>) (<T NP 0 1> (<L N POS POS Arrest N>) ) ) (<T S[dcl]\\NP 0 2> (<T S[dcl]\\NP 0 2> (<T (S[dcl]\\NP)/PP 0 2> (<L (S[dcl]\\NP)/PP POS POS warrant (S[dcl]\\NP)/PP>) (<L (S\\NP)\\(S\\NP) POS POS out (S\\NP)\\(S\\NP)>) ) (<T (S[X]\\NP)\\((S[X]\\NP)/PP) 0 1> (<T PP 0 2> (<L PP/NP POS POS for PP/NP>) (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Clowes N/N>) (<T N 1 2> (<L N/N POS POS \xe2\x80\x99 N/N>) (<T N 1 2> (<L N/N POS POS partner N/N>) (<L N POS POS years N>) ) ) ) ) ) ) ) (<T (S\\NP)\\(S\\NP) 0 2> (<L ((S\\NP)\\(S\\NP))/NP POS POS before ((S\\NP)\\(S\\NP))/NP>) (<T NP 0 2> (<T NP 0 1> (<L N POS POS collapse' N>) ) (<L . POS POS . .>) ) ) ) ) \n"
 By  Daniel  John
b'ID=1\n(<T NP 1 2> (<L NP/NP POS POS By NP/NP>) (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Daniel N/N>) (<L N POS POS John N>) ) ) ) \n'
 AWARRANT  for  the  arrest  of  the  former  partner 

 In  his  report  published  two  days  ago  in  which  he  accused  the  Department  of  Trade  and  Industry  of  ‘ substantial  maladministration ’  over  the  Barlow  Clowes  affair ,  Sir  Anthony  wrote :  ‘ He  ( the  liquidator )  believed  that  the  warrant  for  her  arrest  still  remained  outstanding . ’
b'ID=1\n(<T S[dcl] 1 2> (<T S/S 0 2> (<L (S/S)/NP POS POS In (S/S)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS his NP[nb]/N>) (<L N POS POS report N>) ) (<T NP\\NP 0 1> (<T S[pss]\\NP 0 2> (<L S[pss]\\NP POS POS published S[pss]\\NP>) (<T (S\\NP)\\(S\\NP) 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS two N/N>) (<L N POS POS days N>) ) ) (<L ((S\\NP)\\(S\\NP))\\NP POS POS ago ((S\\NP)\\(S\\NP))\\NP>) ) ) ) ) (<T NP\\NP 0 2> (<T (NP\\NP)/S[dcl] 1 2> (<L (NP\\NP)/NP POS POS in (NP\\NP)/NP>) (<L ((NP\\NP)/S[dcl])\\((NP\\NP)/NP) POS POS which ((NP\\NP)/S[dcl])\\((NP\\NP)/NP)>) ) (<T S[dcl] 1 2> (<L NP POS POS he NP>) (<T S[dcl]\\NP 0 2> (<T (S[dcl]\

 It  is  understood  that  Farrington  Stead  was  set  up  by  former  employees  of  Barlow  Clowes .
b'ID=1\n(<T S[dcl] 1 2> (<L NP POS POS It NP>) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/(S[pss]\\NP) POS POS is (S[dcl]\\NP)/(S[pss]\\NP)>) (<T S[pss]\\NP 0 2> (<L (S[pss]\\NP)/S[em] POS POS understood (S[pss]\\NP)/S[em]>) (<T S[em] 0 2> (<L S[em]/S[dcl] POS POS that S[em]/S[dcl]>) (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Farrington N/N>) (<L N POS POS Stead N>) ) ) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/(S[pss]\\NP) POS POS was (S[dcl]\\NP)/(S[pss]\\NP)>) (<T S[pss]\\NP 0 2> (<T S[pss]\\NP 0 2> (<L S[pss]\\NP POS POS set S[pss]\\NP>) (<L (S\\NP)\\(S\\NP) POS POS up (S\\NP)\\(S\\NP)>) ) (<T (S\\NP)\\(S\\NP) 0 2> (<L ((S\\NP)\\(S\\NP))/NP POS POS by ((S\\NP)\\(S\\NP))/NP>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS former N/N>) (<L N POS POS employees N>) ) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS of (NP\\NP)/NP>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Bar

 Mr  Peter  Lucas ,  a  director  of  Bond  Corporation ,  named  the  man  as  Mr  David  Michael ,  who ,  he  said ,  had  been  responsible  for  top-level  corporation  security  until  his  dismissal  last  October .
b'ID=1\n(<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Mr N/N>) (<T N 1 2> (<L N/N POS POS Peter N/N>) (<L N POS POS Lucas N>) ) ) ) (<T NP[nb]\\NP[nb] 1 2> (<L , POS POS , ,>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<L N POS POS director N>) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS of (NP\\NP)/NP>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Bond N/N>) (<L N POS POS Corporation N>) ) ) (<L , POS POS , ,>) ) ) ) ) ) (<T S[dcl]\\NP 0 2> (<T S[dcl]\\NP 0 2> (<T (S[dcl]\\NP)/PP 0 2> (<T (S[dcl]\\NP)/PP 0 2> (<L ((S[dcl]\\NP)/PP)/NP POS POS named ((S[dcl]\\NP)/PP)/NP>) (<T NP[nb] 0 2> (<L NP[nb]/N POS POS the NP[nb]/N>) (<L N POS POS man N>) ) ) (<T (S\\NP)\\(S\\NP) 0 2> (<L ((S\\NP)\\(S\\NP))/S[dcl] POS POS as ((S\\

 This  year  Michael  had  received  a  special  briefing  from  the  Bond  Corporation  to  check  the  source  of  leaks  of  some  highly  confidential  commercial  information .
b'ID=1\n(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS This NP[nb]/N>) (<T N 1 2> (<L N/N POS POS year N/N>) (<L N POS POS Michael N>) ) ) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/(S[pt]\\NP) POS POS had (S[dcl]\\NP)/(S[pt]\\NP)>) (<T S[pt]\\NP 0 2> (<L (S[pt]\\NP)/NP POS POS received (S[pt]\\NP)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<T N 1 2> (<L N/N POS POS special N/N>) (<L N POS POS briefing N>) ) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS from (NP\\NP)/NP>) (<T NP[nb] 0 2> (<L NP[nb]/N POS POS the NP[nb]/N>) (<T N 1 2> (<L N/N POS POS Bond N/N>) (<T N 0 2> (<L N POS POS Corporation N>) (<T N\\N 0 1> (<T S[to]\\NP 0 2> (<L (S[to]\\NP)/(S[b]\\NP) POS POS to (S[to]\\NP)/(S[b]\\NP)>) (<T S[b]\\NP 0 2> (<L (S[b]\\NP)/NP POS POS check (S[b]\\NP)/NP>) (<T NP[nb] 0 2> (<T NP[

 Junk  bond  setback .
b'ID=1\n(<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Junk N/N>) (<T N 1 2> (<L N/N POS POS bond N/N>) (<L N POS POS setback N>) ) ) ) (<L . POS POS . .>) ) \n'
 The  US  junk  bond  market  suffered  its  latest  setback  yesterday  after  buyout  firm  Kohlberg  Kravis  Roberts  said  one  of  its  companies  was  on  the  verge  of  bankruptcy ,  writes  Mary  Brasier  in  New  York .
b'ID=1\n(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS The NP[nb]/N>) (<T N 1 2> (<L N/N POS POS US N/N>) (<T N 1 2> (<L N/N POS POS junk N/N>) (<T N 1 2> (<L N/N POS POS bond N/N>) (<L N POS POS market N>) ) ) ) ) (<T S[dcl]\\NP 0 2> (<T S[dcl]\\NP 0 2> (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/NP POS POS suffered (S[dcl]\\NP)/NP>) (<T NP[nb] 0 2> (<L NP[nb]/N POS POS its NP[nb]/N>) (<T N 1 2> (<L N/N POS POS latest N/N>) (<L N POS POS setback N>) ) ) ) (<L (S\\NP)\\(S\\NP) POS POS yesterday (S\\NP)\\(S\\NP)>) ) (<T (S\\NP)\\(S\\NP) 0 2> (<L ((S\\NP)\\(S\\NP))/S[dcl] POS P

 More  large  pay  rises  for  directors  have  been  revealed .
b'ID=1\n(<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<T N/N 1 2> (<L (N/N)/(N/N) POS POS More (N/N)/(N/N)>) (<L N/N POS POS large N/N>) ) (<T N 1 2> (<L N/N POS POS pay N/N>) (<L N POS POS rises N>) ) ) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS for (NP\\NP)/NP>) (<T NP 0 1> (<L N POS POS directors N>) ) ) ) (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/(S[pt]\\NP) POS POS have (S[dcl]\\NP)/(S[pt]\\NP)>) (<T S[pt]\\NP 0 2> (<L (S[pt]\\NP)/(S[pss]\\NP) POS POS been (S[pt]\\NP)/(S[pss]\\NP)>) (<T S[pss]\\NP 0 2> (<L S[pss]\\NP POS POS revealed S[pss]\\NP>) (<L . POS POS . .>) ) ) ) ) \n'
 Trafalgar  House  raised  chief  executive  Eric  Parker 's  salary  by  62.7  per  cent  to  £480,000  and  chairman  Sir  Nigel  Broackes '  by  37.6  per  cent  to  £300,000 .
b"ID=1\n(<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Trafalgar N/N>) (<L N POS POS House N>) ) ) (<T S[dcl]\\NP 0 2> (<T S[dcl]\\NP 0 2> (<T S[dcl]\\NP 0 2

 Unit  trust  boost .
b'ID=1\n(<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Unit N/N>) (<T N 1 2> (<L N/N POS POS trust N/N>) (<L N POS POS boost N>) ) ) ) (<L . POS POS . .>) ) \n'
 Net  new  investment  in  unit  trusts  last  month  was  £339.4million ,  some  30  per  cent  higher  than  the  monthly  average  so  far  this  year .
b'ID=1\n(<T S[dcl] 1 2> (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS Net N/N>) (<T N 1 2> (<L N/N POS POS new N/N>) (<L N POS POS investment N>) ) ) ) (<T NP\\NP 0 2> (<L (NP\\NP)/NP POS POS in (NP\\NP)/NP>) (<T NP 0 2> (<T NP 0 1> (<T N 1 2> (<L N/N POS POS unit N/N>) (<L N POS POS trusts N>) ) ) (<T NP\\NP 1 2> (<L (NP\\NP)/(NP\\NP) POS POS last (NP\\NP)/(NP\\NP)>) (<L NP\\NP POS POS month NP\\NP>) ) ) ) ) (<T S[dcl]\\NP 0 2> (<T S[dcl]\\NP 0 2> (<L (S[dcl]\\NP)/NP POS POS was (S[dcl]\\NP)/NP>) (<T NP 0 2> (<T NP 0 1> (<L N POS POS \xc2\xa3339.4million N>) ) (<T NP[nb]\\NP[nb] 1 2> (<L , POS POS , ,>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[n

KeyboardInterrupt: 