In [None]:
import nltk

# Fetch the "all" collection of NLTK data so the corpora and models used by the
# cells below are available locally.
# NOTE(review): "all" is a large download; presumably only a few packages are
# actually needed by this notebook - confirm before trimming the list.
nltk.download("all")

---

Example on the `ieer` corpus.

In [2]:
import re
import nltk

# Search for strings that contain the word "in".
#
# \b matches the empty string, but only at the beginning or end of a word
# (b = boundary).
# (?!...) is a negative lookahead assertion: it matches only if ... does NOT
# match next. It is used here to disregard strings such as
# "success in supervising", where "in" is followed by a gerund.
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

# Use the documents from the IEER corpus - New York Times, 15 March 1998.
# (see details here: http://www.nltk.org/_modules/nltk/corpus/reader/ieer.html)
docs = nltk.corpus.ieer.parsed_docs('NYT_19980315')

# Print every ORG/LOC relation that extract_rels returns for the IN pattern.
for doc in docs:
    for rel in nltk.sem.relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.relextract.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


---
Example on the `treebank` corpus from the NLTK library, which appears to work fine.

In [3]:
# Pattern that accepts any text mentioning one of these job titles.
ROLE = re.compile(
    r'.*(chairman|president|trader|scientist|economist|analyst|partner).*'
)
rels = []
# Only the first 1500 tagged treebank sentences are scanned.
for sent_no, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
    # Replace the POS-tagged sentence with its named-entity chunk tree.
    sent = nltk.ne_chunk(sent)
    rels = nltk.sem.relextract.extract_rels(
        'PER', 'ORG', sent,
        corpus='ace', pattern=ROLE, window=7,
    )
    for rel in rels:
        # Left-align the sentence index in a 5-character column.
        print('{:<5}{}'.format(sent_no, nltk.sem.relextract.rtuple(rel)))

1    [PER: 'Vinken/NNP'] 'is/VBZ chairman/NN of/IN' [ORG: 'Elsevier/NNP']
254  [PER: 'Shugart/NNP'] ',/, currently/RB chairman/NN of/IN' [ORG: 'Seagate/NNP Technology/NNP']
325  [PER: 'George/NNP Foot/NNP'] ',/, a/DT managing/VBG partner/NN at/IN' [ORG: 'Newgate/NNP Management/NNP Associates/NNP']
331  [PER: 'Michael/NNP Porter/NNP'] ',/, an/DT analyst/NN at/IN' [ORG: 'Smith/NNP Barney/NNP']
391  [PER: 'Elliott/NNP Platt/NNP'] ',/, an/DT economist/NN at/IN' [ORG: 'Donaldson/NNP']
485  [PER: 'Arafat/NNP'] 'has/VBZ written/VBN to/TO the/DT chairman/NN of/IN the/DT' [ORG: 'International/NNP']
624  [PER: 'Stephen/NNP Salmore/NNP'] ',/, a/DT political/JJ scientist/NN at/IN' [ORG: 'New/NNP Jersey/NNP']
891  [PER: 'George/NNP Jennison/NNP'] ',/, head/JJ trader/NN of/IN banking/NN issues/NNS in/IN' [ORG: 'Shearson/NNP']
928  [PER: 'Neal/NNP R./NNP Sonnett/NNP'] ',/, president/NN of/IN the/DT' [ORG: 'National/NNP Association/NNP']
1065 [PER: 'Michael/NNP Stark/NNP'] ',/, chip/NN analyst/NN at/I

---

Example on a custom sentence, calling `extract_rels` directly. (This doesn't find any relations, even though one is expected.)



In [4]:
# Pattern that accepts any text containing the whole word "works".
WORK = re.compile(r".*\bworks\b.*")

sent = "Tom works with James."

# Pipeline: tokenize -> POS-tag -> named-entity chunk.
tokenized_sent = nltk.word_tokenize(sent)
tagged_sent = nltk.pos_tag(tokenized_sent)
chunked = nltk.ne_chunk(tagged_sent)

# Ask extract_rels for PER/PER relations matching WORK.
# NOTE(review): the recorded result is an empty list; presumably extract_rels
# needs more context than a sentence with just two entities provides -
# confirm against the nltk.sem.relextract source.
nltk.sem.relextract.extract_rels(
    "PER", "PER", chunked, corpus="ace", pattern=WORK
)

[]

---

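Example on the same custom sentence, calling the lower-level `tree2semi_rel` and `semi_rel2reldict` helpers directly and padding the pairs list. (This finds the relation correctly)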

In [5]:
sent = "Tom works with James."

# Pipeline: tokenize -> POS-tag -> named-entity chunk.
tokenized_sent = nltk.word_tokenize(sent)
tagged_sent = nltk.pos_tag(tokenized_sent)
chunked = nltk.ne_chunk(tagged_sent)

# Relation to look for: PERSON <filler containing "works"> PERSON.
subjclass = "PERSON"
objclass = "PERSON"
pattern = re.compile(r".*\bworks\b.*")

# Group the chunk structure into a list of 'semi-relations'.
pairs = nltk.sem.relextract.tree2semi_rel(chunked)

# Turn the 'semi-relations' into dictionaries holding the subject and object
# NEs plus the filler between them. A dummy trailing entry ([[]]) is appended
# to the pairs before conversion.
reldicts = nltk.sem.relextract.semi_rel2reldict(pairs + [[[]]])


def relfilter(x):
    """Tell whether a relation dict has the wanted classes and a matching filler."""
    return (x["subjclass"] == subjclass and
            pattern.match(x["filler"]) and
            x["objclass"] == objclass)


# Keep only the relation dicts accepted by relfilter.
rels = [reldict for reldict in reldicts if relfilter(reldict)]

# Print the relations found in the text.
for rel in rels:
    print(nltk.sem.relextract.rtuple(rel))

[PER: 'Tom/NNP'] 'works/VBZ with/IN' [PER: 'James/NNP']


---

Another example on a custom sentence. (This correctly doesn't find any relations)

In [6]:
sent = "Tom works at Microsoft."

# Pipeline: tokenize -> POS-tag -> named-entity chunk.
tokenized_sent = nltk.word_tokenize(sent)
tagged_sent = nltk.pos_tag(tokenized_sent)
chunked = nltk.ne_chunk(tagged_sent)

# Relation to look for: PERSON <filler containing "works"> PERSON.
subjclass = "PERSON"
objclass = "PERSON"
pattern = re.compile(r".*\bworks\b.*")

# Group the chunk structure into a list of 'semi-relations'.
pairs = nltk.sem.relextract.tree2semi_rel(chunked)

# Turn the 'semi-relations' into dictionaries holding the subject and object
# NEs plus the filler between them. A dummy trailing entry ([[]]) is appended
# to the pairs before conversion.
reldicts = nltk.sem.relextract.semi_rel2reldict(pairs + [[[]]])


def relfilter(x):
    """Tell whether a relation dict has the wanted classes and a matching filler."""
    return (x["subjclass"] == subjclass and
            pattern.match(x["filler"]) and
            x["objclass"] == objclass)


# Keep only the relation dicts accepted by relfilter.
rels = [reldict for reldict in reldicts if relfilter(reldict)]

# Print the relations found in the text.
for rel in rels:
    print(nltk.sem.relextract.rtuple(rel))