"""A Jython interface to the Stanford parser. Includes various utilities to manipulate
parsed sentences:
* parsing text containing XML tags,
* obtaining probabilities for different analyses,
* extracting dependency relations,
* extracting subtrees,
* finding the shortest path between two nodes,
* printing the parse in various formats.
See examples after the if __name__ == "__main__" hook.
INSTALLATION:
1. Download the parser from http://nlp.stanford.edu/downloads/lex-parser.shtml
2. Unpack into a local dir, put the path to stanford-parser.jar in the -cp arg in jython.bat
3. Put the path to englishPCFG.ser.gz as parser_file arg to StanfordParser
USAGE:
1. Produce an FDG-style parse (a table listing one word per line, with tags):
parser = StanfordParser('englishPCFG.ser.gz')
To strip all XML tags before parsing:
sentence = parser.parse('This is a <b>test</b>.')
To keep XML tags provided in the input text:
sentence = parser.parse_xml('This is a <b>test</b>.')
To print the sentence as a table (one word per line):
sentence.print_table()
To print the sentence as a parse tree:
sentence.print_tree()
2. Retrieve the 5 best parses with associated probabilities for the last-parsed sentence:
parser = StanfordParser('englishPCFG.ser.gz')
sentence = parser.parse('This is a test')
for candidate_tree in parser.lp.getKBestPCFGParses(5):
print 'Prob:', math.e**candidate_tree.score()
print 'Tree:'
s = Sentence(parser.gsf, candidate_tree.object())
s.print_table()
On input, the script accepts unicode, UTF-8 or Latin-1 strings.
On output, it produces unicode.
"""

import sys, re, string, math

if 'java' not in sys.platform:
    raise Exception("The script should be run from Jython!")

from java.util import Arrays
from edu.stanford.nlp.trees import PennTreebankLanguagePack, TreePrint
from edu.stanford.nlp.parser.lexparser import LexicalizedParser
from edu.stanford.nlp.process import Morphology, PTBTokenizer, WordTokenFactory
from java.io import StringReader


def stanford2tt(sentence):
"""Given a Sentence object, return TreeTagger-style tuples (word, tag, lemma).
"""
for k in sorted(sentence.word):
word = sentence.word.get(k, '')
if word.startswith('<'):
tag, lemma = 'XML', word
else:
tag = sentence.tag.get(k, '')
lemma = sentence.lemma.get(k, '')
# correcting: TO -> IN
if word == 'to' and tag == 'TO':
tag = 'IN'
yield (word, tag, lemma)
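# A usage sketch for stanford2tt (the model path is illustrative, not part of
# this module):
#
#   parser = StanfordParser('englishPCFG.ser.gz')
#   for word, tag, lemma in stanford2tt(parser.parse('This is a test')):
#       print word, tag, lemma
#
# Each token yields a (word, tag, lemma) triple, e.g. ('This', 'DT', 'this');
# XML tags pass through with the pseudo-tag 'XML'.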


class Sentence:
"""An interface to the grammaticalStructure object of SP
"""

def __init__(self, gsf, parse, xmltags={}):
"""Create a Sentence object from parse.
@param gsf: a GrammaticalStructureFactory object
@param parse: a parse of the sentence
@param xmltags: maps the index of the preceding text token to the list of XML tags that follow it
"""
self.parse = parse
self.gs = gsf.newGrammaticalStructure(parse)
self.lemmer = Morphology()
self.xmltags = xmltags

# create indices
self.node = {}
self.word = {}
self.tag = {}
self.lemma = {}
self.dep = {}
self.rel = {}
self.children = {}

# insert any XML tags that appear before the first text token
if 0 in self.xmltags:
num_tags = len(self.xmltags[0])
for idx in xrange(num_tags):
tag_idx = (idx+1)/float(num_tags+1)
self.word[tag_idx] = self.xmltags[0][idx].decode('latin1')

# iterate over text tokens
for i in self.gs.getNodes():
if i.headTagNode() != None: continue
idx = i.index()
word = i.value().decode('latin1')

# correction: restore brackets escaped by the PTB tokeniser
if word == '-LRB-': word = u'('
elif word == '-RRB-': word = u')'

parent = i.parent()
tag = u'Z' if parent == None else parent.value().decode('latin1')
lemma = self.lemmer.lemmatize(self.lemmer.stem(word, tag)).lemma().decode('latin1')
p = self.gs.getGovernor(i)
if word in string.punctuation or p == None:
p_idx = 0
rel = 'punct'
else:
p_idx = p.index()
rel = str(self.gs.getGrammaticalRelation(p_idx, idx))

self.node[idx] = i
self.word[idx] = word
self.tag[idx] = tag
self.lemma[idx] = lemma
self.rel[idx] = rel
self.dep[idx] = p_idx
self.children[p_idx] = self.children.get(p_idx,[])
self.children[p_idx].append( idx )

# insert xml tags, if any
if idx in self.xmltags:
num_tags = len(self.xmltags[idx])
for t_num in xrange(num_tags):
tag_idx = (t_num+1)/float(num_tags+1)
self.word[idx+tag_idx] = self.xmltags[idx][t_num].decode('latin1')

def get_head(self, node):
"""Return a tuple with the head of the dependency for a node and the
relation label.
"""
idx = node.index()
dep_idx = self.dep.get(idx)
if not dep_idx: return None, None
return (self.node.get(dep_idx), self.rel.get(idx))
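# Example (sketch): one step up the dependency tree, assuming `sentence` is a
# parsed Sentence and `node` one of its nodes:
#
#   head, rel = sentence.get_head(node)
#   if head is not None:
#       print node.value(), '->', head.value(), '(%s)' % rel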

def get_children(self,node):
"""Yield tuples each with a child of the dependency
and the relation label
"""
for i in self.children.get(node.index(), []):
yield (self.node[i], self.rel[i])

def descendants(self,idx):
"""Return all descendants of a node, including the node itself.
"""
# recurse into the children index, avoiding the global state the
# earlier version relied on
result = [idx]
for i in self.children.get(idx, []):
    result.extend(self.descendants(i))
return result

def prune(self,idx):
"""Given an index, remove all the words dependent on the word with that index,
including the word itself.
"""
for i in self.descendants(idx):
self.delete_node(i)
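# Example (sketch): inspect and then delete a whole subtree; indices are the
# 1-based token indices used throughout this class:
#
#   print sentence.descendants(3)   # e.g. [3, 4, 5]: index 3 and its dependents
#   sentence.prune(3)               # remove those words from the sentence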

def delete_node(self,i):
del self.node[i], self.word[i], self.tag[i], self.lemma[i], self.rel[i], self.dep[i]
if i in self.children:
del self.children[i]

def get_plain_text(self):
"""Output plain-text sentence.
"""
text = ' '.join([self.word[i] for i in sorted(self.node)])
# remove spaces in front of commas, etc
for i in ',.:;!?':
text = text.replace(' ' + i, i)
return text
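# Example (sketch): join the tokens back into a plain string, with the space
# before sentence-final punctuation removed:
#
#   print sentence.get_plain_text()   # e.g. u'This is a test.'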

def least_common_node(self,n,m):
"""Return a node that is least common for two given nodes,
as well as the shortest path between the two nodes
@param n: index of node 1
@param m: index of node 2
"""

common_node = None
shortest_path = []
path1 = self.path2root(m)
path2 = self.path2root(n)

for i in path1:
if common_node != None: break
for j in path2:
if i == j:
common_node = i
break

if common_node != None:
for i in path1:
shortest_path.append(i)
if i == common_node: break
for i in path2:
if i == common_node: break
shortest_path.append(i)

return common_node, shortest_path
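# Example (sketch): lowest common ancestor of the words at indices 2 and 5,
# and the dependency path between them:
#
#   lcn, path = sentence.least_common_node(2, 5)
#   print 'LCN:', sentence.word.get(lcn)
#   print 'Path:', [sentence.word.get(i) for i in path]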

def path2root(self, i):
"""The path to the root from a node.
@param i: the index of the node
"""
path = [i]
if i != 0:
while 1:
p = self.dep.get(i)
if not p: break
path.append(p)
i = p
return path
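# Example (sketch): the chain of indices from a word up through its governors;
# the walk stops at the word attached to the artificial root (index 0):
#
#   print sentence.path2root(4)   # e.g. [4, 2]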

def print_table(self):
"""Print the parse as a table, FDG-style, to STDOUT
"""
for i in sorted(self.word):
# dep holds integer indices, so convert them for the join
line = '\t'.join([
    self.word.get(i,''),
    self.lemma.get(i,''),
    self.tag.get(i,''),
    self.rel.get(i,''),
    unicode(self.dep.get(i,'')),
])
print line.encode('utf8')

def print_tree(self, mode='penn'):
"""Prints the parse.
@param mode: penn/typedDependenciesCollapsed/etc
"""
tp = TreePrint(mode)
tp.printTree(self.parse)
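# Example (sketch): the mode names are those accepted by Stanford's TreePrint,
# such as the two mentioned in the docstring above:
#
#   sentence.print_tree('penn')
#   sentence.print_tree('typedDependenciesCollapsed')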

class StanfordParser:

TAG = re.compile(r'<[^>]+>')

def __init__(self, parser_file,
parser_options=['-maxLength', '80', '-retainTmpSubcategories']):
"""@param parser_file: path to the serialised parser model (e.g. englishPCFG.ser.gz)
@param parser_options: options
"""
self.lp = LexicalizedParser(parser_file)
self.lp.setOptionFlags(parser_options)
tlp = PennTreebankLanguagePack()
self.gsf = tlp.grammaticalStructureFactory()
self.wtf = WordTokenFactory()

def parse(self, s):
"""Strips XML tags first.
@param s: the sentence to be parsed, as a string
@return: a Sentence object
"""
# strip xml tags
s = self.TAG.sub('', s)

parse = self.lp.apply(s)
return Sentence(self.gsf, parse)
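# Example (sketch): parse a plain string; any XML markup is deleted first.
# The model path is illustrative:
#
#   sp = StanfordParser('englishPCFG.ser.gz')
#   sent = sp.parse('This is a <b>test</b>.')
#   sent.print_table()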

def parse_xml(self,s):
"""Tokenise the XML text, remember XML positions, and then parse it.
"""

# tokenise the text
r = StringReader(s)
tokeniser = PTBTokenizer(r, False, self.wtf)
alist = tokeniser.tokenize()

# build a plain-text token list and remember tag positions
tags = {}
sent = []
for i in alist:
token = str(i)
if token.startswith('<'):
cur_size = len(sent)
tags[cur_size] = tags.get(cur_size,[])
tags[cur_size].append(token)
else:
sent.append(token)

# parse
parse = self.lp.apply(Arrays.asList(sent))

return Sentence(self.gsf, parse, tags)
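# Example (sketch): parse text with inline XML; the tags are excluded from
# parsing but stored with the Sentence at their original token positions:
#
#   sent = sp.parse_xml('This is a <b>test</b>.')
#   sent.print_table()   # <b> and </b> appear as extra rows in the table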


if __name__ == '__main__':

sp = StanfordParser(r'C:\soft\stanford\stanford-parser-2008-10-26\englishPCFG.ser.gz')

print 'Parsing XML text\n'
s = 'This is an <tag attr="term">example<!-- this is a comment --></tag>.'
print 'IN:', s
sentence = sp.parse_xml(s)
print 'OUT:'
sentence.print_table()
print '-'*80

print 'Output formats\n'
s = 'This is an example sentence.'
print 'IN:', s
sentence = sp.parse_xml(s)
print 'TABLE:'
sentence.print_table()
print '\nTREE:'
sentence.print_tree()
print '\nTT FORMAT:'
for i in stanford2tt(sentence):
print i
print '-'*80

print 'Parse probabilities\n'
s = 'I saw a man with a telescope.'
print 'IN:', s
# getKBestPCFGParses operates on the most recently parsed sentence,
# so parse it first
sp.parse(s)
for candidate_tree in sp.lp.getKBestPCFGParses(1):
print 'Probability:', math.e**candidate_tree.score()
print 'Tree:'
s = Sentence(sp.gsf, candidate_tree.object())
s.print_table()
print '-'*50
print '-'*80

"""
print
print 'Subtrees:\n'
for subtree in sentence.parse.subTrees():
print subtree
print '-'*50
print '-'*80
"""

print 'Dependencies\n'
for td in sentence.gs.allTypedDependencies():
gov = td.gov()
gov_idx = gov.index()
dep = td.dep()
dep_idx = dep.index()
rel = td.reln()
print 'Governing word:', gov.value()
print 'Its index:', gov_idx
print 'Dependent word:', dep.value()
print 'Its index:', dep_idx
print 'Relation:', rel
print '-'*50
print '-'*80

"""
# paths between every pair of content words
content = []
for i in sentence.gs.getNodes():
if i.headTagNode() != None: continue
idx = i.index()
word = i.value()
tag = i.parent().value()
if tag[0] in ['V','N','J','R']:
content.append(i)
for i in content:
for j in content:
if i == j: continue
lcn, shortest_path = sentence.least_common_node(i.index(), j.index())
print 'LCN: %s and %s: %s' % (i, j, lcn)
print 'Path:', shortest_path
print '-'*50
"""
