
Initial commit

0 parents commit d7b28cc46d6fcae23a023df0aabb773a24e39492 @vpekar committed Sep 14, 2012
Showing with 394 additions and 0 deletions.
  1. +394 −0 stanford.py
@@ -0,0 +1,394 @@
+"""A Jython interface to the Stanford parser. Includes various utilities to manipulate
+parsed sentences:
+* parsing text containing XML tags,
+* obtaining probabilities for different analyses,
+* extracting dependency relations,
+* extracting subtrees,
+* finding the shortest path between two nodes,
+* printing the parse in various formats.
+
+See the examples in the if __name__ == "__main__" block.
+
+INSTALLATION:
+
+ 1. Download the parser from http://nlp.stanford.edu/downloads/lex-parser.shtml
+ 2. Unpack into a local dir, put the path to stanford-parser.jar in the -cp arg in jython.bat
+ 3. Put the path to englishPCFG.ser.gz as parser_file arg to StanfordParser
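+
+ One way to do step 2, assuming a Windows setup like the example in
+ __main__ below (paths are illustrative; adjust them to your installation):
+
+ set CLASSPATH=C:\soft\stanford\stanford-parser-2008-10-26\stanford-parser.jar
+ jython stanford.py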
+
+USAGE:
+
+1. Produce an FDG-style parse (a table of words with tags):
+
+ parser = StanfordParser('englishPCFG.ser.gz')
+
+ To strip all XML tags before parsing:
+
+ sentence = parser.parse('This is a test')
+
+ To keep XML tags provided in the input text:
+
+ sentence = parser.parse_xml('This is a <b>test</b>.')
+
+ To print the sentence as a table (one word per line):
+
+ sentence.print_table()
+
+ To print the sentence as a parse tree:
+
+ sentence.print_tree()
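+
+ To print typed dependencies instead of a phrase-structure tree (mode
+ names as in print_tree's docstring, e.g. typedDependenciesCollapsed):
+
+ sentence.print_tree(mode='typedDependenciesCollapsed')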
+
+2. Retrieve the 5 best parses with associated probabilities for the last-parsed sentence:
+
+ parser = StanfordParser('englishPCFG.ser.gz')
+ sentence = parser.parse('This is a test')
+ for candidate_tree in parser.lp.getKBestPCFGParses(5):
+ print 'Prob:', math.e**candidate_tree.score()
+ print 'Tree:'
+ s = Sentence(parser.gsf, candidate_tree.object())
+ s.print_table()
+
+On input, the script accepts unicode, utf8, or latin1 strings.
+On output, it produces unicode.
+"""
+
+import sys, re, string, math
+
+if 'java' not in sys.platform:
+    raise Exception("The script should be run from Jython!")
+
+from java.util import Arrays
+from edu.stanford.nlp.trees import PennTreebankLanguagePack, TreePrint
+from edu.stanford.nlp.parser.lexparser import LexicalizedParser
+from edu.stanford.nlp.process import Morphology, PTBTokenizer, WordTokenFactory
+from java.io import StringReader
+
+
+def stanford2tt(sentence):
+ """Given a Sentence object, return TreeTagger-style tuples (word, tag, lemma).
+ """
+ for k in sorted(sentence.word):
+ word = sentence.word.get(k, '')
+ if word.startswith('<'):
+ tag, lemma = 'XML', word
+ else:
+ tag = sentence.tag.get(k, '')
+ lemma = sentence.lemma.get(k, '')
+ # correcting: TO -> IN
+ if word == 'to' and tag == 'TO':
+ tag = 'IN'
+ yield (word, tag, lemma)
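+
+# Illustrative use of stanford2tt (the exact tags and lemmas depend on
+# the parser model):
+#
+#   for tup in stanford2tt(sentence):
+#       print tup
+#
+# might print ('This', 'DT', 'this'), ('is', 'VBZ', 'be'), ...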
+
+
+class Sentence:
+ """An interface to the grammaticalStructure object of SP
+ """
+
+ def __init__(self, gsf, parse, xmltags={}):
+ """Create a Sentence object from parse.
+ @param gsf: a grammaticalStructureFactory object
+ @param parse: a parse of the sentence
+ @param xmltags: index of the previous text token => list of intervening xmltags
+ """
+ self.parse = parse
+ self.gs = gsf.newGrammaticalStructure(parse)
+ self.lemmer = Morphology()
+ self.xmltags = xmltags
+
+ # create indices
+ self.node = {}
+ self.word = {}
+ self.tag = {}
+ self.lemma = {}
+ self.dep = {}
+ self.rel = {}
+ self.children = {}
+
+ # insert the tags before the text, if any are present before the text
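+ # (tags are stored at fractional indices, e.g. 0.5, so that sorting
+ # the word index interleaves them between the integer-indexed tokens)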
+ if 0 in self.xmltags:
+ num_tags = len(self.xmltags[0])
+ for idx in xrange(num_tags):
+ tag_idx = (idx+1)/float(num_tags+1)
+ self.word[tag_idx] = self.xmltags[0][idx].decode('latin1')
+
+ # iterate over text tokens
+ for i in self.gs.getNodes():
+ if i.headTagNode() != None: continue
+ idx = i.index()
+ word = i.value().decode('latin1')
+
+ # correction: -LRB-/-RRB- are Penn Treebank tokens for round brackets
+ if word == '-LRB-': word = u'('
+ elif word == '-RRB-': word = u')'
+
+ parent = i.parent()
+ tag = u'Z' if parent == None else parent.value().decode('latin1')
+ lemma = self.lemmer.lemmatize(self.lemmer.stem(word, tag)).lemma().decode('latin1')
+ p = self.gs.getGovernor(i)
+ if word in string.punctuation or p == None:
+ p_idx = 0
+ rel = 'punct'
+ else:
+ p_idx = p.index()
+ rel = str(self.gs.getGrammaticalRelation(p_idx, idx))
+
+ self.node[idx] = i
+ self.word[idx] = word
+ self.tag[idx] = tag
+ self.lemma[idx] = lemma
+ self.rel[idx] = rel
+ self.dep[idx] = p_idx
+ self.children[p_idx] = self.children.get(p_idx,[])
+ self.children[p_idx].append( idx )
+
+ # insert xml tags, if any
+ if idx in self.xmltags:
+ num_tags = len(self.xmltags[idx])
+ for t_num in xrange(num_tags):
+ tag_idx = (t_num+1)/float(num_tags+1)
+ self.word[idx+tag_idx] = self.xmltags[idx][t_num].decode('latin1')
+
+ def get_head(self, node):
+ """Return a tuple with the head of the dependency for a node and the
+ relation label.
+ """
+ idx = node.index()
+ dep_idx = self.dep.get(idx)
+ if not dep_idx: return None, None
+ return (self.node.get(dep_idx), self.rel.get(idx))
+
+ def get_children(self,node):
+ """Yield tuples each with a child of the dependency
+ and the relation label
+ """
+ for i in self.children.get(node.index(), []):
+ yield (self.node[i], self.rel[i])
+
+ def descendants(self,idx):
+ """Return all descendants of a node, including the node itself
+ """
+ result = [idx]
+ def traverse(n):
+     for i in self.children.get(n, []):
+         result.append(i)
+         traverse(i)
+ traverse(idx)
+ return result
+
+ def prune(self,idx):
+ """Given an index, remove all the words dependent on the word with that index,
+ including the word itself.
+ """
+ for i in self.descendants(idx):
+ self.delete_node(i)
+
+ def delete_node(self,i):
+ del self.node[i], self.word[i], self.tag[i], self.lemma[i], self.rel[i], self.dep[i]
+ if i in self.children:
+ del self.children[i]
+
+ def get_plain_text(self):
+ """Output plain-text sentence.
+ """
+ text = ' '.join([self.word[i] for i in sorted(self.node)])
+ # remove spaces in front of commas, etc
+ for i in ',.:;!?':
+ text = text.replace(' ' + i, i)
+ return text
+
+ def least_common_node(self,n,m):
+ """Return a node that is least common for two given nodes,
+ as well as the shortest path between the two nodes
+ @param n: index of node 1
+ @param m: index of node 2
+ """
+
+ common_node = None
+ shortest_path = []
+ path1 = self.path2root(m)
+ path2 = self.path2root(n)
+
+ for i in path1:
+ if common_node != None: break
+ for j in path2:
+ if i == j:
+ common_node = i
+ break
+
+ if common_node != None:
+ for i in path1:
+ shortest_path.append(i)
+ if i == common_node: break
+ for i in path2:
+ if i == common_node: break
+ shortest_path.append(i)
+
+ return common_node, shortest_path
+
+ def path2root(self, i):
+ """The path to the root from a node.
+ @param i: the index of the node
+ """
+ path = [i]
+ if i != 0:
+ while 1:
+ p = self.dep.get(i)
+ if not p: break
+ path.append(p)
+ i = p
+ return path
+
+ def print_table(self):
+ """Print the parse as a table, FDG-style, to STDOUT
+ """
+ for i in sorted(self.word):
+ line = '\t'.join([
+     self.word.get(i, ''),
+     self.lemma.get(i, ''),
+     self.tag.get(i, ''),
+     self.rel.get(i, ''),
+     unicode(self.dep.get(i, '')),  # the head index is an int
+ ])
+ print line.encode('utf8')
+
+ def print_tree(self, mode='penn'):
+ """Prints the parse.
+ @param mode: penn/typedDependenciesCollapsed/etc
+ """
+ tp = TreePrint(mode)
+ tp.printTree(self.parse)
+
+class StanfordParser:
+
+ TAG = re.compile(r'<[^>]+>')
+
+ def __init__(self, parser_file,
+ parser_options=['-maxLength', '80', '-retainTmpSubcategories']):
+ """@param parser_file: path to the serialised parser model (e.g. englishPCFG.ser.gz)
+ @param parser_options: options
+ """
+ self.lp = LexicalizedParser(parser_file)
+ self.lp.setOptionFlags(parser_options)
+ tlp = PennTreebankLanguagePack()
+ self.gsf = tlp.grammaticalStructureFactory()
+ self.wtf = WordTokenFactory()
+
+ def parse(self, s):
+ """Strips XML tags first.
+ @param s: the sentence to be parsed, as a string
+ @return: a Sentence object
+ """
+ # strip xml tags
+ s = self.TAG.sub('', s)
+
+ parse = self.lp.apply(s)
+ return Sentence(self.gsf, parse)
+
+ def parse_xml(self,s):
+ """Tokenise the XML text, remember XML positions, and then parse it.
+ """
+
+ # tokenise the text
+ r = StringReader(s)
+ tokeniser = PTBTokenizer(r, False, self.wtf)
+ alist = tokeniser.tokenize()
+
+ # build a plain-text token list and remember tag positions
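+ # (tags maps the number of text tokens seen so far, 0 meaning before
+ # the first word, to the list of tag strings found at that position)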
+ tags = {}
+ sent = []
+ for i in alist:
+ token = str(i)
+ if token.startswith('<'):
+ cur_size = len(sent)
+ tags[cur_size] = tags.get(cur_size,[])
+ tags[cur_size].append(token)
+ else:
+ sent.append(token)
+
+ # parse
+ parse = self.lp.apply(Arrays.asList(sent))
+
+ return Sentence(self.gsf, parse, tags)
+
+
+if __name__ == '__main__':
+
+ sp = StanfordParser(r'C:\soft\stanford\stanford-parser-2008-10-26\englishPCFG.ser.gz')
+
+ print 'Parsing XML text\n'
+ s = 'This is an <tag attr="term">example<!-- this is a comment --></tag>.'
+ print 'IN:', s
+ sentence = sp.parse_xml(s)
+ print 'OUT:'
+ sentence.print_table()
+ print '-'*80
+
+ print 'Output formats\n'
+ s = 'This is an example sentence.'
+ print 'IN:', s
+ sentence = sp.parse_xml(s)
+ print 'TABLE:'
+ sentence.print_table()
+ print '\nTREE:'
+ sentence.print_tree()
+ print '\nTT FORMAT:'
+ for i in stanford2tt(sentence):
+ print i
+ print '-'*80
+
+ print 'Parse probabilities\n'
+ s = 'I saw a man with a telescope.'
+ print 'IN:', s
+ # getKBestPCFGParses works on the last-parsed sentence, so parse it first
+ sentence = sp.parse(s)
+ for candidate_tree in sp.lp.getKBestPCFGParses(1):
+     print 'Probability:', math.e**candidate_tree.score()
+     print 'Tree:'
+     cand = Sentence(sp.gsf, candidate_tree.object())
+     cand.print_table()
+     print '-'*50
+ print '-'*80
+
+ """
+ print
+ print 'Subtrees:\n'
+ for subtree in sentence.parse.subTrees():
+ print subtree
+ print '-'*50
+ print '-'*80
+ """
+
+ print 'Dependencies\n'
+ for td in sentence.gs.allTypedDependencies():
+ gov = td.gov()
+ gov_idx = gov.index()
+ dep = td.dep()
+ dep_idx = dep.index()
+ rel = td.reln()
+ print 'Governing word:', gov.value()
+ print 'Its index:', gov_idx
+ print 'Dependent word:', dep.value()
+ print 'Its index:', dep_idx
+ print 'Relation:', rel
+ print '-'*50
+ print '-'*80
+
+ """
+ # paths between every pair of content words
+ content = []
+ for i in sentence.gs.getNodes():
+ if i.headTagNode() != None: continue
+ idx = i.index()
+ word = i.value()
+ tag = i.parent().value()
+ if tag[0] in ['V','N','J','R']:
+ content.append(i)
+ for i in content:
+ for j in content:
+ if i == j: continue
+ lcn, shortest_path = sentence.least_common_node(i.index(), j.index())
+ print 'LCN: %s and %s: %s' % (i, j, lcn)
+ print 'Path:', shortest_path
+ print '-'*50
+ """
+
