1 """A Jython interface to the Stanford parser (v.2.0.3). Includes various utilities
2 to manipulate parsed sentences:
3 * parse text containing XML tags,
4 * obtain probabilities for different analyses,
5 * extract dependency relations,
6 * extract subtrees,
7 * find the shortest path between two nodes,
d7b28cc @vpekar Initial commit
authored Sep 14, 2012
8 * print the parse in various formats.
9
10 See examples after the if __name__ == "__main__" hooks.

INSTALLATION:

1. Download the parser from http://nlp.stanford.edu/downloads/lex-parser.shtml
2. Unpack into a local directory and add the path to stanford-parser.jar to the
   classpath for Jython (one runtime alternative is sketched below).
3. Pass the full path to englishPCFG.ser.gz as the parser_file argument to
   StanfordParser.
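
For step 2, one option (a sketch; the jar path is illustrative) is to append
the jar to sys.path at runtime, before the Stanford imports, since Jython can
import Java classes from jars on sys.path:

import sys
sys.path.append('/path/to/stanford-parser.jar')  # adjust to your install dir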

USAGE:

Initialize a parser:

parser = StanfordParser('englishPCFG.ser.gz')

To keep XML tags provided in the input text:

sentence = parser.parse_xml('This is a <tag>test</tag>.')

To strip all XML before parsing:

sentence = parser.parse('This is a <b>test</b>.')

To print the sentence as a table (one word per line):

sentence.print_table()

To print the sentence as a parse tree:

sentence.print_tree()
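
To get TreeTagger-style (word, tag, lemma) tuples, using the stanford2tt
helper defined below:

for word, tag, lemma in stanford2tt(sentence):
    print word, tag, lemma

To get the sentence back as plain text, with XML tags omitted:

text = sentence.get_plain_text()

To get the dependency head of a node and the relation label (the node
index 1 here is illustrative):

head, rel = sentence.get_head(sentence.node[1])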

On input, the script accepts unicode, utf8, or latin1.

On output, the script produces unicode.
"""

__author__ = "Viktor Pekar <v.pekar@gmail.com>"
__version__ = "0.1"

import sys, re, os, string, math

if 'java' not in sys.platform:
    raise Exception("The script should be run from Jython!")

from java.util import *
from edu.stanford.nlp.trees import PennTreebankLanguagePack, TreePrint
from edu.stanford.nlp.parser.lexparser import LexicalizedParser
from edu.stanford.nlp.process import Morphology, PTBTokenizer, WordTokenFactory
from edu.stanford.nlp.parser.lexparser import Options
from edu.stanford.nlp.ling import Sentence, WordTag
from java.io import StringReader

def stanford2tt(sentence):
    """Given a PySentence object, yield TreeTagger-style tuples (word, tag, lemma).
    """
    for k in sorted(sentence.word):
        word = sentence.word.get(k, '')
        if word.startswith('<'):
            # XML tags get the pseudo-tag 'XML' and the tag itself as the lemma
            tag, lemma = 'XML', word
        else:
            tag = sentence.tag.get(k, '')
            lemma = sentence.lemma.get(k, '')
        # correction: "to" tagged TO is relabelled IN
        if word == 'to' and tag == 'TO':
            tag = 'IN'
        yield (word, tag, lemma)


class PySentence:
    """An interface to the GrammaticalStructure object of the Stanford Parser.
    """

    def __init__(self, parser, parse, xmltags={}):
        """Create a PySentence object from a parse.
        @param parser: a StanfordParser object
        @param parse: a parse of the sentence
        @param xmltags: index of the previous text token => list of intervening xmltags
        """
        self.parse = parse
        self.gs = parser.gsf.newGrammaticalStructure(parse)
        self.lemmer = parser.lemmer
        self.xmltags = xmltags

        # create indices
        self.node = {}
        self.word = {}
        self.tag = {}
        self.lemma = {}
        self.dep = {}
        self.rel = {}
        self.children = {}

        # insert any tags that precede the text; they get fractional indices
        # between 0 and 1 so they sort before the first token
        if 0 in self.xmltags:
            num_tags = len(self.xmltags[0])
            for idx in xrange(num_tags):
                tag_idx = (idx + 1) / float(num_tags + 1)
                self.word[tag_idx] = self.xmltags[0][idx].decode('latin1')

        # iterate over text tokens
        for i in self.gs.getNodes():
            if i.headTagNode() is not None: continue
            idx = i.index()
            word = i.value().decode('latin1')

            # correction: restore bracket characters from PTB escapes
            if word == '-LRB-': word = u'('
            elif word == '-RRB-': word = u')'

            parent = i.parent()
            tag = u'Z' if parent is None else parent.value().decode('latin1')
            lemma = self.lemmer.lemmatize(WordTag(word, tag)).lemma().decode('latin1')
            p = self.gs.getGovernor(i)
            if word in string.punctuation or p is None:
                p_idx = 0
                rel = 'punct'
            else:
                p_idx = p.index()
                rel = str(self.gs.getGrammaticalRelation(p_idx, idx))

            self.node[idx] = i
            self.word[idx] = word
            self.tag[idx] = tag
            self.lemma[idx] = lemma
            self.rel[idx] = rel
            self.dep[idx] = p_idx
            self.children.setdefault(p_idx, []).append(idx)

            # insert any xml tags that follow this token, again with
            # fractional indices so they sort into place
            if idx in self.xmltags:
                num_tags = len(self.xmltags[idx])
                for t_num in xrange(num_tags):
                    tag_idx = (t_num + 1) / float(num_tags + 1)
                    self.word[idx + tag_idx] = self.xmltags[idx][t_num].decode('latin1')

    def get_head(self, node):
        """Return a tuple with the head of the dependency for a node and the
        relation label.
        """
        idx = node.index()
        dep_idx = self.dep.get(idx)
        if not dep_idx:
            return None, None
        return self.node.get(dep_idx), self.rel.get(idx)

    def get_children(self, node):
        """Yield tuples, each with a child of the dependency
        and the relation label.
        """
        for i in self.children.get(node.index(), []):
            yield self.node[i], self.rel[i]

    def descendants(self, idx):
        """Return all descendants of a node, including the node itself.
        """
        result = [idx]
        def traverse(i):
            for child in self.children.get(i, []):
                result.append(child)
                traverse(child)
        traverse(idx)
        return result

    def prune(self, idx):
        """Given an index, remove all the words dependent on the word with
        that index, including the word itself.
        """
        for i in self.descendants(idx):
            self.delete_node(i)

    def delete_node(self, i):
        del self.node[i], self.word[i], self.tag[i], self.lemma[i], \
            self.rel[i], self.dep[i]
        if i in self.children:
            del self.children[i]

    def get_plain_text(self):
        """Output the plain-text sentence, without any XML tags.
        """
        text = ' '.join([self.word[i] for i in sorted(self.node)])
        # remove spaces in front of commas, etc.
        for i in ',.:;!?':
            text = text.replace(' ' + i, i)
        return text

    def get_least_common_node(self, n, m):
        """Return the least common ancestor of two given nodes, as well as
        the shortest path between the two nodes.
        @param n: index of node 1
        @param m: index of node 2
        """
        common_node = None
        shortest_path = []
        path1 = self.path2root(m)
        path2 = self.path2root(n)

        for i in path1:
            if common_node is not None:
                break
            for j in path2:
                if i == j:
                    common_node = i
                    break

        if common_node is not None:
            for i in path1:
                shortest_path.append(i)
                if i == common_node:
                    break
            for i in path2:
                if i == common_node:
                    break
                shortest_path.append(i)

        return common_node, shortest_path

    def path2root(self, i):
        """The path to the root from a node.
        @param i: the index of the node
        """
        path = [i]
        if i != 0:
            while True:
                p = self.dep.get(i)
                if not p:
                    break
                path.append(p)
                i = p
        return path

    def print_table(self):
        """Print the parse as a table, FDG-style, to STDOUT.
        """
        def get_index(s):
            # fractional indices mark inserted XML tags
            return '-' if '.' in s else s
        for i in sorted(self.word):
            line = '\t'.join([
                get_index(unicode(i)),
                self.word.get(i, ''),
                self.lemma.get(i, ''),
                self.tag.get(i, ''),
                self.rel.get(i, ''),
                unicode(self.dep.get(i, '')),
            ])
            print line.encode('latin1')

    def print_tree(self, mode='penn'):
        """Print the parse.
        @param mode: penn/typedDependenciesCollapsed/etc.
        """
        tp = TreePrint(mode)
        tp.printTree(self.parse)


class StanfordParser:

    TAG = re.compile(r'<[^>]+>')

    def __init__(self, parser_file,
                 parser_options=['-maxLength', '80', '-retainTmpSubcategories']):
        """@param parser_file: path to the serialised parser model
            (e.g. englishPCFG.ser.gz)
        @param parser_options: options passed to the parser
        """
        assert os.path.exists(parser_file)
        options = Options()
        options.setOptions(parser_options)
        self.lp = LexicalizedParser.getParserFromFile(parser_file, options)
        tlp = PennTreebankLanguagePack()
        self.gsf = tlp.grammaticalStructureFactory()
        self.lemmer = Morphology()
        self.word_token_factory = WordTokenFactory()
        self.parser_query = None

    def get_most_probable_parses(self, text, kbest=2):
        """Yield the kbest parses of a sentence along with their probabilities.
        """
        if not self.parser_query:
            self.parser_query = self.lp.parserQuery()
        response = self.parser_query.parse(self.tokenize(text))
        if not response:
            raise Exception("The sentence was not accepted by the parser: %s" % text)
        for candidate_tree in self.parser_query.getKBestPCFGParses(kbest):
            s = PySentence(self, candidate_tree.object())
            # the score is a log-probability
            prob = math.exp(candidate_tree.score())
            yield s, prob

    def parse(self, s):
        """Strip any XML tags first, then parse the sentence.
        @param s: the sentence to be parsed, as a string
        @return: a PySentence object
        """
        # strip xml tags
        s = self.TAG.sub('', s)
        parse = self.lp.apply(s)
        return PySentence(self, parse)

    def tokenize(self, text):
        reader = StringReader(text)
        tokeniser = PTBTokenizer(reader, self.word_token_factory, None)
        tokens = tokeniser.tokenize()
        return tokens

    def parse_xml(self, text):
        """Tokenise the XML text, remember XML positions, and then parse it.
        """
        # build a plain-text token list and remember tag positions
        xml_tags = {}
        sent = []
        for i in self.tokenize(text):
            token = unicode(i)
            if token.startswith('<'):
                # index the tag by the number of text tokens seen so far
                cur_size = len(sent)
                xml_tags.setdefault(cur_size, []).append(token)
            else:
                sent.append(token)

        # parse
        parse = self.lp.apply(Sentence.toWordList(sent))

        return PySentence(self, parse, xml_tags)


def parse_xml_example(sp):
    print 'Parsing XML text'
    s = 'The quick brown <tag attr="term">fox<!-- this is a comment --></tag> jumped over the lazy dog.'
    print 'IN:', s
    sentence = sp.parse_xml(s)
    print 'OUT:'
    sentence.print_table()
    print '-' * 80

def parse_probabilities_example(sp):
    print 'Parse probabilities\n'
    text = 'I saw a man with a telescope.'
    print 'IN:', text
    for s, prob in sp.get_most_probable_parses(text, kbest=2):
        print 'Probability:', prob
        print 'Tree:'
        s.print_table()
        print '-' * 50
    print '-' * 80

def subtrees_example(sp):
    print 'Subtrees:'
    text = 'I saw a man with a telescope.'
    sentence = sp.parse(text)
    for subtree in sentence.parse.subTrees():
        print subtree
        print '-' * 50
    print '-' * 80

def get_dependencies_example(sp):
    print 'Dependencies:'
    text = 'I saw a man with a telescope.'
    sentence = sp.parse(text)
    for td in sentence.gs.allTypedDependencies():
        gov = td.gov()
        gov_idx = gov.index()
        dep = td.dep()
        dep_idx = dep.index()
        rel = td.reln()
        print 'Head: %s (%d); dependent: %s (%d); relation: %s' % (
            gov.value(), gov_idx, dep.value(), dep_idx, rel)
    print '-' * 80

def get_common_path_example(sp):
    print 'Common path:'
    text = 'The quick brown fox jumped over a lazy dog.'
    print 'Text:', text
    i = 4
    j = 9
    sentence = sp.parse(text)
    lcn, shortest_path = sentence.get_least_common_node(i, j)
    print 'Least common node for "%s" and "%s": "%s"' % (
        sentence.word[i], sentence.word[j], sentence.word[lcn])
    path = ' '.join([sentence.word[x] for x in sorted(shortest_path)])
    print 'Path: %s' % path
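
# A short extra example demonstrating PySentence.prune, which removes a word
# and all the words that depend on it. The index passed to prune below is
# illustrative: it depends on the parse the model returns, so inspect the
# table output first to pick the right index. Call it like the other example
# functions, e.g. prune_example(sp).
def prune_example(sp):
    print 'Pruning:'
    text = 'I saw a man with a telescope.'
    sentence = sp.parse(text)
    print 'Before:'
    sentence.print_table()
    # remove the word with index 5 and everything it governs
    sentence.prune(5)
    print 'After:'
    sentence.print_table()
    print '-' * 80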

if __name__ == '__main__':

    # full path to the parser model file, e.g. englishPCFG.ser.gz
    parser_file = sys.argv[1]
    sp = StanfordParser(parser_file)

    parse_xml_example(sp)
    parse_probabilities_example(sp)
    subtrees_example(sp)
    get_dependencies_example(sp)
    get_common_path_example(sp)