1 """A Jython interface to the Stanford parser. Includes various utilities to manipulate
2 parsed sentences:
3 * parsing text containing XML tags,
4 * obtaining probabilities for different analyses,
5 * extracting dependency relations,
6 * extracting subtrees,
7 * finding the shortest path between two nodes,
8 * print the parse in various formats.
9
10 See examples after the if __name__ == "__main__" hooks.
11
12 INSTALLATION:
13
14 1. Download the parser from http://nlp.stanford.edu/downloads/lex-parser.shtml
15 2. Unpack into a local dir, put the path to stanford-parser.jar in the -cp arg in jython.bat
16 3. Put the path to englishPCFG.ser.gz as parser_file arg to StanfordParser
17
18 USAGE:
19
20 1. Produce an FDG-style of a parse (a table as a list of words with tags):
21
22 parser = StanfordParser()
23
24 To keep XML tags provided in the input text:
25
26 sentence = parser.parse('This is a test')
27
28 To strip all XML before parsing:
29
30 sentence = parser.parse_xml('This is a <b>test</b>.')
31
32 To print the sentence as a table (one word per line):
33
34 sentence.print_table()
35
36 To print the sentence as a parse tree:
37
38 sentence.print_tree()
39
40 2. Retrieve the 5 best parses with associated probabilities for the last-parsed sentence:
41
42 parser = StanfordParser()
43 sentence = parser.parse('This is a test')
44 for candidate_tree in parser.lp.getKBestPCFGParses(5):
45 print 'Prob:', math.e**candidate_tree.score()
46 print 'Tree:'
47 s = Sentence(parser.gsf, candidate_tree.object())
48 s.print_table()
49
50 On input, the script accepts unicode or utf8 or latin1.
51 On output, the script produces unicode.
52 """

import sys, re, string, math

# the script relies on Java classes, so it must run under Jython
if 'java' not in sys.platform:
    raise Exception("The script should be run from Jython!")

from java.util import *
from edu.stanford.nlp.trees import PennTreebankLanguagePack, TreePrint
from edu.stanford.nlp.parser.lexparser import LexicalizedParser
from edu.stanford.nlp.process import Morphology, PTBTokenizer, WordTokenFactory
from java.io import StringReader


def stanford2tt(sentence):
    """Given a Sentence object, return TreeTagger-style tuples (word, tag, lemma).
    """
    for k in sorted(sentence.word):
        word = sentence.word.get(k, '')
        if word.startswith('<'):
            tag, lemma = 'XML', word
        else:
            tag = sentence.tag.get(k, '')
            lemma = sentence.lemma.get(k, '')
        # correcting: TO -> IN
        if word == 'to' and tag == 'TO':
            tag = 'IN'
        yield (word, tag, lemma)


class Sentence:
    """An interface to the grammaticalStructure object of SP
    """

    def __init__(self, gsf, parse, xmltags={}):
        """Create a Sentence object from parse.
        @param gsf: a grammaticalStructureFactory object
        @param parse: a parse of the sentence
        @param xmltags: index of the previous text token => list of intervening xmltags
        """
        self.parse = parse
        self.gs = gsf.newGrammaticalStructure(parse)
        self.lemmer = Morphology()
        self.xmltags = xmltags

        # create indices
        self.node = {}
        self.word = {}
        self.tag = {}
        self.lemma = {}
        self.dep = {}
        self.rel = {}
        self.children = {}

        # insert the tags that appear before the text, if any
        if 0 in self.xmltags:
            num_tags = len(self.xmltags[0])
            for idx in xrange(num_tags):
                tag_idx = (idx+1)/float(num_tags+1)
                self.word[tag_idx] = self.xmltags[0][idx].decode('latin1')

        # iterate over text tokens
        for i in self.gs.getNodes():
            if i.headTagNode() != None: continue
            idx = i.index()
            word = i.value().decode('latin1')
            # correction: map Penn Treebank bracket tokens back to characters
            if word == '-LRB-': word = u'('
            elif word == '-RRB-': word = u')'

            parent = i.parent()
            tag = u'Z' if parent == None else parent.value().decode('latin1')
            lemma = self.lemmer.lemmatize(self.lemmer.stem(word, tag)).lemma().decode('latin1')
            p = self.gs.getGovernor(i)
            if word in string.punctuation or p == None:
                p_idx = 0
                rel = 'punct'
            else:
                p_idx = p.index()
                rel = str(self.gs.getGrammaticalRelation(p_idx, idx))

            self.node[idx] = i
            self.word[idx] = word
            self.tag[idx] = tag
            self.lemma[idx] = lemma
            self.rel[idx] = rel
            self.dep[idx] = p_idx
            self.children[p_idx] = self.children.get(p_idx, [])
            self.children[p_idx].append(idx)

            # insert xml tags, if any
            if idx in self.xmltags:
                num_tags = len(self.xmltags[idx])
                for t_num in xrange(num_tags):
                    tag_idx = (t_num+1)/float(num_tags+1)
                    self.word[idx+tag_idx] = self.xmltags[idx][t_num].decode('latin1')

    def get_head(self, node):
        """Return a tuple with the head of the dependency for a node and the
        relation label.
        """
        idx = node.index()
        dep_idx = self.dep.get(idx)
        if not dep_idx: return None, None
        return (self.node.get(dep_idx), self.rel.get(idx))

    def get_children(self, node):
        """Yield tuples each with a child of the dependency
        and the relation label.
        """
        for i in self.children.get(node.index(), []):
            yield (self.node[i], self.rel[i])

    def descendants(self, idx):
        """Return all descendants of a node, including the node itself.
        """
        # depth-first traversal over the children index
        result = [idx]
        for i in self.children.get(idx, []):
            result.extend(self.descendants(i))
        return result

    def prune(self, idx):
        """Given an index, remove all the words dependent on the word with that index,
        including the word itself.
        """
        for i in self.descendants(idx):
            self.delete_node(i)

    def delete_node(self, i):
        del self.node[i], self.word[i], self.tag[i], self.lemma[i], self.rel[i], self.dep[i]
        if i in self.children:
            del self.children[i]

    def get_plain_text(self):
        """Output the sentence as plain text.
        """
        text = ' '.join([self.word[i] for i in sorted(self.node)])
        # remove spaces in front of commas, etc.
        for i in ',.:;!?':
            text = text.replace(' ' + i, i)
        return text

    def least_common_node(self, n, m):
        """Return the least common ancestor of two given nodes, as well as
        the shortest path between the two nodes.
        @param n: index of node 1
        @param m: index of node 2
        """
        common_node = None
        shortest_path = []
        path1 = self.path2root(m)
        path2 = self.path2root(n)

        for i in path1:
            if common_node != None: break
            for j in path2:
                if i == j:
                    common_node = i
                    break

        if common_node != None:
            for i in path1:
                shortest_path.append(i)
                if i == common_node: break
            for i in path2:
                if i == common_node: break
                shortest_path.append(i)

        return common_node, shortest_path

    def path2root(self, i):
        """Return the path from a node to the root.
        @param i: the index of the node
        """
        path = [i]
        if i != 0:
            while True:
                p = self.dep.get(i)
                if not p: break
                path.append(p)
                i = p
        return path

    def print_table(self):
        """Print the parse as a table, FDG-style, to STDOUT.
        """
        for i in sorted(self.word):
            line = u'\t'.join([
                self.word.get(i, u''),
                self.lemma.get(i, u''),
                self.tag.get(i, u''),
                self.rel.get(i, u''),
                unicode(self.dep.get(i, u'')),
            ])
            print line.encode('utf8')

    def print_tree(self, mode='penn'):
        """Print the parse.
        @param mode: penn/typedDependenciesCollapsed/etc
        """
        tp = TreePrint(mode)
        tp.printTree(self.parse)


class StanfordParser:

    TAG = re.compile(r'<[^>]+>')

    def __init__(self, parser_file,
                 parser_options=['-maxLength', '80', '-retainTmpSubcategories']):
        """@param parser_file: path to the serialised parser model (e.g. englishPCFG.ser.gz)
        @param parser_options: options
        """
        self.lp = LexicalizedParser(parser_file)
        self.lp.setOptionFlags(parser_options)
        tlp = PennTreebankLanguagePack()
        self.gsf = tlp.grammaticalStructureFactory()
        self.wtf = WordTokenFactory()

    def parse(self, s):
        """Parse a sentence, stripping any XML tags first.
        @param s: the sentence to be parsed, as a string
        @return: a Sentence object
        """
        # strip xml tags
        s = self.TAG.sub('', s)
        parse = self.lp.apply(s)
        return Sentence(self.gsf, parse)

    def parse_xml(self, s):
        """Tokenise the XML text, remember the XML positions, and then parse it.
        """
        # tokenise the text
        r = StringReader(s)
        tokeniser = PTBTokenizer(r, False, self.wtf)
        alist = tokeniser.tokenize()

        # build a plain-text token list and remember tag positions
        tags = {}
        sent = []
        for i in alist:
            token = str(i)
            if token.startswith('<'):
                cur_size = len(sent)
                tags[cur_size] = tags.get(cur_size, [])
                tags[cur_size].append(token)
            else:
                sent.append(token)

        # parse
        parse = self.lp.apply(Arrays.asList(sent))

        return Sentence(self.gsf, parse, tags)


if __name__ == '__main__':

    sp = StanfordParser(r'C:\soft\stanford\stanford-parser-2008-10-26\englishPCFG.ser.gz')

    print 'Parsing XML text\n'
    s = 'This is an <tag attr="term">example<!-- this is a comment --></tag>.'
    print 'IN:', s
    sentence = sp.parse_xml(s)
    print 'OUT:'
    sentence.print_table()
    print '-'*80

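    # Not in the original demo: a minimal sketch of get_plain_text(), which
    # rebuilds the sentence string from the parsed tokens (the XML tags are
    # skipped, as they are not part of the node index).
    print 'PLAIN TEXT:', sentence.get_plain_text().encode('utf8')
    print '-'*80
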
    print 'Output formats\n'
    s = 'This is an example sentence.'
    print 'IN:', s
    sentence = sp.parse_xml(s)
    print 'TABLE:'
    sentence.print_table()
    print '\nTREE:'
    sentence.print_tree()
    print '\nTT FORMAT:'
    for i in stanford2tt(sentence):
        print i
    print '-'*80

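    # Not in the original demo: a minimal sketch of the dependency accessors
    # get_head() and get_children(): for each word, print its head word
    # (ROOT when the head index is 0) and the relation label.
    print 'DEPENDENCY ACCESSORS:'
    for idx in sorted(sentence.node):
        head, rel = sentence.get_head(sentence.node[idx])
        head_word = head.value() if head != None else 'ROOT'
        print '%s -> %s (%s)' % (sentence.word[idx].encode('utf8'), head_word, rel)
    print '-'*80
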
    print 'Parse probabilities\n'
    s = 'I saw a man with a telescope.'
    print 'IN:', s
    sentence = sp.parse(s)
    for candidate_tree in sp.lp.getKBestPCFGParses(1):
        print 'Probability:', math.e**candidate_tree.score()
        print 'Tree:'
        s = Sentence(sp.gsf, candidate_tree.object())
        s.print_table()
        print '-'*50
    print '-'*80

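    # Not in the original demo: a minimal sketch of least_common_node().
    # The token indices 2 ('saw') and 7 ('telescope') are hypothetical and
    # assume the tokenisation of the sentence parsed above.
    lcn, path = sentence.least_common_node(2, 7)
    print 'LCN of nodes 2 and 7:', sentence.word.get(lcn)
    print 'Shortest path:', path
    print '-'*80
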
353 """
354 print
355 print 'Subtrees:\n'
356 for subtree in sentence.parse.subTrees():
357 print subtree
358 print '-'*50
359 print '-'*80
360 """
361
    print 'Dependencies\n'
    for td in sentence.gs.allTypedDependencies():
        gov = td.gov()
        gov_idx = gov.index()
        dep = td.dep()
        dep_idx = dep.index()
        rel = td.reln()
        print 'Governing word:', gov.value()
        print 'Its index:', gov_idx
        print 'Dependency word:', dep.value()
        print 'Its index:', dep_idx
        print 'Relation:', rel
        print '-'*50
    print '-'*80

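    # Not in the original demo: a minimal sketch of descendants() and prune().
    # The index 4 ('man') is hypothetical and assumes the tokenisation of the
    # sentence parsed above; prune() removes the word and its whole subtree.
    print 'Descendants of node 4:', sentence.descendants(4)
    sentence.prune(4)
    print 'After pruning:', sentence.get_plain_text().encode('utf8')
    print '-'*80
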
376 """
377 # paths between every pair of content words
378 content = []
379 for i in sentence.gs.getNodes():
380 if i.headTagNode() != None: continue
381 idx = i.index()
382 word = i.value()
383 tag = i.parent().value()
384 if tag[0] in ['V','N','J','R']:
385 content.append(i)
386 for i in content:
387 for j in content:
388 if i == j: continue
389 lcn, shortest_path = sentence.least_common_node(i.index(), j.index())
390 print 'LCN: %s and %s: %s' % (i, j, lcn)
391 print 'Path:', shortest_path
392 print '-'*50
393 """
394