1 """A Jython interface to the Stanford parser. Includes various utilities to manipulate
2 parsed sentences:
3 * parsing text containing XML tags,
4 * obtaining probabilities for different analyses,
5 * extracting dependency relations,
6 * extracting subtrees,
7 * finding the shortest path between two nodes,
8 * print the parse in various formats.
9
10 See examples after the if __name__ == "__main__" hooks.
11
INSTALLATION:

1. Download the parser from http://nlp.stanford.edu/downloads/lex-parser.shtml
2. Unpack into a local directory and add the path to stanford-parser.jar to the -cp arg in jython.bat
3. Pass the path to englishPCFG.ser.gz as the parser_file arg to StanfordParser

USAGE:

Initialize a parser, passing the path to the serialised model:

parser = StanfordParser('englishPCFG.ser.gz')

To parse plain text, stripping any XML tags first:

sentence = parser.parse('This is a test')

To keep XML tags provided in the input text:

sentence = parser.parse_xml('This is a <b>test</b>.')

To print the sentence as a table (one word per line):

sentence.print_table()

To print the sentence as a parse tree:

sentence.print_tree()

On input, the script accepts unicode, utf8 or latin1.

On output, the script produces unicode.
"""

__author__ = "Viktor Pekar <v.pekar@gmail.com>"
__version__ = "0.1"

import sys, re, string, math

if 'java' not in sys.platform:
    raise Exception("The script should be run from Jython!")

from java.util import Arrays
from edu.stanford.nlp.trees import PennTreebankLanguagePack, TreePrint
from edu.stanford.nlp.parser.lexparser import LexicalizedParser
from edu.stanford.nlp.process import Morphology, PTBTokenizer, WordTokenFactory
from java.io import StringReader


def stanford2tt(sentence):
    """Given a Sentence object, return TreeTagger-style tuples (word, tag, lemma).
    """
    for k in sorted(sentence.word):
        word = sentence.word.get(k, '')
        if word.startswith('<'):
            tag, lemma = 'XML', word
        else:
            tag = sentence.tag.get(k, '')
            lemma = sentence.lemma.get(k, '')
        # correcting: TO -> IN
        if word == 'to' and tag == 'TO':
            tag = 'IN'
        yield (word, tag, lemma)

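# A minimal usage sketch for stanford2tt, assuming an initialized
# StanfordParser named `parser` (the output shown is illustrative):
#
#   sentence = parser.parse('The dogs ran home')
#   for word, tag, lemma in stanford2tt(sentence):
#       print word, tag, lemma      # e.g. dogs NNS dog

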
class Sentence:
    """An interface to the GrammaticalStructure object of the Stanford Parser.
    """

    def __init__(self, gsf, parse, xmltags={}):
        """Create a Sentence object from a parse.
        @param gsf: a grammaticalStructureFactory object
        @param parse: a parse of the sentence
        @param xmltags: maps the index of the preceding text token to the list
            of intervening xml tags
        """
        self.parse = parse
        self.gs = gsf.newGrammaticalStructure(parse)
        self.lemmer = Morphology()
        self.xmltags = xmltags

        # create indices
        self.node = {}
        self.word = {}
        self.tag = {}
        self.lemma = {}
        self.dep = {}
        self.rel = {}
        self.children = {}

        # insert any tags that appear before the first text token; tags are
        # stored under fractional indices between the surrounding word indices
        if 0 in self.xmltags:
            num_tags = len(self.xmltags[0])
            for idx in xrange(num_tags):
                tag_idx = (idx + 1) / float(num_tags + 1)
                self.word[tag_idx] = self.xmltags[0][idx].decode('latin1')

        # iterate over text tokens
        for i in self.gs.getNodes():
            if i.headTagNode() is not None: continue
            idx = i.index()
            word = i.value().decode('latin1')

            # correction: -LRB-/-RRB- are the Penn Treebank escapes for brackets
            if word == '-LRB-': word = u'('
            elif word == '-RRB-': word = u')'

            parent = i.parent()
            tag = u'Z' if parent is None else parent.value().decode('latin1')
            lemma = self.lemmer.lemmatize(self.lemmer.stem(word, tag)).lemma().decode('latin1')
            p = self.gs.getGovernor(i)
            if word in string.punctuation or p is None:
                p_idx = 0
                rel = 'punct'
            else:
                p_idx = p.index()
                rel = str(self.gs.getGrammaticalRelation(p_idx, idx))
130
131 self.node[idx] = i
132 self.word[idx] = word
133 self.tag[idx] = tag
134 self.lemma[idx] = lemma
135 self.rel[idx] = rel
136 self.dep[idx] = p_idx
137 self.children[p_idx] = self.children.get(p_idx,[])
138 self.children[p_idx].append( idx )
139
140 # insert xml tags, if any
141 if idx in self.xmltags:
142 num_tags = len(self.xmltags[idx])
143 for t_num in xrange(num_tags):
144 tag_idx = (t_num+1)/float(num_tags+1)
145 self.word[idx+tag_idx] = self.xmltags[idx][t_num].decode('latin1')
146
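    # Illustration of the fractional-index scheme (not executed): for
    # 'This is a <b>test</b>.', parse_xml records '<b>' after the third
    # token and '</b>' after the fourth, so they are stored at word
    # indices 3.5 and 4.5, and sorted(self.word) interleaves them with
    # 'This'(1) 'is'(2) 'a'(3) 'test'(4) '.'(5).
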
    def get_head(self, node):
        """Return a tuple with the head of the dependency for a node and the
        relation label.
        """
        idx = node.index()
        dep_idx = self.dep.get(idx)
        if not dep_idx: return None, None
        return (self.node.get(dep_idx), self.rel.get(idx))

    def get_children(self, node):
        """Yield tuples each with a child of the dependency
        and the relation label.
        """
        for i in self.children.get(node.index(), []):
            yield (self.node[i], self.rel[i])

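    # A small navigation sketch (assumes `sentence` is a parsed Sentence
    # and that index 2 holds a word; names match the indices built above):
    #
    #   node = sentence.node[2]
    #   head, rel = sentence.get_head(node)
    #   for child, child_rel in sentence.get_children(node):
    #       print child.value(), child_rel
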
    def descendants(self, idx):
        """Return all descendants of a node, including the node itself.
        """
        result = [idx]
        def traverse(i):
            for child in self.children.get(i, []):
                result.append(child)
                traverse(child)
        traverse(idx)
        return result

    def prune(self, idx):
        """Given an index, remove all the words dependent on the word with
        that index, including the word itself.
        """
        for i in self.descendants(idx):
            self.delete_node(i)

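    # Sketch of pruning a subtree, with hypothetical indices for
    # 'I saw a man with a telescope.' where 'with'(5) heads the PP:
    #
    #   sentence.prune(5)                  # drops 'with a telescope'
    #   print sentence.get_plain_text()    # -> u'I saw a man.'
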
    def delete_node(self, i):
        del self.node[i], self.word[i], self.tag[i], self.lemma[i], self.rel[i], self.dep[i]
        if i in self.children:
            del self.children[i]

    def get_plain_text(self):
        """Output plain-text sentence.
        """
        text = ' '.join([self.word[i] for i in sorted(self.node)])
        # remove spaces in front of commas, etc.
        for i in ',.:;!?':
            text = text.replace(' ' + i, i)
        return text

    def least_common_node(self, n, m):
        """Return a node that is least common for two given nodes,
        as well as the shortest path between the two nodes.
        @param n: index of node 1
        @param m: index of node 2
        """

        common_node = None
        shortest_path = []
        path1 = self.path2root(m)
        path2 = self.path2root(n)

        for i in path1:
            if common_node is not None: break
            for j in path2:
                if i == j:
                    common_node = i
                    break

        if common_node is not None:
            for i in path1:
                shortest_path.append(i)
                if i == common_node: break
            for i in path2:
                if i == common_node: break
                shortest_path.append(i)

        return common_node, shortest_path

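    # Sketch, with hypothetical attachments for 'I saw a man with a
    # telescope.': 'telescope'(7) -> 'with'(5) -> 'saw'(2), and
    # 'man'(4) -> 'saw'(2). Then:
    #
    #   lcn, path = sentence.least_common_node(4, 7)
    #   # lcn == 2, path == [7, 5, 2, 4]
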
    def path2root(self, i):
        """The path to the root from a node.
        @param i: the index of the node
        """
        path = [i]
        if i != 0:
            while 1:
                p = self.dep.get(i)
                if not p: break
                path.append(p)
                i = p
        return path

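    # Under the same hypothetical attachments, path2root(7) follows the
    # dependency chain 7 -> 5 -> 2 and returns [7, 5, 2].
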
    def print_table(self):
        """Print the parse as a table, FDG-style, to STDOUT.
        """
        for i in sorted(self.word):
            line = '\t'.join([
                self.word.get(i, ''),
                self.lemma.get(i, ''),
                self.tag.get(i, ''),
                self.rel.get(i, ''),
                unicode(self.dep.get(i, '')),
                ])
            print line.encode('utf8')

    def print_tree(self, mode='penn'):
        """Prints the parse.
        @param mode: penn/typedDependenciesCollapsed/etc
        """
        tp = TreePrint(mode)
        tp.printTree(self.parse)


class StanfordParser:

    TAG = re.compile(r'<[^>]+>')

    def __init__(self, parser_file,
            parser_options=['-maxLength', '80', '-retainTmpSubcategories']):
        """@param parser_file: path to the serialised parser model
            (e.g. englishPCFG.ser.gz)
        @param parser_options: options
        """
        self.lp = LexicalizedParser(parser_file)
        self.lp.setOptionFlags(parser_options)
        tlp = PennTreebankLanguagePack()
        self.gsf = tlp.grammaticalStructureFactory()
        self.wtf = WordTokenFactory()

    def parse(self, s):
        """Strips XML tags first.
        @param s: the sentence to be parsed, as a string
        @return: a Sentence object
        """
        # strip xml tags
        s = self.TAG.sub('', s)

        parse = self.lp.apply(s)
        return Sentence(self.gsf, parse)

    def parse_xml(self, s):
        """Tokenise the XML text, remember XML positions, and then parse it.
        """

        # tokenise the text
        r = StringReader(s)
        tokeniser = PTBTokenizer(r, False, self.wtf)
        alist = tokeniser.tokenize()

        # build a plain-text token list and remember tag positions
        tags = {}
        sent = []
        for i in alist:
            token = str(i)
            if token.startswith('<'):
                cur_size = len(sent)
                tags[cur_size] = tags.get(cur_size, [])
                tags[cur_size].append(token)
            else:
                sent.append(token)

        # parse
        parse = self.lp.apply(Arrays.asList(sent))

        return Sentence(self.gsf, parse, tags)

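# A sketch of parse_xml's tag bookkeeping, assuming the tokenizer keeps
# each XML tag as a single token (as the code above expects): for
# 'This is a <b>test</b>.' the `tags` dict becomes
# {3: ['<b>'], 4: ['</b>']}, keyed by the number of text tokens seen
# before each tag.
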

if __name__ == '__main__':

    sp = StanfordParser(r'C:\soft\stanford\stanford-parser-2008-10-26\englishPCFG.ser.gz')

    print 'Parsing XML text\n'
    s = 'This is an <tag attr="term">example<!-- this is a comment --></tag>.'
    print 'IN:', s
    sentence = sp.parse_xml(s)
    print 'OUT:'
    sentence.print_table()
    print '-'*80

    print 'Output formats\n'
    s = 'This is an example sentence.'
    print 'IN:', s
    sentence = sp.parse_xml(s)
    print 'TABLE:'
    sentence.print_table()
    print '\nTREE:'
    sentence.print_tree()
    print '\nTT FORMAT:'
    for i in stanford2tt(sentence):
        print i
    print '-'*80

    print 'Parse probabilities\n'
    s = 'I saw a man with a telescope.'
    print 'IN:', s
    # parse the sentence first, so that the k-best list refers to it
    sp.parse(s)
    for candidate_tree in sp.lp.getKBestPCFGParses(1):
        print 'Probability:', math.e**candidate_tree.score()
        print 'Tree:'
        s = Sentence(sp.gsf, candidate_tree.object())
        s.print_table()
        print '-'*50
    print '-'*80


    """
    print
    print 'Subtrees:\n'
    for subtree in sentence.parse.subTrees():
        print subtree
        print '-'*50
    print '-'*80
    """

    print 'Dependencies\n'
    for td in sentence.gs.allTypedDependencies():
        gov = td.gov()
        gov_idx = gov.index()
        dep = td.dep()
        dep_idx = dep.index()
        rel = td.reln()
        print 'Governing word:', gov.value()
        print 'Its index:', gov_idx
        print 'Dependency word:', dep.value()
        print 'Its index:', dep_idx
        print 'Relation:', rel
        print '-'*50
    print '-'*80


    """
    # paths between every pair of content words
    content = []
    for i in sentence.gs.getNodes():
        if i.headTagNode() != None: continue
        idx = i.index()
        word = i.value()
        tag = i.parent().value()
        if tag[0] in ['V', 'N', 'J', 'R']:
            content.append(i)
    for i in content:
        for j in content:
            if i == j: continue
            lcn, shortest_path = sentence.least_common_node(i.index(), j.index())
            print 'LCN: %s and %s: %s' % (i, j, lcn)
            print 'Path:', shortest_path
            print '-'*50
    """