import codecs
import os

from collections import defaultdict

from docopt import docopt
from fuzzywuzzy import fuzz
from spacy.en import English
from num2words import num2words
from nltk.corpus import wordnet as wn
from guess_language import guessLanguage

nlp = English()

MAX_PRED_FOR_ARG_PAIR = 5


def main():
"""
Receives a file with propositions extracted from same event tweets,
and aligns predicates / arguments based on string matching / WordNET synset matching of two
items for the proposition.
"""
    args = docopt("""Receives a file with propositions extracted from same-event tweets,
    and aligns predicates / arguments based on string matching / WordNet synset matching
    of the proposition items.
    Usage:
        get_corefering_predicates.py <tweets_file> <out_file>

        <tweets_file> = the file containing the propositions from tweets discussing
        the same event, each line tab-separated as:
        tweet_id, sentence, predicate, lemmatized predicate, "a0", arg0, "a1", arg1[, "a2", arg2]
        <out_file> = the output file that will contain the positive instances.
    """)
tweets_file = args['<tweets_file>']
out_file = args['<out_file>']
# Load a list of pronouns
with codecs.open('pronouns.txt', 'r', 'utf-8') as f_in:
pronouns = set([line.strip() for line in f_in])
# Load the propositions file
with codecs.open(tweets_file, 'r', 'utf-8', errors='replace') as f_in:
propositions = [tuple(line.lower().strip().split('\t')) for line in f_in]
# Unite consecutive arguments, e.g. "US government"
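    # Illustrative: a template '{a0} {a1} announce {a2}' with a0='US', a1='government',
    # a2='sanctions' becomes '{a0} announce {a1}' with a0='US government', a1='sanctions'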
for i, prop in enumerate(propositions):
        if len(prop) >= 10:
            tweet_id, sent, sf_pred, lemmatized_pred, _, a0, _, a1, _, a2 = prop[:10]
if '{a0} {a1}' in lemmatized_pred:
sf_pred = sf_pred.replace('{a0} {a1}', '{a0}').replace('{a2}', '{a1}')
lemmatized_pred = lemmatized_pred.replace('{a0} {a1}', '{a0}').replace('{a2}', '{a1}')
a0 = a0 + ' ' + a1
a1 = a2
propositions[i] = (tweet_id, sent, sf_pred, lemmatized_pred, 'a0', a0, 'a1', a1)
elif '{a1} {a2}' in lemmatized_pred:
sf_pred = sf_pred.replace('{a1} {a2}', '{a1}')
lemmatized_pred = lemmatized_pred.replace('{a1} {a2}', '{a1}')
a1 = a1 + ' ' + a2
propositions[i] = (tweet_id, sent, sf_pred, lemmatized_pred, 'a0', a0, 'a1', a1)
# 0 - tweet_id, 1 - sentence, 2 - predicate, 3 - lemmatized predicate, 4 - "A0", 5 - A0, 6 - "A1", 7 - A1
    propositions = [(item[0], item[1], item[2][:item[2].index('{a1}') + len('{a1}')].strip(),
                     item[3][:item[3].index('{a1}') + len('{a1}')].strip(), item[5], item[7])
                    for item in propositions
                    if len(item) >= 8 and '{a1}' in item[2] and '{a1}' in item[3]]
    # Remove non-English sentences, those with short arguments, and trivial / too general predicates
    propositions = [(tweet_id, sent, surface_pred, pred, a0, a1) for (tweet_id, sent, surface_pred, pred, a0, a1)
                    in propositions if guessLanguage(sent) == 'en' and len(a0) >= 2 and len(a1) >= 2
                    and pred not in ['{a0} {a1}', '{a1} {a0}', '{a0} be {a1}', '{a1} be {a0}']]
# Find predicates that match by argument
predicate_alignments = pair_aligned_propositions(propositions, pronouns)
# Keep one tweet id pair and one (s,v,o) tuple for each instance
filtered = {tuple(sorted([tweet_id1, tweet_id2])):
(tweet_id1, sent1, sf_pred1, pred1, sent1_a0, sent1_a1,
tweet_id2, sent2, sf_pred2, pred2, sent2_a0, sent2_a1)
for (tweet_id1, sent1, sf_pred1, pred1, sent1_a0, sent1_a1,
tweet_id2, sent2, sf_pred2, pred2, sent2_a0, sent2_a1)
in predicate_alignments}.values()
filtered = {(tuple([pred1, sent1_a0, sent1_a1]), tuple([pred2, sent2_a0, sent2_a1])):
(tweet_id1, sent1, sf_pred1, pred1, sent1_a0, sent1_a1,
tweet_id2, sent2, sf_pred2, pred2, sent2_a0, sent2_a1)
for (tweet_id1, sent1, sf_pred1, pred1, sent1_a0, sent1_a1,
tweet_id2, sent2, sf_pred2, pred2, sent2_a0, sent2_a1)
in filtered}.values()
print 'Extracted %d instances' % len(filtered)
    # Write the predicate alignments to the output file
    out_file = out_file.replace('.prop', '')
    if len(filtered) > 0:
        # The date is taken from the tweets file name, e.g. 'tweets/2017-01-01' -> '2017-01-01'
        date = os.path.basename(tweets_file)
        with codecs.open(out_file, 'w', 'utf-8') as f_out:
            for prop_pair in filtered:
                try:
                    print >> f_out, date + '\t' + '\t'.join(prop_pair)
                except Exception as e:
                    print 'Error writing instance: %s' % e


def get_candidate_pairs(propositions, pronouns):
    """
    Return candidate proposition pairs that share a non stop word argument word
    :param propositions: the (tweet_id, sent, sf_pred, pred, a0, a1) tuples
    :param pronouns: a set of pronouns, used to discard pronominal arguments
    :return: a set of concatenated proposition pairs
    """
    global nlp
    # Remove propositions with pronoun arguments and very short sentences
    propositions = [(tweet_id, sent, sf_pred, pred, a0, a1) for (tweet_id, sent, sf_pred, pred, a0, a1) in propositions
                    if len(pronouns.intersection(set([a0, a1]))) == 0 and len(sent) > 10]
    print 'Extracted %d propositions' % len(propositions)
    # Build an inverted index from each non stop word argument word
    # to the indices of the propositions in which it occurs
    candidates_by_args = defaultdict(set)
    for i, (tweet_id, sent, sf_pred, pred, a0, a1) in enumerate(propositions):
        for w in a0.split() + a1.split():
            if not nlp.is_stop(w):
                candidates_by_args[w].add(i)
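    # e.g., two hypothetical propositions whose arguments are 'US government' and
    # 'government shutdown' are both indexed under 'government', so the pair is
    # emitted as a candidate below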
# Get pairwise candidates from all lists
candidates = set([propositions[i] + propositions[j] for lst in candidates_by_args.values()
for i in lst for j in lst if i != j])
print 'Extracted %d candidates' % len(candidates)
return candidates


def pair_aligned_propositions(propositions, pronouns):
    """
    Align predicates that share their arguments across different tweets
    :param propositions: the (tweet_id, sent, sf_pred, pred, a0, a1) tuples
    :param pronouns: a set of pronouns, passed on to the candidate extraction
    :return: a list of aligned proposition pairs
    """
predicate_alignments = []
candidates = get_candidate_pairs(propositions, pronouns)
for (tweet_id1, sent1, sf_pred1, pred1, s0_a0, s0_a1, tweet_id2, sent2, sf_pred2, pred2, s1_a0, s1_a1) in candidates:
        # Skip near-duplicate tweets
        if fuzz.token_sort_ratio(sent1, sent2) >= 95:
            continue
# Same predicates?
if is_eq_preds(pred1, pred2):
continue
# Same arguments?
is_eq_a0_a0, is_eq_a1_a1, is_eq_a0_a1, is_eq_a1_a0 = \
is_eq_arg(s0_a0, s1_a0), is_eq_arg(s0_a1, s1_a1), is_eq_arg(s0_a0, s1_a1), is_eq_arg(s0_a1, s1_a0)
# Are arguments aligned?
is_aligned_a0_a0 = is_eq_a0_a0 or is_aligned_arg(s0_a0, s1_a0)
is_aligned_a1_a1 = is_eq_a1_a1 or is_aligned_arg(s0_a1, s1_a1)
is_aligned_a0_a1 = is_eq_a0_a1 or is_aligned_arg(s0_a0, s1_a1)
is_aligned_a1_a0 = is_eq_a1_a0 or is_aligned_arg(s0_a1, s1_a0)
# Are predicates aligned?
is_aligned_pred = is_aligned_preds(pred1, pred2)
# 1) the predicates are not equal, one argument-pair is aligned/equal, the other argument-pair is equal =>
# predicates are aligned
if (is_eq_a0_a0 and is_aligned_a1_a1) or (is_aligned_a0_a0 and is_eq_a1_a1):
predicate_alignments.append((tweet_id1, sent1, sf_pred1, pred1, s0_a0, s0_a1,
tweet_id2, sent2, sf_pred2, pred2, s1_a0, s1_a1))
continue
# 2) all three items are aligned
if is_aligned_pred and is_aligned_a0_a0 and is_aligned_a1_a1:
predicate_alignments.append((tweet_id1, sent1, sf_pred1, pred1, s0_a0, s0_a1,
tweet_id2, sent2, sf_pred2, pred2, s1_a0, s1_a1))
continue
# Same as before, but with reversed arguments
if (is_eq_a0_a1 and is_aligned_a1_a0) or (is_aligned_a0_a1 and is_eq_a1_a0):
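            # Swap the {a0}/{a1} placeholders in pred2 (via a temporary ARG0 token)
            # so that its argument order matches pred1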
new_pred2 = pred2.replace('{a0}', 'ARG0').replace('{a1}', '{a0}').replace('ARG0', '{a1}')
new_sf_pred2 = sf_pred2.replace('{a0}', 'ARG0').replace('{a1}', '{a0}').replace('ARG0', '{a1}')
predicate_alignments.append((tweet_id1, sent1, sf_pred1, pred1, s0_a0, s0_a1,
tweet_id2, sent2, new_sf_pred2, new_pred2, s1_a1, s1_a0))
continue
if is_aligned_pred and is_aligned_a0_a1 and is_aligned_a1_a0:
new_pred2 = pred2.replace('{a0}', 'ARG0').replace('{a1}', '{a0}').replace('ARG0', '{a1}')
new_sf_pred2 = sf_pred2.replace('{a0}', 'ARG0').replace('{a1}', '{a0}').replace('ARG0', '{a1}')
predicate_alignments.append((tweet_id1, sent1, sf_pred1, pred1, s0_a0, s0_a1,
tweet_id2, sent2, new_sf_pred2, new_pred2, s1_a1, s1_a0))
continue
return predicate_alignments


def is_eq_arg(x, y):
    """
    Return whether these two arguments are equal, with fuzzy string matching
    and number-to-word normalization.
    :param x: the first argument
    :param y: the second argument
    :return: Whether they are equal
    """
if fuzz.ratio(x, y) >= 90:
return True
    # Convert numbers to words
    x_words = [num2words(int(w)).replace('-', ' ') if w.isdigit() else w for w in x.split()]
    y_words = [num2words(int(w)).replace('-', ' ') if w.isdigit() else w for w in y.split()]
    # Fuzzy match again after number normalization, e.g. '2 girls' vs. 'two girls'
    return fuzz.ratio(' '.join(x_words), ' '.join(y_words)) >= 85


def is_eq_preds(p1, p2):
    """
    Return whether these two predicates are equal, with fuzzy string matching.
    :param p1: the first predicate
    :param p2: the second predicate
    :return: Whether they are equal
    """
    # Fuzzy string match (Levenshtein-based)
    if fuzz.ratio(p1, p2) >= 90:
        return True
    # Same predicate, differing only by an auxiliary 'be' / 'have'
    if p1.replace('{a0} ', '{a0} be ') == p2 or p1.replace('{a0} ', '{a0} have ') == p2 or \
            p2.replace('{a0} ', '{a0} be ') == p1 or p2.replace('{a0} ', '{a0} have ') == p1:
        return True
    return False


def is_aligned_preds(x, y):
    """
    Return whether these two predicates are aligned: they occur in the same WordNet synset.
    :param x: the first predicate
    :param y: the second predicate
    :return: Whether they are aligned
    """
global nlp
x_synonyms = set([lemma.lower().replace('_', ' ') for synset in wn.synsets(x) for lemma in synset.lemma_names()])
y_synonyms = set([lemma.lower().replace('_', ' ') for synset in wn.synsets(y) for lemma in synset.lemma_names()])
return len([w for w in x_synonyms.intersection(y_synonyms) if not nlp.is_stop(w)]) > 0


def is_aligned_arg(x, y):
"""
Return whether these two arguments are aligned: they occur in the same WordNet synset.
:param x: the first argument
:param y: the second argument
:return: Whether they are aligned
"""
global nlp
    # Allow partial matching: one argument is contained in the other
    # (the padding spaces approximate word boundaries)
    if fuzz.partial_ratio(' ' + x + ' ', ' ' + y + ' ') == 100:
        return True
x_words = [w for w in x.split() if not nlp.is_stop(w)]
y_words = [w for w in y.split() if not nlp.is_stop(w)]
if len(x_words) == 0 or len(y_words) == 0:
return False
x_synonyms = [set([lemma.lower().replace('_', ' ') for synset in wn.synsets(w) for lemma in synset.lemma_names()])
for w in x_words]
y_synonyms = [set([lemma.lower().replace('_', ' ') for synset in wn.synsets(w) for lemma in synset.lemma_names()])
for w in y_words]
# One word - check whether there is intersection between synsets
if len(x_synonyms) == 1 and len(y_synonyms) == 1 and \
len([w for w in x_synonyms[0].intersection(y_synonyms[0]) if not nlp.is_stop(w)]) > 0:
return True
    # More than one word - count the word pairs from x and y that share a synonym,
    # and require a match for at least 75% of the words of the longer argument
    intersections = [len([w for w in s1.intersection(s2) if not nlp.is_stop(w)])
                     for s1 in x_synonyms for s2 in y_synonyms]
    if len([intersection_len for intersection_len in intersections if intersection_len > 0]) >= \
            0.75 * max(len(x_synonyms), len(y_synonyms)):
        return True
    return False


if __name__ == '__main__':
main()