-
Notifications
You must be signed in to change notification settings - Fork 0
/
fextract.py
143 lines (132 loc) · 6.64 KB
/
fextract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import csv
import re
import nltk
import sys
import myutils
def extract_contents(ent):
    """Yield (section-name, text-list) pairs for the 'added', 'removed'
    and 'comment' sections of a revision entry dict.

    Sections missing from *ent* are skipped; a warning naming the
    missing section is written to stderr for each one.
    """
    accessors = {'added': lambda x: x['entry']['content']['added'],
                 'removed': lambda x: x['entry']['content']['removed'],
                 'comment': lambda x: x['entry']['comment'], }
    for (name, ext) in accessors.items():
        try:
            yield (name, ext(ent))
        except KeyError:
            # sys.stderr.write works under both Python 2 and 3; the
            # original used the Python-2-only "print >>sys.stderr" form.
            sys.stderr.write('no attribute %s %s\n' % (name, ent))
def longest_subsequence(x, y):
    """Return (start, end) of the longest run of positions where x and y
    hold equal items, comparing index-by-index up to len(y).

    Ties are broken in favour of the later run; with no match at all the
    result is an empty span (best_start, best_start).
    """
    limit = len(y)
    best_start, best_len = 0, 0
    run_start, run_len = 0, 0
    for idx, item in enumerate(x):
        if idx >= limit:
            break
        if item == y[idx]:
            if run_len == 0:
                run_start = idx  # a new run begins here
            run_len += 1
        else:
            run_len = 0
        # ">=" so an equal-length later run replaces an earlier one
        if run_len >= best_len:
            best_start = run_start
            best_len = run_len
    return (best_start, best_start + best_len)
def uncapitalize(str):
    """Lower-case the longest all-caps run in the input when that run is
    longer than 50 characters; otherwise return the input unchanged.

    NOTE(review): the parameter shadows the builtin ``str``; kept as-is
    for interface compatibility with existing callers.
    """
    (start, end) = longest_subsequence(str, str.upper())
    if end - start > 50:
        return str[:start] + str[start:end].lower() + str[end:]
    return str
class WikiPatternExtractor:
    """Regex-pattern features loaded from a tab-separated pattern file.

    Each non-comment line of the file is ``name<TAB>pattern`` with an
    optional third field naming the pattern kind: 'binary' (or empty)
    yields a single presence feature, 'expand' yields one feature per
    match with the captured groups appended to the feature name.
    """

    def __init__(self, file='patterns.txt'):
        # NOTE(review): ``file`` shadows a builtin; kept for interface
        # compatibility.
        self.patterns = {}           # name -> compiled regex (binary features)
        self.patterns_expanded = {}  # name -> compiled regex (per-match features)
        # "with" closes the handle; the original leaked it.
        with open(file) as fh:
            for line in fh:
                line = myutils.comment_out(line).strip()
                if not line:
                    continue
                a = line.split('\t')
                expand = ''
                if len(a) == 3:
                    expand = a.pop()
                name, patt = a
                if 'expand' in expand:
                    self.patterns_expanded[name] = re.compile(patt)
                if 'binary' in expand or expand == '':
                    self.patterns[name] = re.compile(patt)

    def extract(self, entry):
        """Return {feature_name: True} for every pattern firing on any
        content section of *entry*."""
        ret = {}
        for (cname, contents) in extract_contents(entry):
            # Hoisted: the original re-joined the contents once per pattern.
            text = ' '.join(contents)
            for (pname, pat) in self.patterns.items():
                if pat.search(text):
                    ret['_'.join([pname, cname])] = True
            for (pname, pat) in self.patterns_expanded.items():
                for m in pat.finditer(text):
                    ret['_'.join([pname, cname] + list(m.groups()))] = True
        return ret

    def name(self):
        """Identifier tagging this extractor's output."""
        return 'WikiPatternExtractor'
class NgramExtractor:
    """Word n-gram presence features over each content section.

    Words are split on a crude regex; padding positions at either end of
    a section are rendered as '<>' in the feature name.
    """

    def __init__(self, n=2, lowercase=False):
        self.n = n
        # Raw string: the original non-raw literal relied on invalid
        # escapes (\{, \)) that newer Pythons warn about. Same pattern.
        # TODO: use the NLTK segmenter instead of this regex.
        self.wordsegment = re.compile(r'[ \{\}\n\\(\)\'>]')
        self.lowercase = lowercase

    def extract(self, entry):
        """Return {feature_name: True} for every n-gram in *entry*'s
        content sections (optionally de-capitalized first)."""
        ret = {}
        for (cname, contents) in extract_contents(entry):
            s = ' '.join(contents)
            if self.lowercase:
                s = uncapitalize(s)
            words = self.wordsegment.split(s)
            for ng in nltk.ngrams(words, self.n, pad_left=True, pad_right=True):
                # "is not None" instead of "!= None"; pads become '<>'.
                ret['_'.join([cname] + [x if x is not None else '<>' for x in ng])] = True
        return ret

    def name(self):
        """Identifier encoding the n-gram order and lowercase flag."""
        return 'NgramExtractor|%(n)s|lc=%(lowercase)s' % self.__dict__
class MediaWikiExtractor:
    """Placeholder extractor — not implemented yet."""
    pass
class SentiWordNetExtractor:
    """Per-word sentiment features based on a SentiWordNet TSV dump.

    Averages (positive - negative) over every synset a word occurs in,
    then emits one feature per lemmatized word in the entry whose
    average polarity exceeds the threshold in absolute value.
    """

    def __init__(self, file, threshold=0.1):
        # "with" closes the handle; the original leaked it.
        with open(file) as fh:
            table = list(csv.reader((ln for ln in fh if ln[0] != '#'),
                                    delimiter='\t'))
        # NOTE(review): modern NLTK names this WordNetLemmatizer — kept
        # as-is to match the NLTK version this code was written against.
        self.lemmatizer = nltk.stem.WordNetStemmer()
        # BUG FIX: the original assigned the literal 0.1, silently
        # ignoring the threshold argument.
        self.threshold = threshold
        self.words = {}  # word -> list of (pos_score, neg_score)
        for cols in table:
            if len(cols[0]) != 1:  # keep only single-letter POS rows
                continue
            synsets = cols[4].split(' ')
            pscore = float(cols[2])
            nscore = float(cols[3])
            if pscore == 0 and nscore == 0:  # skip neutral entries
                continue
            for ss in synsets:
                (w, n) = ss.split('#')
                self.words.setdefault(w, []).append((pscore, nscore))
        # Raw string: identical pattern, avoids invalid-escape warnings.
        self.wordsegment = re.compile(r'[ \{\}\n\\(\)\'>]')
        self.avg_scores = {}  # word -> mean(pos - neg) over its synsets
        for (w, scores) in self.words.items():
            total = 0.0
            for (p, n) in scores:
                total += p - n
            self.avg_scores[w] = total / len(scores)

    def extract(self, entry):
        """Return {feature_name: avg_score} for each sufficiently polar
        word found in *entry*'s content sections."""
        # TODO: try POS tagging and sense disambiguation
        ret = {}
        for (cname, contents) in extract_contents(entry):
            words = self.wordsegment.split(uncapitalize(' '.join(contents)))
            words = filter(lambda x: len(x) > 0, words)
            # strip one trailing punctuation character
            words = map(lambda x: re.sub(r'[\?\!\.,;:\-"\']$', '', x), words)
            words = map(lambda x: self.lemmatizer.lemmatize(x), words)
            for w in words:
                # .get instead of has_key (removed in Python 3)
                score = self.avg_scores.get(w)
                if score is not None and abs(score) > self.threshold:
                    ret['_'.join([cname, w])] = score
        return ret

    def name(self):
        """Identifier encoding the polarity threshold."""
        return 'SentiWordNetExtractor|%(threshold)f' % self.__dict__
if __name__ == '__main__':
    # Smoke-test examples: run every extractor over three sample talk-page
    # entries. print(...) is valid under both Python 2 and 3; the original
    # used Python-2-only print statements.
    for fx in [SentiWordNetExtractor('SentiWordNet_3.0.0_20100908.txt'),
               NgramExtractor(2),
               NgramExtractor(2, lowercase=True),
               WikiPatternExtractor()]:
        print(fx.name())
        print(fx.extract({"entry" : { "content" : {'added': ["{{welcome}}\n[[User:Jwrosenzweig|Jwrosenzweig]] 00:37, 1 Feb 2005 (UTC)\nP.S. I've reformatted [[Marrowstone, Washington]] a little so that the external link to Fort Flagler is in the external links section, and so that [[Fort Flagler]] links to the empty article on the fort (maybe you'd take a shot at writing it?). Give it a look if you have time. Thanks for your contributions: they're very appreciated!"], 'removed':[]}, "receiver" : "Vishakha", "sender" : "Jwrosenzweig", "id": {"rev_id" : 17231315}, "title" : "Vishakha", 'comment': [] } }))
        print(fx.extract({"entry" : { "content" : {'added': ["Hi, ElfineM, welcome to Wikipedia. I hope you like the place and choose to [[Wikipedia:Wikipedians|stay]]... Check out [[Template:Welcome]] for some good links to help you get started, if you need to.\n\nJust a quick point, if you want to comment on an article, usually the most recent talk goes at the bottom of the page. I've done this for you at [[Talk:Feminism]].\n\nIf you've any more questions you have, don't hesiste to ask me at my [[User talk:Dysprosia|talk page]], or on the [[Wikipedia:Village pump]].\n\nHave fun! [[User:Dysprosia|Dysprosia]] 05:10, 5 Feb 2005 (UTC)"], 'removed':[]}, "receiver" : "Lincspoacher", "sender" : "Dysprosia", "rev_id" : 10192272, "title" : "Lincspoacher", 'comment': [] } }))
        print(fx.extract({"entry" : { "content" : {'added': ["DO NOT POST THE FINAL AIRING OF WBHS AGAIN...WBHS WILL STAY ON AS LONG AS I AM A STUDENT AT BHS!!!! You are a sick human being to post \"I support Insurgency\" on your page...21:22, 8 February 2007 (UTC)Kgregory21:22, 8 February 2007 (UTC)"], 'removed':[]}, "receiver" : "Trillionaire", "sender" : "Kgregory", "id": {"rev_id" : 10192272}, "title" : "Trillionaire", 'comment': [] } }))