# nlp.py
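"""NLP services for Whyis: autonomic agents that extract plain text from
HTML posts (HTML2Text), chunk that text into noun-phrase terms with
frequencies (EntityExtractor), link those terms to entities in the
knowledge graph (EntityResolver), and compute inverse document frequency
for the linked concepts (IDFCalculator)."""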
import collections
import re
from math import log10

import nltk
from bs4 import BeautifulSoup
from rdflib import RDF, Literal, URIRef
from slugify import slugify

import autonomic
import nanopub
from whyis.namespace import sioc_types, sioc, sio, dc, prov, whyis

class HTML2Text(autonomic.UpdateChangeService):
    """Extract plain text from the HTML content of a sioc:Post."""

    activity_class = whyis.TextFromHTML

    def getInputClass(self):
        return sioc.Post

    def getOutputClass(self):
        return URIRef("http://purl.org/dc/dcmitype/Text")

    def get_query(self):
        return '''select ?resource where { ?resource <http://rdfs.org/sioc/ns#content> [].}'''

    def process(self, i, o):
        content = i.value(sioc.content)
        soup = BeautifulSoup(content, 'html.parser')
        # Join block elements with newlines so sentence boundaries survive.
        text = soup.get_text("\n")
        o.add(URIRef("http://schema.org/text"), Literal(text))

class IDFCalculator(autonomic.UpdateChangeService):
    """Compute inverse document frequency (IDF) for every concept that
    appears as a resolved term in this document."""

    activity_class = whyis.InverseDocumentFrequencyCalculation

    def getInputClass(self):
        return autonomic.whyis.ResolvedText

    def getOutputClass(self):
        return autonomic.whyis.TFIDFText

    @property
    def document_count(self):
        # Total number of documents that contain at least one resolved term.
        return list(self.app.db.query('''select (count(distinct ?node) as ?count) where {
    ?node sio:hasPart [ prov:specializationOf ?concept; a sio:Term].
}''', initNs=dict(sio=sio, prov=prov)))[0][0].value

    def process(self, i, o):
        document_count = float(self.document_count)
        # For each concept in this document, count the documents it appears
        # in, and fetch any existing IDF assertion so it can be revised.
        query = """select distinct ?concept (count(distinct ?othernode) as ?count) ?assertion where {
    ?node sio:hasPart [ a sio:Term; prov:specializationOf ?concept].
    ?othernode sio:hasPart [ a sio:Term; prov:specializationOf ?concept].
    optional {
        graph ?assertion {
            ?concept sio:InverseDocumentFrequency ?idf.
        }
    }
} group by ?concept ?assertion"""
        for concept, count, assertion in self.app.db.query(query,
                                                           initNs=dict(sio=sio, prov=prov),
                                                           initBindings=dict(node=i.identifier)):
            idf = log10(document_count / count.value)
            npub = nanopub.Nanopublication(store=o.graph.store)
            if assertion is not None:
                # Mark the new assertion as a revision of the existing one.
                npub.pubinfo.add((npub.assertion.identifier, prov.wasRevisionOf, assertion))
            npub.assertion.add((concept, sio.InverseDocumentFrequency, Literal(idf)))
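
# Worked example for the IDF formula above (illustrative numbers): with
# 1,000 documents in the store and a concept appearing in 10 of them,
# idf = log10(1000 / 10) = 2.0; a concept in every document scores 0.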

class EntityResolver(autonomic.UpdateChangeService):
    """Link extracted terms to existing entities in the knowledge graph
    using Blazegraph full-text search (the bds: namespace)."""

    activity_class = whyis.EntityResolution

    def getInputClass(self):
        return autonomic.whyis.ExtractedText

    def getOutputClass(self):
        return autonomic.whyis.ResolvedText

    def resolve(self, term, context):
        # Rank candidate entities by label relevance, boosted by how well
        # their other property values match the document context.
        query = """prefix skos: <http://www.w3.org/2004/02/skos/core#>
prefix foaf: <http://xmlns.com/foaf/0.1/>
prefix bds: <http://www.bigdata.com/rdf/search#>

select distinct ?node ?label (coalesce(?relevance+?cr, ?relevance) as ?score) ?relevance ?cr where {
  ?node dc:title|rdfs:label|skos:prefLabel|skos:altLabel|foaf:name ?label.
  ?label bds:search '''%s''';
         bds:matchAllTerms "false";
         bds:relevance ?relevance;
         bds:minRelevance 0.4.
  optional {
    ?node ?p ?context.
    ?context bds:search '''%s''';
             bds:matchAllTerms "false";
             bds:relevance ?cr;
             bds:minRelevance 0.4.
  }
  filter not exists { ?node a <http://semanticscience.org/resource/Term> }
  filter not exists { ?node a <http://www.nanopub.org/nschema#Nanopublication> }
  filter not exists { ?node a <http://www.nanopub.org/nschema#Assertion> }
  filter not exists { ?node a <http://www.nanopub.org/nschema#Provenance> }
  filter not exists { ?node a <http://www.nanopub.org/nschema#PublicationInfo> }
} order by desc(?score) limit 10""" % (term, context)
        # Return the highest-scoring candidate, if any.
        for node, label, score, relevance, cr in self.app.db.query(query):
            return node, label, score
        return None, None, None

    def process(self, i, o):
        # Use the 20 most frequent terms in the document as disambiguation context.
        context = ' '.join([term.value(sio.hasValue) for term
                            in sorted(i[sio.hasPart], reverse=True,
                                      key=lambda term: term.value(sio.Frequency))[:20]])
        for term in i[sio.hasPart]:
            term_label = term.value(sio.hasValue)
            o_term = o.graph.resource(term.identifier)
            # resolve() returns (node, label, score); the original code
            # unpacked these in the wrong order (node, score, label).
            node, label, score = self.resolve(term_label, context)
            if node is not None:
                o_term.add(autonomic.prov.specializationOf, node)
                o.add(dc.subject, node)

class EntityExtractor(autonomic.UpdateChangeService):
    """Extract candidate entity mentions (noun phrases) from text fields
    and record their term frequencies."""

    activity_class = whyis.EntityExtraction

    grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>+}  # chunk determiner/possessive, adjectives, and nouns
        {<NNP>+<NN>*}           # chunk sequences of proper nouns
        }<VBD|IN|DT|PP>+{       # chink verbs, prepositions, and determiners
    """
    cp = nltk.RegexpParser(grammar)

    property_path = "<http://schema.org/text>|<http://purl.org/dc/terms/summary>|<http://purl.org/dc/terms/abstract>|<http://purl.org/dc/terms/description>|<http://www.w3.org/2000/01/rdf-schema#comment>|<http://www.w3.org/2004/02/skos/core#definition>"

    def __init__(self, property_path=None):
        # Fetch the NLTK models needed for tokenizing, tagging, and chunking.
        nltk.download('all', quiet=True)
        if property_path is not None:
            self.property_path = property_path

    def getInputClass(self):
        return URIRef("http://purl.org/dc/dcmitype/Text")

    def getOutputClass(self):
        return autonomic.whyis.ExtractedText

    def get_query(self):
        return '''select ?resource where { ?resource <http://schema.org/text> [].}'''

    def process(self, i, o):
        documents = self.app.db.query('''select ?text where { %s %s ?text.}'''
                                      % (i.identifier.n3(), self.property_path))
        tf = self.tf(documents)
        for t, f in list(tf.items()):
            term = o.graph.resource(URIRef(i.identifier + "-term-" + slugify(t)))
            term.add(RDF.type, sio.Term)
            term.add(sio.hasValue, Literal(t))
            term.add(sio.Frequency, Literal(f))
            o.add(sio.hasPart, term)

    def tf(self, documents):
        # Term frequency: count noun-phrase mentions across all text fields,
        # then normalize by the total number of mentions.
        term_vector = collections.defaultdict(float)
        all_mentions = []
        for document, in documents:
            # Make sure discrete lines become individual sentences, then
            # remove the extra periods this can introduce.
            document = document.value.replace("\n", ".\n")
            document = document.replace("..\n", ".\n")
            sentences = nltk.sent_tokenize(document)
            sentences = [nltk.word_tokenize(sent) for sent in sentences]
            sentences = [nltk.pos_tag(sent) for sent in sentences]
            sentences = [self.cp.parse(s) for s in sentences]
            nps = [subtree for s in sentences for subtree in s.subtrees()
                   if subtree.label() == "NP"]
            mentions = [re.sub(r"\.$", "", ' '.join(word for word, pos in np.leaves()).lower())
                        for np in nps]
            all_mentions.extend(mentions)
            for mention in mentions:
                term_vector[mention] += 1
        # Guard against documents with no detected mentions.
        if not all_mentions:
            return {}
        tf_vector = dict((mention, count / len(all_mentions))
                         for mention, count in term_vector.items())
        return tf_vector
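

# A minimal standalone sketch of the noun-phrase chunking EntityExtractor
# relies on, runnable without a Whyis instance. The sample sentence and the
# targeted model downloads are illustrative assumptions; the pipeline itself
# downloads all NLTK data in EntityExtractor.__init__.
if __name__ == "__main__":
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    sample = "The quick brown fox jumped over the lazy dog."  # hypothetical input
    tagged = nltk.pos_tag(nltk.word_tokenize(sample))
    tree = EntityExtractor.cp.parse(tagged)
    for subtree in tree.subtrees():
        if subtree.label() == "NP":
            # Prints each detected noun phrase; note the chink rule strips
            # determiners back out of the chunks.
            print(' '.join(word for word, pos in subtree.leaves()))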