corenlp_parser.py

from __future__ import print_function
import os
import sys
import json
import urllib
import itertools
from tqdm import tqdm
from log_utils import LogUtil
from parser import Parser
from io_utils import read_lines
from ads_parser import AdsParser
from pycorenlp import StanfordCoreNLP

# The following two lines force Python 2's default string encoding to UTF-8
# so that pycorenlp can round-trip non-ASCII text without UnicodeDecodeError
reload(sys)
sys.setdefaultencoding('UTF8')


class CoreNLPParser(Parser):
    """ The CoreNLPParser class builds upon the Stanford CoreNLP package """

    CORENLP_PARSER = "edu.stanford.nlp.pipeline.CoreNLPServer"

    def __init__(self, corenlp_server_url, ner_model,
                 parser_name='corenlp_parser'):
        super(CoreNLPParser, self).__init__(parser_name)

        self.corenlp = StanfordCoreNLP(corenlp_server_url)
        self.props = {
            'annotators': 'tokenize,ssplit,lemma,pos,ner',
            'outputFormat': 'json',
            # Don't want the SUTime model
            'ner.useSUTime': False,
            # Don't want the numeric classifiers
            'ner.applyNumericClassifiers': False,
            'timeout': '60000',
            # Don't need fine-grained recognition with the CoreNLP built-in
            # NER models
            'ner.applyFineGrained': False
        }
        if ner_model:
            if not os.path.exists(ner_model):
                raise RuntimeError('NER model not found: %s' %
                                   os.path.abspath(ner_model))
            self.props['ner.model'] = ner_model

    def parse(self, text):
        """ Named entity recognition (NER) using the Stanford CoreNLP package

        Args:
            text (str): A string (can be a long string) on which named
                entity recognition will be run.
        Returns:
            A dictionary containing the named entities identified, the
            sentences extracted, and the name of the source parser.
        """
        if not isinstance(text, str):
            text = text.encode('utf8')

        if text[0].isspace():
            # Replace (rather than strip) a leading whitespace character so
            # that the character offsets reported by CoreNLP stay aligned
            # with the original text
            text = '.' + text[1:]

        # Quote (with percent-encoding) reserved characters in the URL for
        # CoreNLP
        text = urllib.quote(text)
        output = self.corenlp.annotate(text, properties=self.props)

        # Flatten sentences and tokens
        tokenlists = [s['tokens'] for s in output['sentences']]
        tokens = itertools.chain.from_iterable(tokenlists)

        names = []
        for token in tokens:
            if token['ner'] != 'O':
                name = {
                    'label': token['ner'],
                    'begin': token['characterOffsetBegin'],
                    'end': token['characterOffsetEnd'],
                    'text': token['originalText'],
                    'source': 'corenlp'
                }
                names.append(name)

        # Handle multi-word tokens: merge any adjacent Target tokens that
        # are of the same type and separated by a single space into one
        # span (e.g., "Gale" + "Crater" becomes "Gale Crater")
        names.sort(key=lambda x: int(x['begin']))
        new_names = []
        skip_names = []
        for n in names:
            if n in skip_names:
                continue
            next_name = [n2 for n2 in names if
                         n['label'] == 'Target' and
                         n2['label'] == 'Target' and
                         int(n2['begin']) == int(n['end']) + 1]
            if len(next_name) > 0:
                n['text'] += ' ' + next_name[0]['text']
                n['end'] = next_name[0]['end']
                skip_names.append(next_name[0])

            # Either way, save this one
            new_names.append(n)

        return {
            'ner': new_names,
            'X-Parsed-By': CoreNLPParser.CORENLP_PARSER,
            'sentences': output['sentences']
        }
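
# Illustrative usage of CoreNLPParser (a sketch, not part of the pipeline;
# it assumes a CoreNLP server is already running at the URL below, and the
# example sentence is a placeholder):
#
#   corenlp_parser = CoreNLPParser('http://localhost:9000', ner_model=None)
#   results = corenlp_parser.parse('Olivine was detected at the crater rim.')
#   for name in results['ner']:
#       print(name['label'], name['text'], name['begin'], name['end'])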


def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil('corenlp-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    # Guard against ner_model being None before taking its absolute path
    logger.info('ner_model: %s' %
                (os.path.abspath(ner_model) if ner_model else None))
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided '
              'simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    corenlp_parser = CoreNLPParser(corenlp_server_url, ner_model)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    # Open the output file with line buffering so each JSON document is
    # flushed as soon as it is written
    out_f = open(out_file, 'wb', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            corenlp_dict = corenlp_parser.parse(ads_dict['content'])

            ads_dict['metadata']['ner'] = corenlp_dict['ner']
            ads_dict['metadata']['X-Parsed-By'].append(
                corenlp_dict['X-Parsed-By'])
            ads_dict['metadata']['sentences'] = corenlp_dict['sentences']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('CoreNLP parser failed: %s' % os.path.abspath(f))
            logger.error(e)
    out_f.close()
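
# Each line written to out_file is one standalone JSON document (JSON Lines).
# Based on the code above, its shape is roughly the following sketch (field
# values are placeholders, not real output):
#
#   {"content": "...",
#    "metadata": {"ner": [{"label": "...", "begin": 0, "end": 7, ...}],
#                 "X-Parsed-By": [..., "edu.stanford.nlp.pipeline.CoreNLPServer"],
#                 "sentences": [...],
#                 ...}}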


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    input_parser = parser.add_mutually_exclusive_group(required=True)

    input_parser.add_argument('-i', '--in_file', help='Path to input file')
    input_parser.add_argument('-li', '--in_list', help='Path to input list')
    parser.add_argument('-o', '--out_file', required=True,
                        help='Path to output JSON file')
    parser.add_argument('-l', '--log_file',
                        default='./corenlp-parser-log.txt',
                        help='Log file that contains processing information. '
                             'Defaults to ./corenlp-parser-log.txt unless '
                             'otherwise specified.')
    parser.add_argument('-p', '--tika_server_url', required=False,
                        help='Tika server URL')
    parser.add_argument('-c', '--corenlp_server_url',
                        default='http://localhost:9000',
                        help='CoreNLP server URL')
    parser.add_argument('-n', '--ner_model', required=False,
                        help='Path to a Named Entity Recognition (NER) model')
    parser.add_argument('-a', '--ads_url',
                        default='https://api.adsabs.harvard.edu/v1/search/query',
                        help='ADS RESTful API. The ADS RESTful API should not '
                             'need to be changed frequently unless something '
                             'at the ADS is changed.')
    parser.add_argument('-t', '--ads_token',
                        default='jON4eu4X43ENUI5ugKYc6GZtoywF376KkKXWzV8U',
                        help='The ADS token, which is required to use the ADS '
                             'RESTful API. The token was obtained using the '
                             'instructions at '
                             'https://github.com/adsabs/adsabs-dev-api#access. '
                             'The ADS token should not need to be changed '
                             'frequently unless something at the ADS is '
                             'changed.')

    args = parser.parse_args()
    process(**vars(args))
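
# Example invocations (a sketch; the input, output, and model paths are
# hypothetical):
#
#   python corenlp_parser.py -i docs/example.pdf -o ner.jsonl
#   python corenlp_parser.py -li doc_list.txt -o ner.jsonl \
#       -c http://localhost:9000 -n ner_model.ser.gz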