-
Notifications
You must be signed in to change notification settings - Fork 5
/
spacy_pos_ner_dep_zh.py
91 lines (75 loc) · 3.81 KB
/
spacy_pos_ner_dep_zh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -*- coding: utf-8 -*-
"""
Last Updated: Fri Jun 23 23:30:57 2023
@author: Ye Kyaw Thu, Visiting Professor, LST, NECTEC, Thailand.
POS/NER/dependency tagger for Chinese using the spaCy library.
If spaCy and the Chinese model are not installed yet:
pip install spacy
python -m spacy download zh_core_web_sm
How to run:
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t pos -f column
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t ner -f column
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t dep -f column
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t pos -f left-to-right
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t ner -f left-to-right
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t dep -f left-to-right
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t pos -f left-to-right -to
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t ner -f left-to-right -to
python spacy_pos_ner_dep_zh.py -i zh-sentence.txt -t dep -f left-to-right -to
Add -o FILE (or --output FILE) to save the result to a file instead of printing it.
"""
import spacy
import argparse
def _token_label(token, tag_type):
    """Return the label of *token* for the requested tag type ('pos', 'dep' or 'ner')."""
    if tag_type == 'pos':
        return token.pos_
    if tag_type == 'dep':
        return token.dep_
    # NER: a token outside any entity has an empty ent_type_; report it as 'O'.
    return token.ent_type_ or 'O'


def _render_sentence(sent, tag_type, format_type, tags_only):
    """Return the output chunks for one sentence, skipping whitespace tokens."""
    items = []
    for token in sent:
        if token.is_space:
            # Whitespace tokens (e.g. line breaks in the input) would corrupt
            # both output layouts, so they are never emitted.
            continue
        label = _token_label(token, tag_type)
        items.append(label if tags_only else f'{token.text}/{label}')

    if format_type == 'column':
        # One item per line, then a blank line that closes the sentence.
        return [f'{item}\n' for item in items] + ['\n']
    # Any other format value is treated as left-to-right: one sentence per line.
    return [' '.join(items), '\n']


def tag_text(input_file, output_file, tag_type, format_type, tags_only):
    """Tag a UTF-8 text file with the "zh_core_web_sm" spaCy pipeline.

    The text is processed sentence by sentence (``doc.sents``) for every tag
    type, and whitespace tokens are skipped. The NER output therefore follows
    the same per-sentence layout as the POS and dependency output.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the file to write; if falsy, the result is
            printed to standard output instead.
        tag_type: 'pos', 'ner' or 'dep'.
        format_type: 'column' writes one token per line with a blank line
            after each sentence; any other value writes one sentence per
            line with items separated by single spaces.
        tags_only: If true, emit only the tag; otherwise emit ``word/tag``.

    Raises:
        ValueError: If ``tag_type`` is not one of 'pos', 'ner', 'dep'.
    """
    # Validate first so a typo fails immediately instead of silently
    # producing empty output after the model has been loaded.
    if tag_type not in ('pos', 'ner', 'dep'):
        raise ValueError(
            f"Unknown tag type {tag_type!r}; expected 'pos', 'ner' or 'dep'."
        )

    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    nlp = spacy.load("zh_core_web_sm")
    doc = nlp(text)

    results = []
    for sent in doc.sents:
        results.extend(_render_sentence(sent, tag_type, format_type, tags_only))

    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(results)
    else:
        print(''.join(results))
def main(argv=None):
    """Parse the command-line options and run the tagger.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the arguments from ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Chinese text analysis.')
    parser.add_argument('-i', '--input', help='Input file name', required=True)
    parser.add_argument('-o', '--output', help='Output file name', required=False)
    # Restrict the tag type so that a typo is reported as a usage error
    # instead of producing empty output.
    parser.add_argument('-t', '--tag', help='Tag type (pos, ner, dep)', required=True,
                        choices=['pos', 'ner', 'dep'])
    parser.add_argument('-f', '--format', help='Output format (column or left-to-right)',
                        required=False, default='column')
    parser.add_argument('-to', '--tags-only', help='Output only tags without words',
                        required=False, action='store_true')
    args = parser.parse_args(argv)

    tag_text(args.input, args.output, args.tag, args.format, args.tags_only)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()