Skip to content

Commit ed2815f

Browse files
Merge pull request #162 from WorksApplications/feature/update-ssyn2es
update ssyn2es
2 parents 07753ff + 52ddc64 commit ed2815f

File tree

2 files changed

+101
-15
lines changed

2 files changed

+101
-15
lines changed

docs/ssyn2es.py

Lines changed: 98 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,34 +2,121 @@
22

33
import argparse
44
import fileinput
5+
import re
6+
import sys
7+
import unicodedata
58

6-
def main():
7-
parser = argparse.ArgumentParser(prog="ssyn2es.py", description="convert Sudachi synonyms to ES")
8-
parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
9-
parser.add_argument('-p', '--output-predicate', action='store_true', help='output predicates')
9+
10+
def parse_args():
11+
parser = argparse.ArgumentParser(
12+
prog="ssyn2es.py", description="convert Sudachi synonyms to Solr format")
13+
parser.add_argument('files', metavar='FILE', nargs='*',
14+
help='files to read, if empty, stdin is used')
15+
16+
parser.add_argument("--discard-punctuation", action='store_true',
17+
help='if set, skip words that consist of puctuation chars')
18+
parser.add_argument('-p', '--output-predicate', action='store_true',
19+
help='if set, output predicates')
1020
args = parser.parse_args()
21+
return args
22+
1123

24+
def load_synonyms(files, output_predicate, discard_punctuation):
1225
synonyms = {}
13-
with fileinput.input(files = args.files) as input:
14-
for line in input:
26+
with fileinput.input(files=files) as input:
27+
for i, line in enumerate(input):
1528
line = line.strip()
1629
if line == "":
1730
continue
31+
1832
entry = line.split(",")[0:9]
19-
if entry[2] == "2" or (not args.output_predicate and entry[1] == "2"):
33+
headword = escape_comma(unescape_unicode_literal(entry[8]))
34+
35+
is_deleted = (entry[2] == "2")
36+
is_predicate = (entry[1] == "2")
37+
if is_deleted or (is_predicate and not output_predicate):
38+
continue
39+
if (is_punctuation_word(headword) and discard_punctuation):
40+
print(f"skip punctuation entry {entry[8]} at line {i}",
41+
file=sys.stderr)
2042
continue
43+
2144
group = synonyms.setdefault(entry[0], [[], []])
22-
group[1 if entry[2] == "1" else 0].append(entry[8])
45+
group[1 if entry[2] == "1" else 0].append(headword)
46+
47+
return synonyms
48+
49+
50+
unicode_literal_pattern = re.compile(
51+
r"""\\u([0-9a-fA-F]{4}|\{[0-9a-fA-F]+\})""")
52+
53+
54+
def _repl_uncode_literal(m):
55+
return chr(int(m.group(1).strip("{}"), 16))
2356

57+
58+
def unescape_unicode_literal(word):
59+
return unicode_literal_pattern.sub(_repl_uncode_literal, word)
60+
61+
62+
def escape_comma(word):
63+
return word.replace(",", "\,")
64+
65+
66+
# Unicode General Category list, that is used for punctuation in elasticsearch_sudachi
67+
# see: com.worksap.nlp.lucene.sudachi.ja.util.Strings
68+
punctuation_categories = [
69+
"Zs", # Character.SPACE_SEPARATOR
70+
"Zl", # Character.LINE_SEPARATOR
71+
"Zp", # Character.PARAGRAPH_SEPARATOR
72+
"Cc", # Character.CONTROL
73+
"Cf", # Character.FORMAT
74+
"Pd", # Character.DASH_PUNCTUATION
75+
"Ps", # Character.START_PUNCTUATION
76+
"Pe", # Character.END_PUNCTUATION
77+
"Pc", # Character.CONNECTOR_PUNCTUATION
78+
"Po", # Character.OTHER_PUNCTUATION
79+
"Sm", # Character.MATH_SYMBOL
80+
"Sc", # Character.CURRENCY_SYMBOL
81+
"Sk", # Character.MODIFIER_SYMBOL
82+
"So", # Character.OTHER_SYMBOL
83+
"Pi", # Character.INITIAL_QUOTE_PUNCTUATION
84+
"Pf", # Character.FINAL_QUOTE_PUNCTUATION
85+
]
86+
87+
88+
def is_punctuation_word(word: str):
89+
# return True if all characters are in punctuation categories.
90+
for c in word:
91+
category = unicodedata.category(c)
92+
if category not in punctuation_categories:
93+
return False
94+
return True
95+
96+
97+
def dump_synonyms(synonyms, file=None):
2498
for groupid in sorted(synonyms):
2599
group = synonyms[groupid]
26100
if not group[1]:
27101
if len(group[0]) > 1:
28-
print(",".join(group[0]))
102+
print(",".join(group[0]), file=file)
29103
else:
30104
if len(group[0]) > 0 and len(group[1]) > 0:
31-
print(",".join(group[0]) + "=>" + ",".join(group[0] + group[1]))
105+
print(",".join(group[0]) + "=>" +
106+
",".join(group[0] + group[1]), file=file)
107+
return
108+
109+
110+
def main():
111+
args = parse_args()
112+
113+
synonyms = load_synonyms(
114+
args.files,
115+
args.output_predicate,
116+
args.discard_punctuation,
117+
)
118+
dump_synonyms(synonyms)
32119

33120

34121
if __name__ == "__main__":
35-
main()
122+
main()

docs/synonym.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,16 @@ You can partially make use of the Sudachi synonym resource's detailed informatio
3030

3131
### Punctuation Symbols
3232

33-
You may need to remove certain synonym words such as `` and `` when you use the analyzer with setting `"discard_punctuation": true` (Otherwise you will get an error, e.g., `"term: € was completely eliminated by analyzer"`). Alternatively, you can set `"lenient": true` for the synonym filter to ignore the exceptions.
33+
You may need to remove certain synonym words such as `` and `` when you use the analyzer with setting `"discard_punctuation": true` (Otherwise you will get an error, e.g., `"term: € was completely eliminated by analyzer"`). If you are using [ssyn2es.py](./ssyn2es.py), use `--discard-punctuation` option to skip those words. Alternatively, you can set `"lenient": true` for the synonym filter to ignore the exceptions.
3434

35-
These symbols are defined as punctuations; See [SudachiTokenizer.java](https://github.com/WorksApplications/elasticsearch-sudachi/blob/develop/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java#L140) for the detail.
35+
These symbols are defined as punctuations; See [Strings.java](https://github.com/WorksApplications/elasticsearch-sudachi/blob/develop/src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Strings.java) for the detail.
3636

3737

3838
## Synonym Filter
3939

4040
You can use the converted Solr format file with Elasticsearch's default synonym filters, [Synonym token filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html) or [Synonym graph filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html).
4141

42+
As the `sudachi_split` filter produces a token graph, you *cannot* use it with a synonym filter.
4243

4344
### Example: Set up
4445

@@ -73,8 +74,6 @@ You can use the converted Solr format file with Elasticsearch's default synonym
7374

7475
Here we assume that the converted synonym file is placed as `$ES_PATH_CONF/sudachi/synonym.txt`.
7576

76-
If you would like to use `sudachi_split` filter, set it *after* the synonym filter (otherwise you will get an error, e.g., `term: 不明確 analyzed to a token (不) with position increment != 1 (got: 0)`).
77-
7877

7978
### Example: Analysis
8079

0 commit comments

Comments
 (0)