Skip to content

Commit c68de12

Browse files
unescape unicode literal
1 parent 3f75b7c commit c68de12

File tree

1 file changed

+17
-2
lines changed

1 file changed

+17
-2
lines changed

docs/ssyn2es.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import argparse
44
import fileinput
5+
import re
56
import sys
67
import unicodedata
78

@@ -27,23 +28,37 @@ def load_synonyms(files, output_predicate, discard_punctuation):
2728
line = line.strip()
2829
if line == "":
2930
continue
31+
3032
entry = line.split(",")[0:9]
33+
headword = unescape(entry[8])
3134

3235
is_deleted = (entry[2] == "2")
3336
is_predicate = (entry[1] == "2")
3437
if is_deleted or (is_predicate and not output_predicate):
3538
continue
36-
if (is_punctuation_word(entry[8]) and discard_punctuation):
39+
if (is_punctuation_word(headword) and discard_punctuation):
3740
print(f"skip punctuation entry {entry[8]} at line {i}",
3841
file=sys.stderr)
3942
continue
4043

4144
group = synonyms.setdefault(entry[0], [[], []])
42-
group[1 if entry[2] == "1" else 0].append(entry[8])
45+
group[1 if entry[2] == "1" else 0].append(headword)
4346

4447
return synonyms
4548

4649

50+
# Matches a unicode escape written literally in the dictionary source:
# either ``\uXXXX`` (exactly four hex digits) or ``\u{X...}`` (one or more
# hex digits wrapped in braces).
unicode_literal_pattern = re.compile(
    r"""\\u([0-9a-fA-F]{4}|\{[0-9a-fA-F]+\})""")


def _repl_uncode_literal(m):
    """Return the character denoted by one unicode-literal match.

    ``m`` is a match object produced by ``unicode_literal_pattern``; group 1
    holds the hex digits, optionally wrapped in braces.

    A code point outside the valid Unicode range is not handled here: the
    exception raised by ``chr`` propagates to the caller.
    """
    hex_digits = m.group(1).strip("{}")
    return chr(int(hex_digits, 16))


def unescape(word):
    """Replace ``\\uXXXX`` / ``\\u{X...}`` literals in *word* with characters.

    Text that does not match ``unicode_literal_pattern`` is returned
    unchanged.

    A character outside the BMP may be written as a UTF-16 surrogate pair
    (``\\uD83D\\uDE00``).  Decoding each escape on its own yields two lone
    surrogate code points, which cannot be encoded as UTF-8 when the result
    is written out, so pairs are recombined into one code point afterwards.
    """
    decoded = unicode_literal_pattern.sub(_repl_uncode_literal, word)
    # A UTF-16 round trip merges adjacent high/low surrogates into a single
    # code point; "surrogatepass" keeps any unpaired surrogate as it was
    # instead of raising.
    return decoded.encode("utf-16-le", "surrogatepass").decode(
        "utf-16-le", "surrogatepass")
60+
61+
4762
# Unicode General Categories that elasticsearch_sudachi treats as punctuation
4863
# see: com.worksap.nlp.lucene.sudachi.ja.util.Strings
4964
punctuation_categories = [

0 commit comments

Comments
 (0)