Skip to content

Commit ed2815f

Browse files
Merge pull request #162 from WorksApplications/feature/update-ssyn2es
update ssyn2es
2 parents 07753ff + 52ddc64 commit ed2815f

File tree

2 files changed

+101
-15
lines changed

2 files changed

+101
-15
lines changed

docs/ssyn2es.py

Lines changed: 98 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,34 +2,121 @@
22

33
import argparse
44
import fileinput
5+
import re
6+
import sys
7+
import unicodedata
58

6-
def main():
7-
parser = argparse.ArgumentParser(prog="ssyn2es.py", description="convert Sudachi synonyms to ES")
8-
parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
9-
parser.add_argument('-p', '--output-predicate', action='store_true', help='output predicates')
9+
10+
def parse_args():
11+
parser = argparse.ArgumentParser(
12+
prog="ssyn2es.py", description="convert Sudachi synonyms to Solr format")
13+
parser.add_argument('files', metavar='FILE', nargs='*',
14+
help='files to read, if empty, stdin is used')
15+
16+
parser.add_argument("--discard-punctuation", action='store_true',
17+
help='if set, skip words that consist of puctuation chars')
18+
parser.add_argument('-p', '--output-predicate', action='store_true',
19+
help='if set, output predicates')
1020
args = parser.parse_args()
21+
return args
22+
1123

24+
def load_synonyms(files, output_predicate, discard_punctuation):
1225
synonyms = {}
13-
with fileinput.input(files = args.files) as input:
14-
for line in input:
26+
with fileinput.input(files=files) as input:
27+
for i, line in enumerate(input):
1528
line = line.strip()
1629
if line == "":
1730
continue
31+
1832
entry = line.split(",")[0:9]
19-
if entry[2] == "2" or (not args.output_predicate and entry[1] == "2"):
33+
headword = escape_comma(unescape_unicode_literal(entry[8]))
34+
35+
is_deleted = (entry[2] == "2")
36+
is_predicate = (entry[1] == "2")
37+
if is_deleted or (is_predicate and not output_predicate):
38+
continue
39+
if (is_punctuation_word(headword) and discard_punctuation):
40+
print(f"skip punctuation entry {entry[8]} at line {i}",
41+
file=sys.stderr)
2042
continue
43+
2144
group = synonyms.setdefault(entry[0], [[], []])
22-
group[1 if entry[2] == "1" else 0].append(entry[8])
45+
group[1 if entry[2] == "1" else 0].append(headword)
46+
47+
return synonyms
48+
49+
50+
unicode_literal_pattern = re.compile(
51+
r"""\\u([0-9a-fA-F]{4}|\{[0-9a-fA-F]+\})""")
52+
53+
54+
def _repl_uncode_literal(m):
55+
return chr(int(m.group(1).strip("{}"), 16))
2356

57+
58+
def unescape_unicode_literal(word):
59+
return unicode_literal_pattern.sub(_repl_uncode_literal, word)
60+
61+
62+
def escape_comma(word):
63+
return word.replace(",", "\,")
64+
65+
66+
# Unicode General Category list, that is used for punctuation in elasticsearch_sudachi
67+
# see: com.worksap.nlp.lucene.sudachi.ja.util.Strings
68+
punctuation_categories = [
69+
"Zs", # Character.SPACE_SEPARATOR
70+
"Zl", # Character.LINE_SEPARATOR
71+
"Zp", # Character.PARAGRAPH_SEPARATOR
72+
"Cc", # Character.CONTROL
73+
"Cf", # Character.FORMAT
74+
"Pd", # Character.DASH_PUNCTUATION
75+
"Ps", # Character.START_PUNCTUATION
76+
"Pe", # Character.END_PUNCTUATION
77+
"Pc", # Character.CONNECTOR_PUNCTUATION
78+
"Po", # Character.OTHER_PUNCTUATION
79+
"Sm", # Character.MATH_SYMBOL
80+
"Sc", # Character.CURRENCY_SYMBOL
81+
"Sk", # Character.MODIFIER_SYMBOL
82+
"So", # Character.OTHER_SYMBOL
83+
"Pi", # Character.INITIAL_QUOTE_PUNCTUATION
84+
"Pf", # Character.FINAL_QUOTE_PUNCTUATION
85+
]
86+
87+
88+
def is_punctuation_word(word: str):
89+
# return True if all characters are in punctuation categories.
90+
for c in word:
91+
category = unicodedata.category(c)
92+
if category not in punctuation_categories:
93+
return False
94+
return True
95+
96+
97+
def dump_synonyms(synonyms, file=None):
2498
for groupid in sorted(synonyms):
2599
group = synonyms[groupid]
26100
if not group[1]:
27101
if len(group[0]) > 1:
28-
print(",".join(group[0]))
102+
print(",".join(group[0]), file=file)
29103
else:
30104
if len(group[0]) > 0 and len(group[1]) > 0:
31-
print(",".join(group[0]) + "=>" + ",".join(group[0] + group[1]))
105+
print(",".join(group[0]) + "=>" +
106+
",".join(group[0] + group[1]), file=file)
107+
return
108+
109+
110+
def main():
111+
args = parse_args()
112+
113+
synonyms = load_synonyms(
114+
args.files,
115+
args.output_predicate,
116+
args.discard_punctuation,
117+
)
118+
dump_synonyms(synonyms)
32119

33120

34121
if __name__ == "__main__":
35-
main()
122+
main()

docs/synonym.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,16 @@ You can partially make use of the Sudachi synonym resource's detailed informatio
3030

3131
### Punctuation Symbols
3232

33-
You may need to remove certain synonym words such as `` and `` when you use the analyzer with setting `"discard_punctuation": true` (Otherwise you will get an error, e.g., `"term: € was completely eliminated by analyzer"`). Alternatively, you can set `"lenient": true` for the synonym filter to ignore the exceptions.
33+
You may need to remove certain synonym words such as `` and `` when you use the analyzer with setting `"discard_punctuation": true` (Otherwise you will get an error, e.g., `"term: € was completely eliminated by analyzer"`). If you are using [ssyn2es.py](./ssyn2es.py), use `--discard-punctuation` option to skip those words. Alternatively, you can set `"lenient": true` for the synonym filter to ignore the exceptions.
3434

35-
These symbols are defined as punctuations; See [SudachiTokenizer.java](https://github.com/WorksApplications/elasticsearch-sudachi/blob/develop/src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiTokenizer.java#L140) for the detail.
35+
These symbols are defined as punctuations; See [Strings.java](https://github.com/WorksApplications/elasticsearch-sudachi/blob/develop/src/main/java/com/worksap/nlp/lucene/sudachi/ja/util/Strings.java) for the detail.
3636

3737

3838
## Synonym Filter
3939

4040
You can use the converted Solr format file with Elasticsearch's default synonym filters, [Synonym token filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html) or [Synonym graph filter](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html).
4141

42+
As the `sudachi_split` filter produces a token graph, you *cannot* use it with a synonym filter.
4243

4344
### Example: Set up
4445

@@ -73,8 +74,6 @@ You can use the converted Solr format file with Elasticsearch's default synonym
7374

7475
Here we assume that the converted synonym file is placed as `$ES_PATH_CONF/sudachi/synonym.txt`.
7576

76-
If you would like to use `sudachi_split` filter, set it *after* the synonym filter (otherwise you will get an error, e.g., `term: 不明確 analyzed to a token (不) with position increment != 1 (got: 0)`).
77-
7877

7978
### Example: Analysis
8079

0 commit comments

Comments
 (0)