Merge pull request #582 from xxyzz/zh

Fix pydantic errors and remove empty glosses for zh edition
tatuylonen committed on Apr 12, 2024
2 parents: 0ca46a6 + 2756cf7
commit ee658f9
Show file tree

Hide file tree

Showing 6 changed files with 45 additions and 12 deletions.
diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py
@@ -49,7 +49,8 @@ def extract_gloss(
             gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
         new_gloss_data = gloss_data.model_copy(deep=True)
         new_gloss_data.raw_tags.extend(raw_tags)
-        new_gloss_data.glosses.append(gloss_text)
+        if len(gloss_text) > 0:
+            new_gloss_data.glosses.append(gloss_text)
         if len(ruby_data) > 0:
             new_gloss_data.ruby = ruby_data
 
@@ -62,6 +63,6 @@ def extract_gloss(
                 else:  # example list
                     extract_examples(wxr, new_gloss_data, child_node)
 
-        if not has_nested_gloss:
+        if not has_nested_gloss and len(new_gloss_data.glosses) > 0:
             translate_raw_tags(new_gloss_data)
             page_data[-1].senses.append(new_gloss_data)
diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
@@ -26,7 +26,9 @@ class Example(ChineseBaseModel):
         "",
         description="Source of the sentence, like book title and page number",
     )
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
 
 
 class Sense(ChineseBaseModel):
@@ -36,15 +38,19 @@ class Sense(ChineseBaseModel):
     topics: list[str] = []
     categories: list[str] = []
     examples: list[Example] = []
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
 
 
 class Form(ChineseBaseModel):
     form: str = ""
     tags: list[str] = []
     raw_tags: list[str] = []
     source: str = ""
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
     hiragana: str = ""
     roman: str = ""
 
@@ -87,7 +93,9 @@ class Linkage(ChineseBaseModel):
     language_variant: Literal["", "zh-Hant", "zh-Hans"] = Field(
         "", description="Chinese character variant"
     )
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
 
 
 class Descendant(ChineseBaseModel):
@@ -98,7 +106,9 @@ class Descendant(ChineseBaseModel):
     tags: list[str] = []
     raw_tags: list[str] = []
     descendants: list["Descendant"] = []
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
 
 
 class WordEntry(ChineseBaseModel):

diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py
@@ -149,7 +149,7 @@ def process_pos_block(
                 parse_section(wxr, page_data, base_data, child)
         else:
             parse_section(wxr, page_data, base_data, child)
-    if len(page_data[-1].senses) == 0:
+    if len(page_data[-1].senses) == 0 and not node.contain_node(NodeKind.LIST):
         # low quality pages don't put gloss in list
         gloss_text = clean_node(
             wxr, page_data[-1], list(node.invert_find_child(LEVEL_KIND_FLAGS))

diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py
@@ -44,7 +44,7 @@ def test_ruby(self):
                 "lang_code": "ja",
                 "lang": "日語",
                 "roman": "nīhao",
-                "ruby": [["你好", "ニイハオ"]],
+                "ruby": [("你好", "ニイハオ")],
                 "word": "你好",
             },
         )

diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py
@@ -115,7 +115,7 @@ def test_soft_redirect_zh_see(self):
                     "lang_code": "zh",
                     "pos": "soft-redirect",
                     "redirects": ["別個"],
-                    'senses': [{'tags': ['no-gloss']}],
+                    "senses": [{"tags": ["no-gloss"]}],
                     "word": "別个",
                 }
             ],
@@ -135,7 +135,7 @@ def test_soft_redirect_ja_see(self):
                     "lang_code": "ja",
                     "pos": "soft-redirect",
                     "redirects": ["如月", "二月", "更衣", "衣更着"],
-                    'senses': [{'tags': ['no-gloss']}],
+                    "senses": [{"tags": ["no-gloss"]}],
                     "word": "きさらぎ",
                 }
             ],
@@ -245,3 +245,25 @@ def test_two_label_topics(self):
                 }
             ],
         )
+
+    def test_empty_parent_gloss(self):
+        self.wxr.wtp.start_page("bright")
+        self.wxr.wtp.add_page("Template:lb", 10, "({{{2}}})")
+        root = self.wxr.wtp.parse("""# {{lb|en|比喻义}}
+## [[显然]]的，[[显眼]]的
+## {{lb|en|指颜色}} [[鲜亮]]的，[[鲜艳]]的""")
+        page_data = [WordEntry(word="", lang_code="", lang="", pos="")]
+        extract_gloss(self.wxr, page_data, root.children[0], Sense())
+        self.assertEqual(
+            page_data[0].model_dump(exclude_defaults=True)["senses"],
+            [
+                {
+                    "glosses": ["显然的，显眼的"],
+                    "raw_tags": ["比喻义"],
+                },
+                {
+                    "glosses": ["鲜亮的，鲜艳的"],
+                    "raw_tags": ["比喻义", "指颜色"],
+                },
+            ],
+        )
diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py
@@ -65,7 +65,7 @@ def test_ja_r_template(self):
             page_data[0].synonyms[0].model_dump(exclude_defaults=True),
             {
                 "roman": "yanushi",
-                "ruby": [["家", "や"], ["主", "ぬし"]],
+                "ruby": [("家", "や"), ("主", "ぬし")],
                 "sense": "房東",
                 "word": "家主",
             },