From 32e30e8d1e0471516f3d2c04a29eda74da877818 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 12 Apr 2024 10:19:01 +0800
Subject: [PATCH 1/3] Don't add empty glosses from some zh edition nested gloss lists

---
 src/wiktextract/extractor/zh/gloss.py |  5 +++--
 tests/test_zh_gloss.py                | 26 ++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py
index 893c5848..c50f3939 100644
--- a/src/wiktextract/extractor/zh/gloss.py
+++ b/src/wiktextract/extractor/zh/gloss.py
@@ -49,7 +49,8 @@ def extract_gloss(
         gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
         new_gloss_data = gloss_data.model_copy(deep=True)
         new_gloss_data.raw_tags.extend(raw_tags)
-        new_gloss_data.glosses.append(gloss_text)
+        if len(gloss_text) > 0:
+            new_gloss_data.glosses.append(gloss_text)
         if len(ruby_data) > 0:
             new_gloss_data.ruby = ruby_data
@@ -62,6 +63,6 @@
             else:  # example list
                 extract_examples(wxr, new_gloss_data, child_node)
-        if not has_nested_gloss:
+        if not has_nested_gloss and len(new_gloss_data.glosses) > 0:
             translate_raw_tags(new_gloss_data)
             page_data[-1].senses.append(new_gloss_data)
diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py
index 32a17ac6..46fdc9c7 100644
--- a/tests/test_zh_gloss.py
+++ b/tests/test_zh_gloss.py
@@ -115,7 +115,7 @@ def test_soft_redirect_zh_see(self):
                     "lang_code": "zh",
                     "pos": "soft-redirect",
                     "redirects": ["別個"],
-                    'senses': [{'tags': ['no-gloss']}],
+                    "senses": [{"tags": ["no-gloss"]}],
                     "word": "別个",
                 }
             ],
@@ -135,7 +135,7 @@ def test_soft_redirect_ja_see(self):
                     "lang_code": "ja",
                     "pos": "soft-redirect",
                     "redirects": ["如月", "二月", "更衣", "衣更着"],
-                    'senses': [{'tags': ['no-gloss']}],
+                    "senses": [{"tags": ["no-gloss"]}],
                     "word": "きさらぎ",
                 }
             ],
@@ -245,3 +245,25 @@ def test_two_label_topics(self):
                 }
             ],
         )
+
+    def test_empty_parent_gloss(self):
+        self.wxr.wtp.start_page("bright")
+        self.wxr.wtp.add_page("Template:lb", 10, "({{{2}}})")
+        root = self.wxr.wtp.parse("""# {{lb|en|比喻义}}
+## [[显然]]的,[[显眼]]的
+## {{lb|en|指颜色}} [[鲜亮]]的,[[鲜艳]]的""")
+        page_data = [WordEntry(word="", lang_code="", lang="", pos="")]
+        extract_gloss(self.wxr, page_data, root.children[0], Sense())
+        self.assertEqual(
+            page_data[0].model_dump(exclude_defaults=True)["senses"],
+            [
+                {
+                    "glosses": ["显然的,显眼的"],
+                    "raw_tags": ["比喻义"],
+                },
+                {
+                    "glosses": ["鲜亮的,鲜艳的"],
+                    "raw_tags": ["比喻义", "指颜色"],
+                },
+            ],
+        )
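
Note (reviewer sketch, not part of the patch): a self-contained illustration of
the two guards added in PATCH 1/3. Empty cleaned gloss text is no longer
appended, and a sense is only emitted when it actually carries a gloss, so a
parent list item that holds nothing but a label template only passes its tags
down to the nested senses instead of leaking an empty gloss string into them.
SenseSketch and emit_sense are hypothetical stand-ins, not wiktextract code;
only pydantic is assumed.

    from pydantic import BaseModel


    class SenseSketch(BaseModel):
        # stand-in for the extractor's Sense model
        glosses: list[str] = []
        raw_tags: list[str] = []


    def emit_sense(
        senses: list[SenseSketch],
        sense: SenseSketch,
        gloss_text: str,
        has_nested_gloss: bool,
    ) -> None:
        if len(gloss_text) > 0:  # first guard: skip empty cleaned text
            sense.glosses.append(gloss_text)
        if not has_nested_gloss and len(sense.glosses) > 0:  # second guard
            senses.append(sense)


    senses: list[SenseSketch] = []
    parent = SenseSketch(raw_tags=["比喻义"])
    # the parent item "# {{lb|en|比喻义}}" cleans to "" once the label
    # template has been turned into a raw tag, and it has nested glosses
    emit_sense(senses, parent, "", has_nested_gloss=True)
    child = parent.model_copy(deep=True)  # children copy the parent's data
    child.raw_tags.append("指颜色")
    emit_sense(senses, child, "鲜亮的,鲜艳的", has_nested_gloss=False)
    assert [s.glosses for s in senses] == [["鲜亮的,鲜艳的"]]
    assert senses[0].raw_tags == ["比喻义", "指颜色"]
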
From b4940e075943a1300b732cad4e29b213148ae2fa Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 12 Apr 2024 13:27:23 +0800
Subject: [PATCH 2/3] Fix "ruby" fields validation errors in zh edition

---
 src/wiktextract/extractor/zh/models.py | 20 +++++++++++++++-----
 tests/test_zh_descendant.py            |  2 +-
 tests/test_zh_linkage.py               |  2 +-
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
index 58c14e69..a0ce3cb2 100644
--- a/src/wiktextract/extractor/zh/models.py
+++ b/src/wiktextract/extractor/zh/models.py
@@ -26,7 +26,9 @@ class Example(ChineseBaseModel):
         "",
         description="Source of the sentence, like book title and page number",
     )
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )

 class Sense(ChineseBaseModel):
     glosses: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
     topics: list[str] = []
     categories: list[str] = []
     examples: list[Example] = []
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
@@ -44,7 +48,9 @@ class Form(ChineseBaseModel):
     tags: list[str] = []
     raw_tags: list[str] = []
     source: str = ""
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
     hiragana: str = ""
     roman: str = ""
@@ -87,7 +93,9 @@ class Linkage(ChineseBaseModel):
     language_variant: Literal["", "zh-Hant", "zh-Hans"] = Field(
         "", description="Chinese character variant"
     )
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
@@ -98,7 +106,9 @@ class Descendant(ChineseBaseModel):
     tags: list[str] = []
     raw_tags: list[str] = []
     descendants: list["Descendant"] = []
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )

 class WordEntry(ChineseBaseModel):
diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py
index 2f53484b..8e4d173c 100644
--- a/tests/test_zh_descendant.py
+++ b/tests/test_zh_descendant.py
@@ -44,7 +44,7 @@ def test_ruby(self):
                 "lang_code": "ja",
                 "lang": "日語",
                 "roman": "nīhao",
-                "ruby": [["你好", "ニイハオ"]],
+                "ruby": [("你好", "ニイハオ")],
                 "word": "你好",
             },
         )
diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py
index 5b4852b1..1fb3efc9 100644
--- a/tests/test_zh_linkage.py
+++ b/tests/test_zh_linkage.py
@@ -65,7 +65,7 @@ def test_ja_r_template(self):
             page_data[0].synonyms[0].model_dump(exclude_defaults=True),
             {
                 "roman": "yanushi",
-                "ruby": [["家", "や"], ["主", "ぬし"]],
+                "ruby": [("家", "や"), ("主", "ぬし")],
                 "sense": "房東",
                 "word": "家主",
             },

From 2756cf7d31c510b02f5cdb439bf1cc32f789aa93 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 12 Apr 2024 13:35:05 +0800
Subject: [PATCH 3/3] Don't add empty list text "#" as gloss
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are many low-quality zh edition pages that only have an empty gloss
list. Example: "出鍋入火"
---
 src/wiktextract/extractor/zh/page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py
index 6e15860f..9452c373 100644
--- a/src/wiktextract/extractor/zh/page.py
+++ b/src/wiktextract/extractor/zh/page.py
@@ -149,7 +149,7 @@ def process_pos_block(
             parse_section(wxr, page_data, base_data, child)
     else:
         parse_section(wxr, page_data, base_data, child)
-    if len(page_data[-1].senses) == 0:
+    if len(page_data[-1].senses) == 0 and not node.contain_node(NodeKind.LIST):
         # low quality pages don't put gloss in list
         gloss_text = clean_node(
             wxr, page_data[-1], list(node.invert_find_child(LEVEL_KIND_FLAGS))
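
Note (reviewer sketch, not part of the patch series): the validation errors
fixed in PATCH 2/3 come from ruby data being built as (kanji, furigana)
tuples, as the updated test expectations show, while the fields were annotated
as list[list[str]]. The snippet below reproduces that mismatch with plain
pydantic; the strict config is an assumption standing in for whatever
ChineseBaseModel actually sets, and the model names are illustrative only.

    from pydantic import BaseModel, ConfigDict, ValidationError


    class StrictModel(BaseModel):
        # assumed stand-in for ChineseBaseModel's strict validation settings
        model_config = ConfigDict(strict=True)


    class OldSense(StrictModel):
        ruby: list[list[str]] = []  # annotation before this patch


    class NewSense(StrictModel):
        ruby: list[tuple[str, ...]] = []  # annotation after this patch


    ruby_pairs = [("家", "や"), ("主", "ぬし")]  # kanji/furigana pairs

    try:
        OldSense(ruby=ruby_pairs)  # tuples are not lists under strict mode
    except ValidationError as err:
        print(err.error_count(), "validation error(s)")

    print(NewSense(ruby=ruby_pairs).ruby)  # [('家', 'や'), ('主', 'ぬし')]
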