From 32e30e8d1e0471516f3d2c04a29eda74da877818 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 12 Apr 2024 10:19:01 +0800
Subject: [PATCH 1/3] Don't add empty glosses from some zh edition nested gloss lists

---
 src/wiktextract/extractor/zh/gloss.py |  5 +++--
 tests/test_zh_gloss.py                | 26 ++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py
index 893c5848..c50f3939 100644
--- a/src/wiktextract/extractor/zh/gloss.py
+++ b/src/wiktextract/extractor/zh/gloss.py
@@ -49,7 +49,8 @@ def extract_gloss(
         gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
         new_gloss_data = gloss_data.model_copy(deep=True)
         new_gloss_data.raw_tags.extend(raw_tags)
-        new_gloss_data.glosses.append(gloss_text)
+        if len(gloss_text) > 0:
+            new_gloss_data.glosses.append(gloss_text)
         if len(ruby_data) > 0:
             new_gloss_data.ruby = ruby_data
@@ -62,6 +63,6 @@
             else:  # example list
                 extract_examples(wxr, new_gloss_data, child_node)
-        if not has_nested_gloss:
+        if not has_nested_gloss and len(new_gloss_data.glosses) > 0:
             translate_raw_tags(new_gloss_data)
             page_data[-1].senses.append(new_gloss_data)
diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py
index 32a17ac6..46fdc9c7 100644
--- a/tests/test_zh_gloss.py
+++ b/tests/test_zh_gloss.py
@@ -115,7 +115,7 @@ def test_soft_redirect_zh_see(self):
                     "lang_code": "zh",
                     "pos": "soft-redirect",
                     "redirects": ["別個"],
-                    'senses': [{'tags': ['no-gloss']}],
+                    "senses": [{"tags": ["no-gloss"]}],
                     "word": "別个",
                 }
             ],
@@ -135,7 +135,7 @@ def test_soft_redirect_ja_see(self):
                     "lang_code": "ja",
                     "pos": "soft-redirect",
                     "redirects": ["如月", "二月", "更衣", "衣更着"],
-                    'senses': [{'tags': ['no-gloss']}],
+                    "senses": [{"tags": ["no-gloss"]}],
                     "word": "きさらぎ",
                 }
             ],
@@ -245,3 +245,25 @@ def test_two_label_topics(self):
                 }
             ],
         )
+
+    def test_empty_parent_gloss(self):
+        self.wxr.wtp.start_page("bright")
+        self.wxr.wtp.add_page("Template:lb", 10, "({{{2}}})")
+        root = self.wxr.wtp.parse("""# {{lb|en|比喻义}}
+## [[显然]]的,[[显眼]]的
+## {{lb|en|指颜色}} [[鲜亮]]的,[[鲜艳]]的""")
+        page_data = [WordEntry(word="", lang_code="", lang="", pos="")]
+        extract_gloss(self.wxr, page_data, root.children[0], Sense())
+        self.assertEqual(
+            page_data[0].model_dump(exclude_defaults=True)["senses"],
+            [
+                {
+                    "glosses": ["显然的,显眼的"],
+                    "raw_tags": ["比喻义"],
+                },
+                {
+                    "glosses": ["鲜亮的,鲜艳的"],
+                    "raw_tags": ["比喻义", "指颜色"],
+                },
+            ],
+        )
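
Note (reviewer sketch, not part of the patch): a self-contained illustration of
the two guards added in PATCH 1/3. Empty cleaned gloss text is no longer
appended, and a sense is only emitted when it actually carries a gloss, so a
parent list item that holds nothing but a label template only passes its tags
down to the nested senses instead of leaking an empty gloss string into them.
SenseSketch and emit_sense are hypothetical stand-ins, not wiktextract code;
only pydantic is assumed.

    from pydantic import BaseModel


    class SenseSketch(BaseModel):
        # stand-in for the extractor's Sense model
        glosses: list[str] = []
        raw_tags: list[str] = []


    def emit_sense(
        senses: list[SenseSketch],
        sense: SenseSketch,
        gloss_text: str,
        has_nested_gloss: bool,
    ) -> None:
        if len(gloss_text) > 0:  # first guard: skip empty cleaned text
            sense.glosses.append(gloss_text)
        if not has_nested_gloss and len(sense.glosses) > 0:  # second guard
            senses.append(sense)


    senses: list[SenseSketch] = []
    parent = SenseSketch(raw_tags=["比喻义"])
    # the parent item "# {{lb|en|比喻义}}" cleans to "" once the label
    # template has been turned into a raw tag, and it has nested glosses
    emit_sense(senses, parent, "", has_nested_gloss=True)
    child = parent.model_copy(deep=True)  # children copy the parent's data
    child.raw_tags.append("指颜色")
    emit_sense(senses, child, "鲜亮的,鲜艳的", has_nested_gloss=False)
    assert [s.glosses for s in senses] == [["鲜亮的,鲜艳的"]]
    assert senses[0].raw_tags == ["比喻义", "指颜色"]
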
From b4940e075943a1300b732cad4e29b213148ae2fa Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 12 Apr 2024 13:27:23 +0800
Subject: [PATCH 2/3] Fix "ruby" fields validation errors in zh edition

---
 src/wiktextract/extractor/zh/models.py | 20 +++++++++++++++-----
 tests/test_zh_descendant.py            |  2 +-
 tests/test_zh_linkage.py               |  2 +-
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
index 58c14e69..a0ce3cb2 100644
--- a/src/wiktextract/extractor/zh/models.py
+++ b/src/wiktextract/extractor/zh/models.py
@@ -26,7 +26,9 @@ class Example(ChineseBaseModel):
         "",
         description="Source of the sentence, like book title and page number",
     )
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )

 class Sense(ChineseBaseModel):
     glosses: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
     topics: list[str] = []
     categories: list[str] = []
     examples: list[Example] = []
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
@@ -44,7 +48,9 @@ class Form(ChineseBaseModel):
     tags: list[str] = []
     raw_tags: list[str] = []
     source: str = ""
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
     hiragana: str = ""
     roman: str = ""
@@ -87,7 +93,9 @@ class Linkage(ChineseBaseModel):
     language_variant: Literal["", "zh-Hant", "zh-Hans"] = Field(
         "", description="Chinese character variant"
     )
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )
@@ -98,7 +106,9 @@ class Descendant(ChineseBaseModel):
     tags: list[str] = []
     raw_tags: list[str] = []
     descendants: list["Descendant"] = []
-    ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+    ruby: list[tuple[str, ...]] = Field(
+        [], description="Japanese Kanji and furigana"
+    )

 class WordEntry(ChineseBaseModel):
diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py
index 2f53484b..8e4d173c 100644
--- a/tests/test_zh_descendant.py
+++ b/tests/test_zh_descendant.py
@@ -44,7 +44,7 @@ def test_ruby(self):
                 "lang_code": "ja",
                 "lang": "日語",
                 "roman": "nīhao",
-                "ruby": [["你好", "ニイハオ"]],
+                "ruby": [("你好", "ニイハオ")],
                 "word": "你好",
             },
         )
diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py
index 5b4852b1..1fb3efc9 100644
--- a/tests/test_zh_linkage.py
+++ b/tests/test_zh_linkage.py
@@ -65,7 +65,7 @@ def test_ja_r_template(self):
             page_data[0].synonyms[0].model_dump(exclude_defaults=True),
             {
                 "roman": "yanushi",
-                "ruby": [["家", "や"], ["主", "ぬし"]],
+                "ruby": [("家", "や"), ("主", "ぬし")],
                 "sense": "房東",
                 "word": "家主",
             },

From 2756cf7d31c510b02f5cdb439bf1cc32f789aa93 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 12 Apr 2024 13:35:05 +0800
Subject: [PATCH 3/3] Don't add empty list text "#" as gloss
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are many low-quality zh edition pages that only have an empty gloss
list. Example: "出鍋入火"
---
 src/wiktextract/extractor/zh/page.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py
index 6e15860f..9452c373 100644
--- a/src/wiktextract/extractor/zh/page.py
+++ b/src/wiktextract/extractor/zh/page.py
@@ -149,7 +149,7 @@ def process_pos_block(
             parse_section(wxr, page_data, base_data, child)
     else:
         parse_section(wxr, page_data, base_data, child)
-    if len(page_data[-1].senses) == 0:
+    if len(page_data[-1].senses) == 0 and not node.contain_node(NodeKind.LIST):
         # low quality pages don't put gloss in list
         gloss_text = clean_node(
             wxr, page_data[-1], list(node.invert_find_child(LEVEL_KIND_FLAGS))
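
Note (reviewer sketch, not part of the patch series): the validation errors
fixed in PATCH 2/3 come from ruby data being built as (kanji, furigana)
tuples, as the updated test expectations show, while the fields were annotated
as list[list[str]]. The snippet below reproduces that mismatch with plain
pydantic; the strict config is an assumption standing in for whatever
ChineseBaseModel actually sets, and the model names are illustrative only.

    from pydantic import BaseModel, ConfigDict, ValidationError


    class StrictModel(BaseModel):
        # assumed stand-in for ChineseBaseModel's strict validation settings
        model_config = ConfigDict(strict=True)


    class OldSense(StrictModel):
        ruby: list[list[str]] = []  # annotation before this patch


    class NewSense(StrictModel):
        ruby: list[tuple[str, ...]] = []  # annotation after this patch


    ruby_pairs = [("家", "や"), ("主", "ぬし")]  # kanji/furigana pairs

    try:
        OldSense(ruby=ruby_pairs)  # tuples are not lists under strict mode
    except ValidationError as err:
        print(err.error_count(), "validation error(s)")

    print(NewSense(ruby=ruby_pairs).ruby)  # [('家', 'や'), ('主', 'ぬし')]
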