Skip to content

Commit

Permalink
Merge pull request #582 from xxyzz/zh
Browse files: browse the repository at this point in the history
Fix pydantic errors and remove empty glosses for zh edition
  • Loading branch information
xxyzz committed Apr 12, 2024
2 parents 0ca46a6 + 2756cf7 commit ee658f9
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 12 deletions.
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/zh/gloss.py
Expand Up @@ -49,7 +49,8 @@ def extract_gloss(
gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
new_gloss_data = gloss_data.model_copy(deep=True)
new_gloss_data.raw_tags.extend(raw_tags)
new_gloss_data.glosses.append(gloss_text)
if len(gloss_text) > 0:
new_gloss_data.glosses.append(gloss_text)
if len(ruby_data) > 0:
new_gloss_data.ruby = ruby_data

Expand All @@ -62,6 +63,6 @@ def extract_gloss(
else: # example list
extract_examples(wxr, new_gloss_data, child_node)

if not has_nested_gloss:
if not has_nested_gloss and len(new_gloss_data.glosses) > 0:
translate_raw_tags(new_gloss_data)
page_data[-1].senses.append(new_gloss_data)
20 changes: 15 additions & 5 deletions src/wiktextract/extractor/zh/models.py
Expand Up @@ -26,7 +26,9 @@ class Example(ChineseBaseModel):
"",
description="Source of the sentence, like book title and page number",
)
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
ruby: list[tuple[str, ...]] = Field(
[], description="Japanese Kanji and furigana"
)


class Sense(ChineseBaseModel):
Expand All @@ -36,15 +38,19 @@ class Sense(ChineseBaseModel):
topics: list[str] = []
categories: list[str] = []
examples: list[Example] = []
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
ruby: list[tuple[str, ...]] = Field(
[], description="Japanese Kanji and furigana"
)


class Form(ChineseBaseModel):
form: str = ""
tags: list[str] = []
raw_tags: list[str] = []
source: str = ""
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
ruby: list[tuple[str, ...]] = Field(
[], description="Japanese Kanji and furigana"
)
hiragana: str = ""
roman: str = ""

Expand Down Expand Up @@ -87,7 +93,9 @@ class Linkage(ChineseBaseModel):
language_variant: Literal["", "zh-Hant", "zh-Hans"] = Field(
"", description="Chinese character variant"
)
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
ruby: list[tuple[str, ...]] = Field(
[], description="Japanese Kanji and furigana"
)


class Descendant(ChineseBaseModel):
Expand All @@ -98,7 +106,9 @@ class Descendant(ChineseBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
descendants: list["Descendant"] = []
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
ruby: list[tuple[str, ...]] = Field(
[], description="Japanese Kanji and furigana"
)


class WordEntry(ChineseBaseModel):
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/page.py
Expand Up @@ -149,7 +149,7 @@ def process_pos_block(
parse_section(wxr, page_data, base_data, child)
else:
parse_section(wxr, page_data, base_data, child)
if len(page_data[-1].senses) == 0:
if len(page_data[-1].senses) == 0 and not node.contain_node(NodeKind.LIST):
# low quality pages don't put gloss in list
gloss_text = clean_node(
wxr, page_data[-1], list(node.invert_find_child(LEVEL_KIND_FLAGS))
Expand Down
2 changes: 1 addition & 1 deletion tests/test_zh_descendant.py
Expand Up @@ -44,7 +44,7 @@ def test_ruby(self):
"lang_code": "ja",
"lang": "日語",
"roman": "nīhao",
"ruby": [["你好", "ニイハオ"]],
"ruby": [("你好", "ニイハオ")],
"word": "你好",
},
)
Expand Down
26 changes: 24 additions & 2 deletions tests/test_zh_gloss.py
Expand Up @@ -115,7 +115,7 @@ def test_soft_redirect_zh_see(self):
"lang_code": "zh",
"pos": "soft-redirect",
"redirects": ["別個"],
'senses': [{'tags': ['no-gloss']}],
"senses": [{"tags": ["no-gloss"]}],
"word": "別个",
}
],
Expand All @@ -135,7 +135,7 @@ def test_soft_redirect_ja_see(self):
"lang_code": "ja",
"pos": "soft-redirect",
"redirects": ["如月", "二月", "更衣", "衣更着"],
'senses': [{'tags': ['no-gloss']}],
"senses": [{"tags": ["no-gloss"]}],
"word": "きさらぎ",
}
],
Expand Down Expand Up @@ -245,3 +245,25 @@ def test_two_label_topics(self):
}
],
)

def test_empty_parent_gloss(self):
    """A parent list item holding only a label must not yield its own sense.

    The top-level item contains just an ``lb`` template, so its gloss
    text is empty; only the two nested items are expected as senses, and
    each is expected to carry the parent's raw tag.
    """
    self.wxr.wtp.start_page("bright")
    # Stub the label template so it expands to its second argument in
    # parentheses.
    self.wxr.wtp.add_page("Template:lb", 10, "({{{2}}})")
    # List markers must stay at the start of each line, so the literal's
    # continuation lines are deliberately not indented.
    wikitext = """# {{lb|en|比喻义}}
## [[显然]]的,[[显眼]]的
## {{lb|en|指颜色}} [[鲜亮]]的,[[鲜艳]]的"""
    tree = self.wxr.wtp.parse(wikitext)
    entries = [WordEntry(word="", lang_code="", lang="", pos="")]
    extract_gloss(self.wxr, entries, tree.children[0], Sense())
    dumped_senses = entries[0].model_dump(exclude_defaults=True)["senses"]
    expected_senses = [
        {
            "glosses": ["显然的,显眼的"],
            "raw_tags": ["比喻义"],
        },
        {
            "glosses": ["鲜亮的,鲜艳的"],
            "raw_tags": ["比喻义", "指颜色"],
        },
    ]
    self.assertEqual(dumped_senses, expected_senses)
2 changes: 1 addition & 1 deletion tests/test_zh_linkage.py
Expand Up @@ -65,7 +65,7 @@ def test_ja_r_template(self):
page_data[0].synonyms[0].model_dump(exclude_defaults=True),
{
"roman": "yanushi",
"ruby": [["家", "や"], ["主", "ぬし"]],
"ruby": [("家", "や"), ("主", "ぬし")],
"sense": "房東",
"word": "家主",
},
Expand Down

0 comments on commit ee658f9

Please sign in to comment.