Skip to content

Commit

Permalink
Merge pull request #817 from xxyzz/ja
Browse files Browse the repository at this point in the history
[ja] extract accent data from pronunciation section templates
  • Loading branch information
xxyzz authored Sep 13, 2024
2 parents a8b485d + db41e10 commit dcbb5f5
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 5 deletions.
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/ja/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ def extract_header_nodes(
):
continue
if isinstance(node, HTMLNode) and node.tag == "small":
raw_tags.append(clean_node(wxr, None, node).strip("(): "))
raw_tag = clean_node(wxr, None, node).strip("(): ")
if raw_tag != "又は" and raw_tag not in raw_tags:
# ignore "又は"(or) in "ja-noun" template
raw_tags.append(raw_tag)
else:
form_text = clean_node(wxr, None, node).strip("()【】 ")
add_form_data(
Expand Down
31 changes: 30 additions & 1 deletion src/wiktextract/extractor/ja/sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ def process_sound_template(
clean_node(wxr, cats, template_node)


# Accent type names as they appear in the link text of a "ja-pron" list
# item, mapped to the English tag that is appended to `sound.tags`.
JA_PRON_ACCENTS = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}


def process_ja_pron_template(
wxr: WiktextractContext,
template_node: TemplateNode,
Expand All @@ -112,6 +120,10 @@ def process_ja_pron_template(
sound.roman = clean_node(wxr, None, span_tag)
elif "Jpan" in span_classes:
sound.form = clean_node(wxr, None, span_tag)
for link_node in list_item.find_child(NodeKind.LINK):
link_text = clean_node(wxr, None, link_node)
if link_text in JA_PRON_ACCENTS:
sound.tags.append(JA_PRON_ACCENTS[link_text])
if len(sound.model_dump(exclude_defaults=True)) > 0:
sounds.append(sound)

Expand All @@ -125,6 +137,14 @@ def process_ja_pron_template(
sounds.append(sound)


# One-letter accent codes passed as the first positional argument of the
# "ja-accent-common" template, mapped to the English tag that is appended
# to `sound.tags`.
JA_ACCENT_COMMON_TYPES = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}


def process_ja_accent_common_template(
wxr: WiktextractContext,
template_node: TemplateNode,
Expand All @@ -139,8 +159,17 @@ def process_ja_accent_common_template(
raw_tag = clean_node(wxr, None, link_node)
if raw_tag != "":
sound.raw_tags.append(raw_tag)
break
for span_tag in expanded_node.find_html_recursively("span"):
sound.form = clean_node(wxr, None, span_tag)
span_text = clean_node(wxr, None, span_tag)
if len(span_text) > 0:
sound.form = span_text
break
accent_type = clean_node(
wxr, None, template_node.template_parameters.get(1, "")
)
if accent_type in JA_ACCENT_COMMON_TYPES:
sound.tags.append(JA_ACCENT_COMMON_TYPES[accent_type])
if sound.form != "":
sounds.append(sound)

Expand Down
19 changes: 19 additions & 0 deletions tests/test_ja_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,22 @@ def test_en_verb(self):
)
],
)

def test_ja_noun_small_tag(self):
    # The expanded "ja-noun" headword separates its readings with
    # <small>又は</small> ("or"); the expected forms below carry only the
    # readings, with no raw tag produced from that separator.
    self.wxr.wtp.start_page("金玉")
    self.wxr.wtp.add_page(
        "テンプレート:ja-noun",
        10,
        """<strong class="Jpan headword" lang="ja">[[金#日本語|金]][[玉#日本語|玉]]</strong> (<span class="headword-tr manual-tr tr" dir="ltr">きんぎょく</span> <i><small><small>又は</small></small></i> <span class="headword-tr manual-tr tr" dir="ltr">きんたま</span> <i><small><small>又は</small></small></i> <span class="headword-tr manual-tr tr" dir="ltr">かねだま</span>)""",
    )
    entry = WordEntry(lang="日本語", lang_code="ja", word="金玉")
    tree = self.wxr.wtp.parse("{{ja-noun|きんぎょく|きんたま|かねだま}}")
    extract_header_nodes(self.wxr, entry, tree.children)
    dumped_forms = [
        form.model_dump(exclude_defaults=True) for form in entry.forms
    ]
    expected_forms = [
        {"form": "きんぎょく"},
        {"form": "きんたま"},
        {"form": "かねだま"},
    ]
    self.assertEqual(dumped_forms, expected_forms)
36 changes: 33 additions & 3 deletions tests/test_ja_sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,12 @@ def test_ja_pron(self):
self.assertEqual(
data.sounds[:2],
[
Sound(roman="[nìhóńgó]", form="にほんご", raw_tags=["東京式"]),
Sound(
roman="[nìhóńgó]",
form="にほんご",
raw_tags=["東京式"],
tags=["Heiban"],
),
Sound(ipa="[ɲ̟ihõ̞ŋɡo̞]"),
],
)
Expand Down Expand Up @@ -117,7 +122,7 @@ def test_wiki_link_homophones(self):
data = base_data.model_dump(exclude_defaults=True)
self.assertEqual(data["sounds"], [{"homophones": ["lis/lys"]}])

def test_ja_pron_template(self):
def test_ja_accent_common_template(self):
self.wxr.wtp.start_page("豆乳")
self.wxr.wtp.add_page(
"テンプレート:ja-accent-common",
Expand All @@ -131,7 +136,32 @@ def test_ja_pron_template(self):
extract_sound_section(self.wxr, page_data, base_data, root.children[0])
data = base_data.model_dump(exclude_defaults=True)
self.assertEqual(
data["sounds"], [{"form": "とーにゅー", "raw_tags": ["京阪式"]}]
data["sounds"],
[
{
"form": "とーにゅー",
"tags": ["Heiban"],
"raw_tags": ["京阪式"],
}
],
)

def test_ja_accent_common_template_two_span_tags(self):
    # The expanded template nests one <span> inside another; the expected
    # result is a single sound whose form is the full text "まぜる".
    self.wxr.wtp.start_page("まぜる")
    self.wxr.wtp.add_page(
        "テンプレート:ja-accent-common",
        10,
        """*([[w:京阪式アクセント|京阪式]])&nbsp;<span lang="ja" xml:lang="ja">まぜ<span>る</span></span>""",
    )
    wikitext = """===発音===
{{ja-accent-common|region=京阪|h|まぜ|る}}"""
    tree = self.wxr.wtp.parse(wikitext)
    entry = WordEntry(word="まぜる", lang_code="ja", lang="日本語")
    pages = [entry.model_copy(deep=True)]
    extract_sound_section(self.wxr, pages, entry, tree.children[0])
    dumped = entry.model_dump(exclude_defaults=True)
    expected_sounds = [
        {"form": "まぜる", "tags": ["Heiban"], "raw_tags": ["京阪式"]}
    ]
    self.assertEqual(dumped["sounds"], expected_sounds)

def test_magic_word_in_template_param(self):
Expand Down

0 comments on commit dcbb5f5

Please sign in to comment.