Skip to content

Commit

Permalink
Merge pull request #629 from xxyzz/fr
Browse files (browse the repository at this point in the history)
[fr] remove "désuet" tag template from gloss text
  • Loading branch information
xxyzz committed May 14, 2024
2 parents 9218374 + 1d1c3c2 commit 8203a16
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 15 deletions.
52 changes: 40 additions & 12 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
from collections import defaultdict
from typing import Optional, Union

@@ -75,14 +74,11 @@ def extract_gloss(
and gloss_only_nodes[index].template_name == "note"
):
note_index = index
find_alt_of_form(
gloss_text = find_alt_of_form(
wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
)
if "form-of" in page_data[-1].tags:
find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes[:note_index])
if not (gloss_text.startswith("(") and gloss_text.endswith(")")):
gloss_text = gloss_text.strip(" ()")
if gloss_text != "":
gloss_data.glosses.append(gloss_text)
gloss_data.note = clean_node(
@@ -177,24 +173,45 @@ def find_alt_of_form(
gloss_nodes: list[Union[str, WikiNode]],
pos_type: str,
gloss_data: Sense,
):
) -> str:
"""
Return gloss text, remove tag template expanded from "variante *" templates.
"""

alt_of = ""
for template_node in filter(
lambda n: isinstance(n, TemplateNode), gloss_nodes
):
filtered_gloss_nodes = []
for gloss_node in gloss_nodes:
# https://fr.wiktionary.org/wiki/Modèle:variante_de
# https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
if re.fullmatch(r"variante \w*\s*de", template_node.template_name):
if isinstance(
gloss_node, TemplateNode
) and gloss_node.template_name.startswith("variante "):
alt_of = clean_node(
wxr, None, template_node.template_parameters.get("dif", "")
wxr, None, gloss_node.template_parameters.get("dif", "")
)
if len(alt_of) == 0:
alt_of = clean_node(
wxr, None, template_node.template_parameters.get(1, "")
wxr, None, gloss_node.template_parameters.get(1, "")
)
if len(alt_of) > 0:
gloss_data.alt_of.append(AltForm(word=alt_of))
gloss_data.tags.append("alt-of")
expanded_template = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(gloss_node),
pre_expand=True,
additional_expand={gloss_node.template_name},
)
for node in expanded_template.children:
if (
isinstance(node, TemplateNode)
and node.template_name == "désuet"
):
raw_tag = clean_node(wxr, gloss_data, node).strip(" ()")
gloss_data.raw_tags.append(raw_tag)
else:
filtered_gloss_nodes.append(node)
else:
filtered_gloss_nodes.append(gloss_node)

if alt_of == "" and pos_type == "typographic variant":
for gloss_node in filter(
@@ -212,6 +229,17 @@ def find_alt_of_form(
if len(alt_of) > 0:
gloss_data.alt_of.append(AltForm(word=alt_of))

gloss_text = clean_node(wxr, gloss_data, filtered_gloss_nodes)
brackets = 0
for char in gloss_text:
if char == "(":
brackets += 1
elif char == ")":
brackets -= 1
if brackets != 0:
gloss_text = gloss_text.strip(" ()")
return gloss_text


def find_form_of_word(
wxr: WiktextractContext,
Expand Down
17 changes: 14 additions & 3 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
@@ -448,7 +448,17 @@ def test_variante_kyujitai_de(self):
self.wxr.wtp.add_page(
"Modèle:variante kyujitai de",
10,
"(Désuet) Orthographe en kyūjitai de 万歳 (« vive ! »)",
"{{désuet|ja}} ''Orthographe en [[kyūjitai]] de'' {{lien|{{{1}}}|ja|{{{2|}}}|dif={{{dif|}}}|tr={{{tr|}}}|sens={{{sens|}}}}}",
)
self.wxr.wtp.add_page(
"Modèle:désuet",
10,
'<span class="emploi"><span id="désuet"></span>''(<span class="texte">[[Annexe:Glossaire grammatical#D|Désuet]]</span>)''</span>[[Catégorie:Termes désuets en japonais]]',
)
self.wxr.wtp.add_page(
"Modèle:lien",
10,
'<bdi lang="ja" xml:lang="ja" class="lang-ja">[[万歳#ja|万歳]]</bdi> («&nbsp;[[vive#fr-interj|vive]] !&nbsp;»)',
)
self.assertEqual(
parse_page(
@@ -468,9 +478,10 @@ def test_variante_kyujitai_de(self):
"senses": [
{
"alt_of": [{"word": "万歳"}],
"tags": ["alt-of"],
"categories": ["Termes désuets en japonais"],
"tags": ["alt-of", "obsolete"],
"glosses": [
"(Désuet) Orthographe en kyūjitai de 万歳 (« vive ! »)"
"Orthographe en kyūjitai de 万歳 (« vive ! »)"
],
}
],
Expand Down

0 comments on commit 8203a16

Please sign in to comment.