Skip to content

Commit

Permalink
Merge pull request #661 from xxyzz/fr
Browse files Browse the repository at this point in the history
[fr] translate some tags and topics in gloss and sound fields
  • Loading branch information
xxyzz committed Jun 4, 2024
2 parents e9e0a99 + 8638ff5 commit 52de4fa
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 25 deletions.
15 changes: 10 additions & 5 deletions src/wiktextract/extractor/fr/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry
from .tags import translate_raw_tags


def extract_pronunciation(
Expand Down Expand Up @@ -85,12 +86,12 @@ def process_pron_list_item(
current_raw_tags.append(clean_node(wxr, None, list_item_child))
elif list_item_child.kind == NodeKind.LINK:
for span_tag in list_item_child.find_html_recursively("span"):
sounds_list.append(
Sound(
ipa=clean_node(wxr, None, span_tag),
raw_tags=current_raw_tags[:],
)
sound = Sound(
ipa=clean_node(wxr, None, span_tag),
raw_tags=current_raw_tags[:],
)
translate_raw_tags(sound)
sounds_list.append(sound)
elif isinstance(list_item_child, str):
if ":" in list_item_child:
after_colon = True
Expand All @@ -100,6 +101,7 @@ def process_pron_list_item(
if len(pron_text) > 0:
sound = Sound(raw_tags=current_raw_tags[:])
setattr(sound, pron_key, pron_text)
translate_raw_tags(sound)
sounds_list.append(sound)

for nest_list_item in list_item_node.find_child_recursively(
Expand Down Expand Up @@ -159,6 +161,7 @@ def process_pron_template(
prons.add(pron_text)
sound = Sound(raw_tags=raw_tags[:])
setattr(sound, use_key, pron_text)
translate_raw_tags(sound)
sounds_list.append(sound)
return sounds_list

Expand Down Expand Up @@ -193,6 +196,7 @@ def process_ecouter_template(
sound.ipa = ipa
if len(audio_file) > 0:
set_sound_file_url_fields(wxr, audio_file, sound)
translate_raw_tags(sound)
return sound


Expand Down Expand Up @@ -227,6 +231,7 @@ def process_pron_rimes_template(
sound.rhymes = span_text
if len(raw_tags) > 0:
sound.raw_tags = raw_tags[:]
translate_raw_tags(sound)
return sound


Expand Down
53 changes: 51 additions & 2 deletions src/wiktextract/extractor/fr/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@
SENSE_TAGS: dict[str, str] = {
# https://fr.wiktionary.org/wiki/Modèle:figuré
# https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_relation_entre_les_définitions
# Catégorie:Modèles de genre textuel
# Catégorie:Modèles de registre
"sens figuré": "figuratively",
"enclise": "enclitic",
"idiotisme": "idiomatic",
Expand All @@ -161,10 +163,21 @@
"argot": "slang",
"rare": "rare",
"plus rare": "rare",
"familier": "colloquial",
"par extension": "broadly",
"en particulier": "especially",
"informel": "informal",
"littéraire": "literary", # Modèle:littéraire
"poétique": "poetic", # Modèle:poétique
# "didactique": "", # Modèle:didactique
"soutenu": "formal", # Modèle:soutenu
"informel": "informal", # Modèle:informel
"familier": "familiar", # Modèle:familier
"très familier": "very-familiar", # Modèle:très familier
# "populaire": "", # Modèle:populaire
"vulgaire": "vulgar", # Modèle:vulgaire
"langage enfantin": "childish", # Modèle:enfantin
# Catégorie:Modèles de thématique
"anglicisme informatique": "Anglicism",
"proverbe": "proverb",
}

# https://en.wikipedia.org/wiki/Voice_(grammar)
Expand Down Expand Up @@ -197,6 +210,41 @@
"analyse": "analytic",
}

# Template:cmn-pron
# https://fr.wiktionary.org/wiki/自由
#
# Maps the lower-cased French labels found in Chinese pronunciation sections
# (romanization systems, varieties, places) to English tag(s).
# A value is either a single tag or a list of tags when one French label
# bundles several pieces of information (e.g. a variety plus a place name).
# NOTE(review): "mindong" maps to the specific "Eastern-Min" while "minnan"
# maps to the broader "Min" — confirm this asymmetry is intended.
ZH_PRON_TAGS: dict[str, Union[str, list[str]]] = {
    "pinyin": "Pinyin",
    "efeo": "EFEO",  # https://en.wikipedia.org/wiki/EFEO_Chinese_transcription
    "wade-giles": "Wade-Giles",
    "yale": "Yale",
    "zhuyin": "bopomofo",
    "mandarin": "Mandarin",
    "cantonais": "Cantonese",
    "cantonais (yue)": "Cantonese",
    "jyutping": "Jyutping",
    "hakka": "Hakka",
    "pha̍k-fa-sṳ": "Phak-fa-su",
    "meixian, guangdong": ["Meixian", "Guangdong"],
    "jin": "Jin",
    "mindong": "Eastern-Min",
    # https://en.wikipedia.org/wiki/Bàng-uâ-cê
    "bàng-uâ-cê (fuzhou)": ["Bang-ua-ce", "Fuzhou"],
    "minnan": "Min",
    "pe̍h-ōe-jī (hokkien : fujian, taïwan)": [
        "Peh-oe-ji",
        "Hokkien",
        "Fujian",
        "Taiwan",
    ],
    "chaozhou, peng'im": ["Chaozhou", "Peng'im"],
    "wu": "Wu",
    "shanghai": "Shanghai",
    "chinois médiéval": "Medieval-Chinese",
    "chinois archaïque": "Old-Chinese",
    "baxter-sagart": "Baxter-Sagart",
    "zhengzhang": "Zhengzhang",
}

GRAMMATICAL_TAGS: dict[str, Union[str, list[str]]] = {
**GENDER_TAGS,
**NUMBER_TAGS,
Expand All @@ -214,6 +262,7 @@
**SENSE_TAGS,
**VOICE_TAGS,
**LEXIQUE_TAGS,
**ZH_PRON_TAGS,
}


Expand Down
21 changes: 21 additions & 0 deletions src/wiktextract/extractor/fr/topics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# https://fr.wiktionary.org/wiki/Annexe:Glossaire_grammatical
# https://fr.wiktionary.org/wiki/Module:lexique/data
# Catégorie:Modèles de thématique

TOPIC_TAGS: dict[str, str] = {
"botanique": "botany",
Expand Down Expand Up @@ -62,6 +63,7 @@
"chimie": "chemistry",
"chirurgie": "surgery",
"christianisme": "Christianity",
"cinéma": "film",
"cirque": "circus",
"climatologie": "climatology",
"coiffure": "hairdressing",
Expand Down Expand Up @@ -277,4 +279,23 @@
"boulangerie": "cooking",
"bourse": "finance",
"brasserie": "beverages manufacturing",
# Catégorie:Modèles de thématique
"phycologie": "phycology",
"bière": "beer",
"chronologie": "chronology",
"dialectologie": "dialectology",
"média": "media",
"état": "state",
"mobilier": "furniture",
"science": "science",
"boisson": "beverages",
"vêtement": "clothing",
"électronique": "electricity",
"carte à jouer": "card-games",
"lgbt": "LGBT",
"jeu vidéo, minecraft": "video-games",
"réseaux": "networking",
"science-fiction, univers de star wars": "science-fiction",
"textile": "textiles",
"transport": "transport",
}
4 changes: 3 additions & 1 deletion tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,9 @@ def test_variante_kyujitai_de(self):
self.wxr.wtp.add_page(
"Modèle:désuet",
10,
'<span class="emploi"><span id="désuet"></span>''(<span class="texte">[[Annexe:Glossaire grammatical#D|Désuet]]</span>)''</span>[[Catégorie:Termes désuets en japonais]]',
'<span class="emploi"><span id="désuet"></span>'
'(<span class="texte">[[Annexe:Glossaire grammatical#D|Désuet]]</span>)'
"</span>[[Catégorie:Termes désuets en japonais]]",
)
self.wxr.wtp.add_page(
"Modèle:lien",
Expand Down
34 changes: 17 additions & 17 deletions tests/test_fr_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_str_pron(self):
sound.model_dump(exclude_defaults=True)
for sound in page_data[-1].sounds
],
[{"raw_tags": ["cantonais", "Yale"], "zh_pron": "nei⁵hou²"}],
[{"tags": ["Cantonese", "Yale"], "zh_pron": "nei⁵hou²"}],
)

def test_no_ipa(self):
Expand Down Expand Up @@ -231,21 +231,21 @@ def test_cmn_pron(self):
self.assertEqual(
[s.model_dump(exclude_defaults=True) for s in page_data[0].sounds],
[
{"ipa": "\\t͡su̯ɔ˥\\", "raw_tags": ["mandarin"]},
{"ipa": "\\t͡su̯ɔ˧˥\\", "raw_tags": ["mandarin"]},
{"ipa": "\\t͡su̯ɔ˥˩\\", "raw_tags": ["mandarin"]},
{"zh_pron": "zuō", "raw_tags": ["mandarin", "Pinyin"]},
{"zh_pron": "zuó", "raw_tags": ["mandarin", "Pinyin"]},
{"zh_pron": "zuò", "raw_tags": ["mandarin", "Pinyin"]},
{"zh_pron": "tso", "raw_tags": ["mandarin", "EFEO"]},
{"zh_pron": "tso¹", "raw_tags": ["mandarin", "Wade-Giles"]},
{"zh_pron": "tso²", "raw_tags": ["mandarin", "Wade-Giles"]},
{"zh_pron": "tso⁴", "raw_tags": ["mandarin", "Wade-Giles"]},
{"zh_pron": "dzwō", "raw_tags": ["mandarin", "Yale"]},
{"zh_pron": "dzwó", "raw_tags": ["mandarin", "Yale"]},
{"zh_pron": "dzwò", "raw_tags": ["mandarin", "Yale"]},
{"zh_pron": "ㄗㄨㄛ", "raw_tags": ["mandarin", "Zhuyin"]},
{"zh_pron": "ㄗㄨㄛˊ", "raw_tags": ["mandarin", "Zhuyin"]},
{"zh_pron": "ㄗㄨㄛˋ", "raw_tags": ["mandarin", "Zhuyin"]},
{"ipa": "\\t͡su̯ɔ˥\\", "tags": ["Mandarin"]},
{"ipa": "\\t͡su̯ɔ˧˥\\", "tags": ["Mandarin"]},
{"ipa": "\\t͡su̯ɔ˥˩\\", "tags": ["Mandarin"]},
{"zh_pron": "zuō", "tags": ["Mandarin", "Pinyin"]},
{"zh_pron": "zuó", "tags": ["Mandarin", "Pinyin"]},
{"zh_pron": "zuò", "tags": ["Mandarin", "Pinyin"]},
{"zh_pron": "tso", "tags": ["Mandarin", "EFEO"]},
{"zh_pron": "tso¹", "tags": ["Mandarin", "Wade-Giles"]},
{"zh_pron": "tso²", "tags": ["Mandarin", "Wade-Giles"]},
{"zh_pron": "tso⁴", "tags": ["Mandarin", "Wade-Giles"]},
{"zh_pron": "dzwō", "tags": ["Mandarin", "Yale"]},
{"zh_pron": "dzwó", "tags": ["Mandarin", "Yale"]},
{"zh_pron": "dzwò", "tags": ["Mandarin", "Yale"]},
{"zh_pron": "ㄗㄨㄛ", "tags": ["Mandarin", "bopomofo"]},
{"zh_pron": "ㄗㄨㄛˊ", "tags": ["Mandarin", "bopomofo"]},
{"zh_pron": "ㄗㄨㄛˋ", "tags": ["Mandarin", "bopomofo"]},
],
)

0 comments on commit 52de4fa

Please sign in to comment.