From 842a3646f42243462c9a4e5e6141814316c1ebb8 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 23 Jul 2024 17:22:01 +0800 Subject: [PATCH] [pl] translate some tags and topics data start with "a" and b" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit in page "Pomoc:Skróty używane w Wikisłowniku" --- src/wiktextract/extractor/pl/pos.py | 5 +++ src/wiktextract/extractor/pl/tags.py | 62 ++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 src/wiktextract/extractor/pl/tags.py diff --git a/src/wiktextract/extractor/pl/pos.py b/src/wiktextract/extractor/pl/pos.py index 16b74ef0..198af5eb 100644 --- a/src/wiktextract/extractor/pl/pos.py +++ b/src/wiktextract/extractor/pl/pos.py @@ -5,10 +5,12 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .models import Sense, WordEntry +from .tags import translate_raw_tags POS_DATA = { "rzeczownik": {"pos": "noun"}, "czasownik": {"pos": "verb"}, + "przyimek": {"pos": "prep"}, "przymiotnik": {"pos": "adj"}, "przyrostek": {"pos": "suffix", "tags": ["morpheme"]}, "wrostek": {"pos": "infix", "tags": ["morpheme"]}, @@ -19,11 +21,13 @@ "spójnik": {"pos": "conj"}, "określnik": {"pos": "det"}, "międzyrostek": {"pos": "interfix", "tags": ["morpheme"]}, + "morfem": {"pos": "unknown", "tags": ["morpheme"]}, "wykrzyknik": {"pos": "intj"}, "symbol": {"pos": "symbol"}, "liczebnik": {"pos": "num"}, "partykuła": {"pos": "particle"}, "skrótowiec": {"pos": "abbrev", "tags": ["abbreviation"]}, + "zaimek": {"pos": "pron"}, } @@ -83,4 +87,5 @@ def process_gloss_list_item( sense.raw_tags = raw_tags sense.sense_index = sense_index sense.glosses.append(gloss_text) + translate_raw_tags(sense) word_entry.senses.append(sense) diff --git a/src/wiktextract/extractor/pl/tags.py b/src/wiktextract/extractor/pl/tags.py new file mode 100644 index 00000000..ca5ccf9e --- /dev/null +++ b/src/wiktextract/extractor/pl/tags.py @@ -0,0 +1,62 @@ +from .models import 
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for annotations; guarded so importing this module never
    # pulls in the models module at runtime.
    from .models import Sense, WordEntry

# Help:Abbreviations used in Wiktionary
# https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
# Category:Shortcut templates
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów

# Polish abbreviation -> English tag.  A value may be a single string or a
# list of strings (see `translate_raw_tags`).  Entries that are commented
# out have an empty value, i.e. no English tag has been assigned to them yet.
TAGS = {
    "abl.": "ablative",
    # "akust.": "",
    "aor.": "aorist",
    "bezok.": "infinitive",
    "bezosob.": "impersonal",
    "bibl.": "Biblical",
    "blm": "no-plural",
    "blp": "no-singulative",
    "Bm": "Bokmål",
    "bośn.": "Bosnian",
    "brytań.": "British",
    "bułg.": "Bulgarian",
    "bwr.": "Bavarian",
}

# Polish abbreviation -> English topic.  Same value conventions as `TAGS`.
TOPICS = {
    "adm.": "administration",
    "agrot.": "agrotechnology",
    "alch.": "alchemy",
    "anat.": "anatomy",
    "antrop.": "anthropology",
    "arachn.": "arachnology",
    "archit.": "architecture",
    "archeol.": "archeology",
    "astr.": "astronomy",
    "astrol.": "astrology",
    "astronaut.": "astronautics",
    "bank.": "banking",
    # "bibliot.": "",
    "biochem.": "biochemistry",
    "biol.": "biology",
    # "biur.": "",
    "bot.": "botany",
    "bud.": "construction",
}


def _add_translation(target: list, value: object) -> None:
    """Append a string translation to *target*, or extend it with a list."""
    if isinstance(value, str):
        target.append(value)
    elif isinstance(value, list):
        target.extend(value)
    # Any other value type is ignored on purpose: the raw tag still counts
    # as handled, exactly as in the original branching.


def translate_raw_tags(data: "WordEntry | Sense") -> None:
    """Move recognised abbreviations from ``data.raw_tags`` to tags/topics.

    Each entry of ``data.raw_tags`` is looked up first in `TAGS`, then in
    `TOPICS`.  A match is translated and added to ``data.tags`` or
    ``data.topics`` respectively, but only if *data* actually has that
    attribute.  Everything else stays in ``data.raw_tags``, in its original
    order.

    The object is modified in place: ``data.raw_tags`` is rebound to a new
    list holding only the untranslated entries.  Nothing is returned.

    *data* only needs a ``raw_tags`` attribute; ``tags`` and ``topics`` are
    optional, which is why both word entries and senses can be passed.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in TAGS and hasattr(data, "tags"):
            _add_translation(data.tags, TAGS[raw_tag])
        elif raw_tag in TOPICS and hasattr(data, "topics"):
            _add_translation(data.topics, TOPICS[raw_tag])
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated