Skip to content

Commit

Permalink
Merge pull request #737 from xxyzz/pl
Browse files Browse the repository at this point in the history
[pl] translate some tags and topics data starting with "a" and "b"
  • Loading branch information
xxyzz committed Jul 23, 2024
2 parents 0556a6f + 842a364 commit 123b4ad
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/pl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .tags import translate_raw_tags

POS_DATA = {
"rzeczownik": {"pos": "noun"},
"czasownik": {"pos": "verb"},
"przyimek": {"pos": "prep"},
"przymiotnik": {"pos": "adj"},
"przyrostek": {"pos": "suffix", "tags": ["morpheme"]},
"wrostek": {"pos": "infix", "tags": ["morpheme"]},
Expand All @@ -19,11 +21,13 @@
"spójnik": {"pos": "conj"},
"określnik": {"pos": "det"},
"międzyrostek": {"pos": "interfix", "tags": ["morpheme"]},
"morfem": {"pos": "unknown", "tags": ["morpheme"]},
"wykrzyknik": {"pos": "intj"},
"symbol": {"pos": "symbol"},
"liczebnik": {"pos": "num"},
"partykuła": {"pos": "particle"},
"skrótowiec": {"pos": "abbrev", "tags": ["abbreviation"]},
"zaimek": {"pos": "pron"},
}


Expand Down Expand Up @@ -83,4 +87,5 @@ def process_gloss_list_item(
sense.raw_tags = raw_tags
sense.sense_index = sense_index
sense.glosses.append(gloss_text)
translate_raw_tags(sense)
word_entry.senses.append(sense)
62 changes: 62 additions & 0 deletions src/wiktextract/extractor/pl/tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from .models import WordEntry

# Reference pages on the Polish Wiktionary listing the abbreviations:
#   Help:Abbreviations used in Wiktionary
#     https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
#   Category:Shortcut templates
#     https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów

# Abbreviation (exactly as it appears in a raw tag) -> English tag.
# Lookups are exact and case-sensitive, so "Bm" and "blm" are distinct keys.
# Entries that are commented out with an empty value are placeholders for
# abbreviations that have no translation assigned yet.
TAGS = {
    "abl.": "ablative",
    # "akust.": "",
    "aor.": "aorist",
    "bezok.": "infinitive",
    "bezosob.": "impersonal",
    "bibl.": "Biblical",
    "blm": "no-plural",
    "blp": "no-singulative",
    "Bm": "Bokmål",
    "bośn.": "Bosnian",
    "brytań.": "British",
    "bułg.": "Bulgarian",
    "bwr.": "Bavarian",
}

# Abbreviation (exactly as it appears in a raw tag) -> English topic name.
# Entries that are commented out with an empty value are placeholders for
# abbreviations that have no translation assigned yet.
TOPICS = {
    "adm.": "administration",
    "agrot.": "agrotechnology",
    "alch.": "alchemy",
    "anat.": "anatomy",
    "antrop.": "anthropology",
    "arachn.": "arachnology",
    "archit.": "architecture",
    "archeol.": "archeology",
    "astr.": "astronomy",
    "astrol.": "astrology",
    "astronaut.": "astronautics",
    "bank.": "banking",
    # "bibliot.": "",
    "biochem.": "biochemistry",
    "biol.": "biology",
    # "biur.": "",
    "bot.": "botany",
    "bud.": "construction",
}


def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised abbreviations out of ``data.raw_tags``.

    Each raw tag is looked up first in ``TAGS`` and then in ``TOPICS``. On
    the first table that contains it (and whose destination attribute,
    ``tags`` or ``topics``, exists on ``data``) the translation is added to
    that attribute: a ``str`` is appended, a ``list`` is extended. Raw tags
    matched by neither table stay in ``data.raw_tags`` in their original
    order.

    The object is modified in place and ``data.raw_tags`` is rebound to a
    new list; nothing is returned.
    """
    # Lookup order matters: TAGS takes precedence over TOPICS.
    lookup_order = ((TAGS, "tags"), (TOPICS, "topics"))

    untranslated = []
    for label in data.raw_tags:
        for table, field_name in lookup_order:
            if label not in table or not hasattr(data, field_name):
                continue
            translated = table[label]
            if isinstance(translated, str):
                getattr(data, field_name).append(translated)
            elif isinstance(translated, list):
                getattr(data, field_name).extend(translated)
            # A matched label is consumed even when its value is neither a
            # str nor a list; it is not kept as a raw tag in that case.
            break
        else:
            # No table claimed this label: keep it as a raw tag.
            untranslated.append(label)
    data.raw_tags = untranslated

0 comments on commit 123b4ad

Please sign in to comment.