In [9]:
import json
from typing import List, Dict
from xml.sax import handler, make_parser


In this task we select the *Afrikaans* language and use the file *afwiktionary-20200301-pages-articles-multistream.xml.bz2* from https://dumps.wikimedia.org/afwiktionary/20200301/

In [11]:
class AfrikaansSynonymGenerator(handler.ContentHandler):
    """SAX content handler that extracts synonym lists from a Wiktionary dump.

    For every ``<page>`` the handler remembers the page ``<title>`` and scans
    the wikitext in ``<page>/<revision>/<text>`` line by line.  Lines that
    follow a line containing ``SYNONYM_TAG`` are collected until the next line
    containing ``AFTER_SYNONYM_TAG`` (or the end of the text); every
    ``[[...]]`` link inside the collected lines is recorded as a synonym of
    the page title.

    Character data is buffered per element before it is interpreted, because
    a SAX parser is allowed to report one run of text in several
    ``characters()`` calls (e.g. around entities such as ``&amp;``).

    The collected mapping is returned by :meth:`get_synonyms_dict`.
    """

    # Element names that make up the paths the handler cares about.
    PAGE = 'page'
    TITLE = 'title'
    TEXT = 'text'
    REVISION = 'revision'

    # A line containing SYNONYM_TAG opens a synonym section; any later line
    # containing AFTER_SYNONYM_TAG (i.e. any other template) closes it.
    SYNONYM_TAG = "{{sinonieme}}"
    AFTER_SYNONYM_TAG = "{{"

    # Delimiters of a wiki link; the text between them is taken as the word.
    START_TAG = '[['
    END_TAG = ']]'

    def __init__(self):
        handler.ContentHandler.__init__(self)
        # Stack of currently open element names, outermost first.
        self.path = []
        # Title of the page being processed; None until a title has been read.
        self.cur_title = None
        # Pieces of the current <title> / <text> as delivered by the parser.
        self.__title_chunks: List[str] = []
        self.__text_chunks: List[str] = []
        # True while the scan is inside a synonym section.
        self.__start_parse_synonym = False
        # Raw lines of the synonym section currently being collected.
        self.__cur_synonym_lines: List[str] = []
        # Result: page title -> synonyms in order of appearance.
        self.__synonym_dict: Dict[str, List[str]] = dict()

    def get_synonyms_dict(self) -> Dict[str, List[str]]:
        """Return the mapping ``page title -> list of synonyms`` built so far.

        The internal dictionary itself is returned, not a copy.
        """
        return self.__synonym_dict

    def startElement(self, name, attrs):
        """Push the opened element and reset the buffer it will fill."""
        self.path.append(name)
        if self.__is_title_in_page():
            self.__title_chunks = []
        elif self.__is_text_in_revision_in_page():
            self.__text_chunks = []

    def endElement(self, name):
        """Process the buffered wikitext when a page text ends, then pop."""
        if self.__is_text_in_revision_in_page():
            self.__parse_text("".join(self.__text_chunks))
            self.__text_chunks = []
        self.path.pop()

    def characters(self, content):
        """Accumulate character data of the page title or the page text."""
        if self.__is_title_in_page():
            # Join the pieces instead of overwriting, otherwise a title that
            # arrives in several chunks would keep only its last fragment.
            self.__title_chunks.append(content)
            self.cur_title = "".join(self.__title_chunks)
        elif self.__is_text_in_revision_in_page():
            self.__text_chunks.append(content)

    def __parse_text(self, text: str) -> None:
        """Scan complete wikitext line by line and record synonym sections."""
        for line in text.split("\n"):
            if self.SYNONYM_TAG in line:
                # Anything else on the marker line itself is ignored.
                self.__start_parse_synonym = True
            elif self.__start_parse_synonym:
                if self.AFTER_SYNONYM_TAG in line:
                    self.__finish_parse_synonym()
                else:
                    self.__cur_synonym_lines.append(line)

        # A section that runs to the end of the text is closed here.
        if self.__start_parse_synonym:
            self.__finish_parse_synonym()

    def __finish_parse_synonym(self):
        """Store the words of the collected section and reset the scan state."""
        words = []
        for line in self.__cur_synonym_lines:
            words.extend(self.__acquire_words_from_line(line))

        if self.cur_title is not None and words:
            # Extend rather than assign so that several synonym sections on
            # the same page add up instead of the last one replacing the rest.
            self.__synonym_dict.setdefault(self.cur_title, []).extend(words)

        self.__cur_synonym_lines = []
        self.__start_parse_synonym = False

    def __acquire_words_from_line(self, line: str) -> List[str]:
        """Return the non-empty contents of every ``[[...]]`` link in ``line``.

        Leading/trailing ``#`` characters and surrounding whitespace are
        removed first.  Scanning stops at the first link that is not closed.
        """
        text = line.strip("#").strip()
        if not text:
            return []

        words = []
        pos = 0
        while pos < len(text):
            open_pos = text.find(self.START_TAG, pos)
            if open_pos == -1:
                break

            close_pos = text.find(self.END_TAG, open_pos)
            if close_pos == -1:
                break

            word = text[open_pos + len(self.START_TAG): close_pos]
            if word:
                words.append(word)

            pos = close_pos + len(self.END_TAG)

        return words

    def getParentElement(self):
        """Return the last name on the element stack, or None if it is empty.

        Note: despite the method name, this is the innermost element that is
        currently open (``path[-1]``), not the element enclosing it.
        """
        if len(self.path) == 0:
            return None
        return self.path[-1]

    def __is_title_in_page(self):
        """Tell whether the open element path ends with ``page/title``."""
        return len(self.path) >= 2 and self.path[-2:] == [self.PAGE, self.TITLE]

    def __is_text_in_revision_in_page(self):
        """Tell whether the open element path ends with ``page/revision/text``."""
        return (
            len(self.path) >= 3
            and self.path[-3:] == [self.PAGE, self.REVISION, self.TEXT]
        )


In [13]:
!ls data

afwiktionary-20200301-pages-articles-multistream.xml
eswiktionary-20200301-pages-articles-multistream.xml
official-2014.combined-withalt.m2


In [14]:
# Path to the dump inside the local data/ directory (the .xml name suggests
# the .bz2 archive was already extracted -- confirm before running).
file = "data/afwiktionary-20200301-pages-articles-multistream.xml"

Let's parse words with their synonyms from this file

In [16]:
# Run the dump through a SAX parser with the synonym generator as its content
# handler; the handler fills its internal dictionary while the file is parsed.
parser = make_parser()
synonym_generator = AfrikaansSynonymGenerator()
parser.setContentHandler(synonym_generator)
parser.parse(file)

In [17]:
# Mapping: page title -> list of [[...]] link texts found in its synonym section.
synonyms = synonym_generator.get_synonyms_dict()

In [18]:
synonyms

{'selfstandige naamwoord': ['substantief'],
 'werkwoord': ['verbum'],
 'byvoeglike naamwoord': ['adjektief'],
 'Noors': ['Noorweegs'],
 'sider': ['appelwyn'],
 'amper': ['byna'],
 'aarde': ['grond'],
 'kruistog': ['kruisvaart'],
 'reënboog': ['reentboog'],
 'hond': ['brak'],
 'reentboog': ['reënboog'],
 'substantief': ['zelfstandig naamwoord', 'nomen'],
 'verlede deelwoord': ['voltooide deelwoord'],
 'voltooide deelwoord': ['verlede deelwoord'],
 'appelwyn': ['sider'],
 'pluralis': ['meervoud'],
 'bywoord': ['adverbium'],
 'moeder': ['ma'],
 'vader': ['pa'],
 'broer': ['broeder', 'boet'],
 'far': ['fader'],
 'fader': ['far'],
 'bror': ['broder'],
 'broder': ['bror'],
 'man': ['himen'],
 'Xhosa': ['Kôsa'],
 'afrikáner': ['bóer'],
 'afrikander': ['bóer'],
 'zulu': ['zulù'],
 'dag': ['etmaal'],
 'herero': ['hereró'],
 'Mann': ['Ehemann', 'Gatte', 'Gemahl'],
 'mahdum': ['oğul'],
 'week': ['sennight'],
 'funksiewoord': ['werkwoord', 'verbum'],
 'verbum': ['werkwoord'],
 'soortnaam': ['spesi

Save the result to a file

In [None]:
out_file = 