In [1]:
import re
import json
import xml.sax

In [2]:
synonyms = {}

class VietnameseSynHandler(xml.sax.ContentHandler):
    def __init__(self):
        self.current_tag = ""
        
        self.parse_page = False
        self.parse_text = False
        self.parse_title = False
        
        self.syn_parse_start = False
        self.syn_parse_stop = False
        
        self.title = ""
        self.syns = []

    def startElement(self, tag, attributes):
        if tag == "page":
            self.parse_page = True
        elif tag == "text":
            self.parse_text = True
        elif tag == "title":
            self.parse_title = True
            
        self.current_tag = tag

    def endElement(self, tag):
        if tag == "page":
            self.parse_page = False
            if len(self.syns) > 0:
                synonyms[self.title] = self.syns
            self.title = ""
            self.syns = []
        elif tag == "text":
            self.parse_text = False
        elif tag == "title":
            self.parse_title = False
            
        self.current_tag = ""


    def characters(self, content):
        if self.parse_page:
            if self.parse_title:
                self.title = content
            elif self.parse_text:
                content = content.strip()
                if re.match(r"{{-syn-}}", content):
                    self.syn_parse_start = True
                    self.syn_parse_stop = False
                elif self.syn_parse_start:
                    l_content = len(content)
                    if l_content > 0:
                        self.syns.extend(re.findall(r"\[\[(.*?)\]\]", content))
                        self.syn_parse_stop = False
                    else:
                        if self.syn_parse_stop:
                            self.syn_parse_start = False
                        else:
                            self.syn_parse_stop = True

In [3]:
%%time

# # create an XMLReader
parser = xml.sax.make_parser()
# parser.setFeature(xml.sax.handler.feature_namespaces, 0)

# override the default ContextHandler
Handler = VietnameseSynHandler()
parser.setContentHandler(Handler)

parser.parse("viwiktionary-20200301-pages-articles-multistream.xml")

CPU times: user 13.7 s, sys: 36.5 ms, total: 13.7 s
Wall time: 13.7 s


In [4]:
with open('synonyms_vi.json', 'w') as file_:
    json.dump(synonyms, file_, sort_keys=True, indent=4, ensure_ascii=False)

In [5]:
print(json.dumps(synonyms, sort_keys=True, indent=4, ensure_ascii=False))

{
    "\"": [
        "nhà lao"
    ],
    "-ment": [
        "-tion",
        "-age",
        "-tion"
    ],
    "-th": [
        "-eth"
    ],
    "-trix": [
        "-tress"
    ],
    "-way": [
        "-wise"
    ],
    "1": [
        "nhất"
    ],
    "4WD": [
        "4x4",
        "FWD",
        "RWD",
        "AWD"
    ],
    "4x4": [
        "4WD"
    ],
    "666": [
        "666kfds[p5w52ngs",
        "65ew6se5666"
    ],
    "AKA": [
        "a.k.a."
    ],
    "Abel": [
        "Aabel",
        "Aapel"
    ],
    "Alex.r": [
        "Alex."
    ],
    "America": [
        "United States of America",
        "U.S.A.",
        "U.S.",
        "Columbia"
    ],
    "Amerika": [
        "nieuwe wereld",
        "Verenigde Staten van Amerika",
        "Verenigde Staten",
        "VS",
        "VSA",
        "Amerikaans",
        "Amerikaan"
    ],
    "Anglo-Saxon": [
        "Old English"
    ],
    "Arab": [
        "Arabian"
    ],
    "August": [
        "Aug",
        "Aug

There are still some thrash from other languages, but currently I don't see any approaches how to filter it. Maybe just use detect_lang tool. But in this case execution will be much slower.