In [1]:
import re
import xml.sax

In [2]:
articles_filename = './data/skwiktionary-20200301-pages-articles.xml'

In [3]:
class ArticlesHandler(xml.sax.ContentHandler):
    LANG_PATTERN = '== Slovenčina =='
    SYN_PATTERN = '==== Synonymá ===='
    SYN_REGEXP = re.compile(".\[\[(\w+)\]\].?")

    def __init__(self):
        self.current_data = ""
        self.title = ""
        self.text = []
        self.synonym_list = []

    def _find_synonyms(self):
        lang_detected = False
        syn_detected = False

        synonym_lines = []
        for line in self.text:
            #print(line)
            if line == self.LANG_PATTERN:
                lang_detected = True
            elif lang_detected and line == self.SYN_PATTERN:
                syn_detected = True
            elif lang_detected and syn_detected and line.startswith('=='):
                lang_detected = False
                syn_detected = False
                break
            elif lang_detected and syn_detected:
                synonym_lines.append(line)

        return self.SYN_REGEXP.findall(' '.join(synonym_lines))

    # Call when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag

    # Call when an elements ends
    def endElement(self, tag):
        if self.current_data == 'text':
            synonyms = self._find_synonyms()
            if synonyms:
                self.synonym_list.append(list(set(synonyms + [self.title])))
        self.current_data = ""
        self.text = []

    # Call when a character is read
    def characters(self, content):
        if self.current_data == "title":
            self.title = content
        elif self.current_data == 'text':
            content = content.strip()
            if content:
                self.text.append(content)
        

In [4]:
def parse_articles(filename):
    parser = xml.sax.make_parser()
    # turn off namepsaces
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    # override the default ContextHandler
    handler = ArticlesHandler()
    parser.setContentHandler(handler)

    parser.parse(filename)

    # shitty code FTW
    return handler.synonym_list

In [5]:
synonym_list = parse_articles(articles_filename)

In [6]:
len(synonym_list)

531

In [7]:
synonym_list

[['mať', 'pôvod', 'matka', 'mater', 'mamka', 'mama', 'mamička'],
 ['abatiša', 'opátka'],
 ['abiturient', 'maturant'],
 ['ablácia', 'denudácia', 'odnos'],
 ['mimoriadnosť',
  'abnormálnosť',
  'pomätenosť',
  'nezvyčajnosť',
  'úchylnosť',
  'nenormálnosť'],
 ['abonent', 'předplatitel'],
 ['abortovať', 'potratiť'],
 ['abortácia', 'interrupcia', 'abortus', 'výškrab', 'potrat'],
 ['zdrobnenina', 'deminutívum'],
 ['prológ', 'úvod', 'predslov'],
 ['oplzlosť', 'frivolnosť', 'necudnosť'],
 ['tata', 'otec', 'papa', 'ocko', 'otecko', 'apa', 'tatko', 'tato'],
 ['zdraviť', 'vítať'],
 ['i', 'aj', 'a', 'plus', 'ale'],
 ['o', 'na', 'k', 'v', 'po', 'ohľadom', 'ohľadne'],
 ['existovať', 'jestvovať', 'byť'],
 ['urobiť', 'robiť', 'vyrobiť'],
 ['vlastniť', 'mať'],
 ['nadpriemerný',
  'ozrutný',
  'dospelý',
  'mnohopočetný',
  'priveľký',
  'veľkolepý',
  'masívny',
  'vplyvný',
  'významný',
  'vysoký',
  'silný',
  'veľký',
  'dôležitý',
  'extrémny',
  'obrovský',
  'dlhý',
  'nadšený',
  'početný',
 