In [34]:
from xml.sax import saxutils, handler, make_parser
import re

def extract_synonyms_section(text):
    """Return the stripped lines that follow a ``{{synonimy}}`` marker line.

    The text is scanned line by line. Collection starts after a line that,
    once stripped, equals ``{{synonimy}}`` and stops at the first later line
    that starts with ``{{``. A repeated ``{{synonimy}}`` line seen while
    collecting restarts the collection from scratch.

    Args:
        text: Page text to scan.

    Returns:
        A list of stripped lines (possibly empty) when the marker is present,
        or ``None`` when the marker never occurs.
    """
    collected = None

    for raw_line in text.splitlines():
        stripped = raw_line.strip()

        if stripped == '{{synonimy}}':
            # (Re)start collecting from the line after the marker.
            collected = []
            continue

        if collected is None:
            # Marker not seen yet; nothing to collect.
            continue

        if stripped.startswith('{{'):
            # The next template line ends the section.
            break

        collected.append(stripped)

    return collected

# Matches one ASCII letter or one Polish diacritic letter, in either case.
# Used with search() to require that a string contains at least one such letter.
polish_letter_regexp = re.compile('[a-zA-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ]')

def extract_synonyms(text):
    """Extract groups of synonyms from the ``{{synonimy}}`` section of a page.

    Each line of the section is searched for ``[[...`` link targets. A target
    is kept only when it
      * contains no digit and no pipe (a pipe introduces custom display text,
        which is most probably not the word itself),
      * contains at least one letter matched by ``polish_letter_regexp``, and
      * starts with a word character.

    Args:
        text: Page text to scan.

    Returns:
        A list with one list of accepted synonyms per section line that
        yielded at least one (possibly an empty list), or ``None`` when the
        section is missing or has no lines.
    """
    section_lines = extract_synonyms_section(text)
    if not section_lines:
        return None

    synonyms = []
    for line in section_lines:
        # Raw strings throughout: '\[' in a plain literal is an invalid escape
        # sequence that newer Python versions warn about.
        raw_synonyms = re.findall(r'\[\[([^]]+)', line)
        line_synonyms = [
            s for s in raw_synonyms
            if (not re.search(r'[\d|]', s)          # no digits, no pipes
                and polish_letter_regexp.search(s)  # at least one adequate letter
                and re.search(r'^\w', s))           # starts with a word character
        ]
        if line_synonyms:
            synonyms.append(line_synonyms)
    return synonyms

class PlWiktionaryHandler(handler.ContentHandler):
    """SAX content handler that collects synonyms per page title.

    Character data inside ``<title>`` and ``<text>`` elements is buffered;
    when a ``<page>`` element closes, the title is validated and the text is
    passed to ``extract_synonyms``. Non-empty results are stored in
    ``collected_synonyms`` keyed by the stripped title.

    Attributes:
        collected_synonyms: dict mapping a page title to its synonym groups.
        total_pages: number of ``<page>`` elements closed so far.
        cur_tag: name of the element currently open for character data, or
            ``None`` when no element is being read.
        cur_title: buffered character data of the current page's title.
        cur_text: buffered character data of the current page's text.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        # State lives on the instance: a mutable dict defined on the class
        # would be shared by every handler that is ever created.
        self.collected_synonyms = {}
        self.total_pages = 0
        self._reset()

    def _reset(self):
        """Clear the per-page buffers before the next page starts."""
        self.cur_tag = None
        self.cur_title = ''
        self.cur_text = ''

    def startElement(self, name, attrs):
        """Remember which element subsequent character data belongs to."""
        self.cur_tag = name

    def endElement(self, name):
        """Finish an element; on ``page``, validate the title and store synonyms."""
        # Character data after a closing tag (e.g. indentation between
        # elements) must not be appended to the title/text buffers.
        self.cur_tag = None

        if name == 'page':
            self.total_pages += 1
            self.cur_title = self.cur_title.strip()
            if (self.cur_title and
                # should be one word
                len(self.cur_title.split()) == 1 and
                # should contain at least one adequate letter
                polish_letter_regexp.search(self.cur_title) and
                # drop capitalized words
                self.cur_title[0].islower() and
                # should start with a letter
                re.search(r'^\w', self.cur_title) and
                self.cur_text):
                synonyms = extract_synonyms(self.cur_text)
                if synonyms:
                    self.collected_synonyms[self.cur_title] = synonyms
            self._reset()

    def characters(self, content):
        """Append character data to the buffer of the currently open element."""
        # The parser may deliver one element's text in several chunks.
        if self.cur_tag == 'title':
            self.cur_title += content
        if self.cur_tag == 'text':
            self.cur_text += content

# Create the handler that accumulates results, then attach it to a new SAX parser.
content_handler = PlWiktionaryHandler()
parser = make_parser()
parser.setContentHandler(content_handler)

In [35]:
# Path of the XML file to parse; the file name suggests a plwiktionary
# pages-articles dump dated 2020-03-01.
# NOTE(review): absolute, machine-specific path - adjust before re-running elsewhere.
full_xml_file_path = '/Users/oleh.palianytsia/Downloads/plwiktionary-20200301-pages-articles.xml'
# Stream the file through the SAX parser; results accumulate on content_handler.
parser.parse(full_xml_file_path)

In [36]:
# Write one entry per word: the word, then one indented line per synonym group.
# The encoding is explicit because the data contains Polish letters and the
# default encoding of open() depends on the platform's locale.
with open('1-1-synonyms-result.txt', 'w', encoding='utf-8') as f:
    # NOTE(review): entries are ordered by their synonym lists (item[1]),
    # not by the word itself - confirm this is the intended order.
    for word, synsets in sorted(content_handler.collected_synonyms.items(), key=lambda item: item[1]):
        f.write(word + ':\n')
        for ss in synsets:
            f.write('  ' + ', '.join(ss) + '\n')

In [37]:
# Spot-check the result file: show the 'pies' entry and the five lines after it.
!grep -A 5 '^pies:' 1-1-synonyms-result.txt

pies:
  skowyr, kejter, luńt, sobaka, czworonożny przyjaciel, stróż domu
  policjant, glina, gliniarz
  hind, skieł
skowyczeć:
  skowytać, skomleć, kwilić
