In [31]:
import xml.sax
import re

In [40]:
def get_word_if_french (token):
    # for inputs like '[[anc#fr|anc]]'
    before_hash, after_hash = token.split('#')
    lang_code = after_hash.split('|')[0]
    if lang_code == 'fr':
        return before_hash[4:]

def extract_synon(line):
    synonyms = set()
    # restrict links to translated words
    if 'lien' not in line:
        matches = re.findall(r'(?<=\[\[).*?(?=\])', line)
        for item in matches:
            if '#' in item:
                word = get_word_if_french(item)
                if word:
                    synonyms.add(word)
            else:
                 synonyms.add(item)
    return synonyms

def parse_synonyms (text):
    lines = [line for line in text.splitlines() if line.strip()]
    syn_idx_start, syn_idx_end = 0, 0
    is_in_french, is_in_syn = False, False 
    synonyms = set()
    for line in lines:
        # restrict foreign words
        if line == '== {{langue|fr}} ==':
            is_in_french = True
        if line == '==== {{S|synonymes}} ====':
            is_in_syn = True
        elif is_in_syn and line.startswith('=') :
            break
        elif is_in_syn and is_in_french and line and line.startswith('*'):
            synonyms = extract_synon(line)
    return synonyms  

In [41]:
synonymes = {}
f = open('output.txt', "a")

class SynonymsHandler( xml.sax.ContentHandler ):
    def __init__(self):
        self.CurrentData = ""
        self.word = ""
        self.text = ""
        self.has_synonym_regex = re.compile(r"{{S\|synonymes}}")
                
    def startElement(self, tag, attributes):
        self.CurrentData = tag

    def endElement(self, tag):
        if self.CurrentData == 'text':
            # python code here
            if self.has_synonym_regex.search(self.text) and not self.word.startswith('Wiktionnaire:Patron'): 
                synonyms = parse_synonyms(self.text)
                if synonyms:
                    f.write(f'{self.word} ~ {parse_synonyms(self.text)}\n')
                self.text = ""
            self.word = ""
        self.CurrentData = ""
            
    def characters(self, content):
        if self.CurrentData == 'title':
            self.word += content
        if self.CurrentData == "text":
            self.text += content

In [42]:
%%time
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
Handler = SynonymsHandler()
parser.setContentHandler(Handler)
parser.parse("1.xml")

CPU times: user 19min 46s, sys: 2min 41s, total: 22min 28s
Wall time: 41min 39s


In [45]:
output = open('output.txt')
for i in range(40):
    print(output.readline().strip())

accueil ~ {'page d’accueil', 'web'}
lire ~ {'livre'}
siège ~ {'blocus'}
fauteuil ~ {'luie stoel'}
meuble ~ {'mobilier'}
militaire ~ {'guerrier'}
manchot ~ {'unibrassiste'}
bande dessinée ~ {'roman graphique'}
oiseau ~ {'piaf'}
allemand ~ {'langue des chevaux'}
vendredi ~ {'i'}
voler ~ {'tirer'}
poisson ~ {'matsyasana'}
armée ~ {'drapeau'}
sinogramme ~ {'kanji'}
kanji ~ {'sinogramme'}
jour ~ {'journée'}
CD-ROM ~ {'DOC', 'disque optique compact'}
photographie ~ {'prise de vue'}
ordinateur ~ {'PC'}
année ~ {'classe'}
mars ~ {'vårmånad'}
mai ~ {'mois de Marie'}
computer ~ {'ordinateur'}
anglais ~ {'anglophone'}
lieu ~ {'place'}
interrogation ~ {'interro'}
hypothèse ~ {'postulat'}
collaboration ~ {'collaborativité'}
prendre ~ {'attraper'}
le ~ {'lecʼh'}
la ~ {'na'}
fin ~ {'finman'}
abréviation ~ {'abréviature'}
chinois ~ {'noich'}
espagnol ~ {'langue espagnole'}
neuf ~ {'nouveau'}
et ~ {'-que'}
un ~ {'quelque'}
nord ~ {'septentrion'}
