In [1]:
import glob
import pandas as pd
import os
from lxml import etree
import re
import nltk
from nltk.tokenize import RegexpTokenizer

In [2]:
# Sorted so that the file order -- and therefore the row order of every dataset
# built from it -- is reproducible; glob.glob makes no ordering guarantee.
files = sorted(glob.glob("./xml/*.xml"))

---

### Dataset entier

In [14]:
def full_datasets_extraction(language, files_list):
    """Gather every rubric of one language from the XML files into a single TSV.

    For each file, the ``texte``, ``auteur``, ``type`` and ``index`` children of
    every ``rubrique`` element whose ``lan`` attribute equals ``language`` are
    collected (newlines stripped). The result is written to
    ``full_<name>_dataset.tsv`` in the current working directory, with the
    columns ``Texts``, ``Auteurs``, ``Type``, ``Position`` and ``Arks``.

    Args:
        language: Language identifier: ``"co"``, ``"fr"`` or ``"it"``.
        files_list: Iterable of paths (strings) to the XML files to read.

    Raises:
        ValueError: If ``language`` is not one of the supported identifiers.
            pandas also raises ``ValueError`` when the collected columns do not
            all have the same length (e.g. a rubric lacking an ``auteur``).
    """
    names = {"co": "corsican", "fr": "french", "it": "italian"}
    name = names.get(language) if isinstance(language, str) else None
    if name is None:
        raise ValueError("Non recognized language. Available languages identifiers ==> 'co', 'fr', 'it'.")
    path = '//rubrique[@lan="' + language + '"]/'

    def _texts(tree, child):
        # An empty element has ``.text is None``; keep it as "" instead of
        # crashing so that the columns stay aligned.
        return [(tag.text or "").replace("\n", "") for tag in tree.xpath(path + child)]

    all_textes = []
    all_arks = []
    all_auteurs = []
    all_typ = []
    all_index = []

    for xml in files_list:
        # The identifier is recomputed for every file: without a match it is
        # None rather than an unbound name or the value left over from the
        # previous file.
        match = re.search(r"(bpt\w*)", xml)
        ark = match.group(1) if match else None

        tree = etree.parse(xml)

        textes = _texts(tree, 'texte')
        all_textes.extend(textes)
        # One identifier per extracted text.
        all_arks.extend([ark] * len(textes))
        all_auteurs.extend(_texts(tree, 'auteur'))
        all_typ.extend(_texts(tree, 'type'))
        all_index.extend(_texts(tree, 'index'))

    dic = {"Texts": all_textes, "Auteurs": all_auteurs, "Type": all_typ, "Position": all_index, "Arks": all_arks}
    df = pd.DataFrame(dic)
    # ``sep`` must be passed by keyword: recent pandas versions no longer accept
    # it positionally.
    df.to_csv("full_" + name + "_dataset.tsv", sep='\t')

---

### Datasets d'entraînement et de test pour le corse

In [16]:
def training_test_corpus_selection(dataset, language, limit):
    """Split a TSV dataset into a test file and a training file.

    The first ``limit`` rows are written to ``test_<name>.tsv`` and all
    remaining rows to ``train_<name>.tsv``, both in the current working
    directory. Every row of the input ends up in exactly one of the two files.
    The row index is written as the first column of both outputs (pandas
    default).

    Args:
        dataset: Path to the tab-separated input file.
        language: Language identifier: ``"co"``, ``"fr"`` or ``"it"``; it only
            selects the name used in the output file names.
        limit: Number of leading rows that go to the test set.

    Raises:
        ValueError: If ``language`` is not one of the supported identifiers.
    """
    if language == "co":
        name = 'corsican'
    elif language == "fr":
        name = 'french'
    elif language == "it":
        name = 'italian'
    else:
        raise ValueError("Non recognized language. Available languages identifiers ==> 'co', 'fr', 'it'.")

    # ``sep`` has to be a keyword argument: pandas >= 2.0 rejects it positionally.
    df = pd.read_csv(dataset, sep='\t')

    df_test = df.iloc[:limit]
    # Start exactly where the test set stops; ``limit + 1`` silently dropped
    # row ``limit`` from both splits.
    df_train = df.iloc[limit:]

    df_test.to_csv("test_" + name + ".tsv", sep='\t')
    df_train.to_csv("train_" + name + ".tsv", sep='\t')

---

### Hapax et duplicates

In [22]:
def hapax_counter(freq_tokens):
    """Return the keys of ``freq_tokens`` whose count is exactly 1.

    ``freq_tokens`` maps tokens to their number of occurrences; the result
    keeps the mapping's iteration order.
    """
    return [token for token, count in freq_tokens.items() if 1 == count]

In [23]:
def duplicates_counter(freq_tokens):
    """Return the keys of ``freq_tokens`` whose count is greater than 1.

    ``freq_tokens`` maps tokens to their number of occurrences; the result
    keeps the mapping's iteration order.
    """
    repeated = (token for token, count in freq_tokens.items() if 1 < count)
    return list(repeated)

---

### Document term matrix

In [66]:
def document_term_matrix(list_files, language):
    """Write the word-frequency table of one language to a TSV file.

    The ``texte`` children of every ``rubrique`` element whose ``lan``
    attribute equals ``language`` are read from each XML file, joined with
    spaces and tokenized on ``\\w+``. The number of occurrences of each token
    is written to ``documentTermMatrix_<name>.tsv`` in the current working
    directory, one row per token, with the columns ``Word`` and ``Frequency``.

    Args:
        list_files: Iterable of paths to the XML files to read.
        language: Language identifier: ``"co"``, ``"fr"`` or ``"it"``.

    Raises:
        ValueError: If ``language`` is not one of the supported identifiers.
    """
    names = {"co": "corsican", "fr": "french", "it": "italian"}
    name = names.get(language) if isinstance(language, str) else None
    if name is None:
        raise ValueError("Non recognized language. Available languages identifiers ==> 'co', 'fr', 'it'.")
    path = '//rubrique[@lan="' + language + '"]/'

    liste_textes = []
    for xml in list_files:
        tree = etree.parse(xml)
        # An empty element has ``.text is None``; treat it as an empty string
        # instead of crashing.
        liste_textes.extend(
            (tag.text or "").replace("\n", "") for tag in tree.xpath(path + 'texte')
        )

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(' '.join(liste_textes))

    frequence = dict(nltk.FreqDist(tokens))

    # Build the one-column table directly (tokens as the index) rather than a
    # one-row frame with one column per token that then has to be transposed.
    # Naming the index here -- on the final orientation -- is what makes the
    # "Word" label actually appear in the file header.
    df = pd.Series(frequence, name="Frequency", dtype="int64").rename_axis("Word").to_frame()

    # ``sep`` must be passed by keyword: recent pandas versions no longer accept
    # it positionally.
    df.to_csv("documentTermMatrix_" + name + ".tsv", sep='\t')

---

### Vocabulaire

In [32]:
def vocabulary_counter(list_files, language):
    """Write the vocabulary of one language to a text file, one word per line.

    The ``texte`` children of every ``rubrique`` element whose ``lan``
    attribute equals ``language`` are read from each XML file, joined with
    spaces and tokenized on ``\\w+``. The distinct tokens are written to
    ``vocabulary_<name>.txt`` (UTF-8) in the current working directory: first
    the tokens that occur more than once, then those that occur exactly once.

    Args:
        list_files: Iterable of paths to the XML files to read.
        language: Language identifier: ``"co"``, ``"fr"`` or ``"it"``.

    Raises:
        ValueError: If ``language`` is not one of the supported identifiers.
    """
    names = {"co": "corsican", "fr": "french", "it": "italian"}
    name = names.get(language) if isinstance(language, str) else None
    if name is None:
        raise ValueError("Non recognized language. Available languages identifiers ==> 'co', 'fr', 'it'.")
    path = '//rubrique[@lan="' + language + '"]/'

    liste_textes = []
    for xml in list_files:
        tree = etree.parse(xml)
        # An empty element has ``.text is None``; treat it as an empty string
        # instead of crashing.
        liste_textes.extend(
            (tag.text or "").replace("\n", "") for tag in tree.xpath(path + 'texte')
        )

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(' '.join(liste_textes))

    frequence = dict(nltk.FreqDist(tokens))

    hapax = hapax_counter(frequence)
    duplicates = duplicates_counter(frequence)

    vocabulary = duplicates + hapax

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # some of the accented words and would make the output machine-dependent.
    with open("vocabulary_" + name + ".txt", "w", encoding="utf-8") as f:
        f.writelines(word + '\n' for word in vocabulary)