In [3]:
import glob
import pandas as pd
import os
from lxml import etree
import re
import nltk
from nltk.tokenize import RegexpTokenizer
import stopwords_processing
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# glob returns matches in an order that depends on the file system; sort the
# paths so that row order in the extracted tables (and therefore the
# position-based train/test split) is the same on every run and machine.
files = sorted(glob.glob("./xml/*.xml"))

---

## Fonctions basiques

### Language selection

In [5]:
def language_selection(language):
    """Translate a language code into a dataset name and an XPath prefix.

    Args:
        language: One of 'co', 'fr' or 'it'.

    Returns:
        A ``(name, path)`` tuple: the language name used in output file
        names and the XPath prefix selecting the matching ``rubrique``
        elements.

    Raises:
        ValueError: If ``language`` is not one of the supported codes.
    """
    # (code, dataset name, XPath prefix) for every supported language.
    supported = (
        ('co', 'corsican', '//rubrique[@lan="co"]/'),
        ('fr', 'french', '//rubrique[@lan="fr"]/'),
        ('it', 'italian', '//rubrique[@lan="it"]/'),
    )

    for code, name, path in supported:
        if language == code:
            return name, path

    raise ValueError('Wrong language identifier. Try with "co", "fr" or "it".')

---

### Extracting data from XML

In [6]:
def data_extraction(list_files, path):
    """Extract texts and their metadata from a list of XML files.

    For every file, the elements ``path + 'texte'``, ``path + 'auteur'``,
    ``path + 'type'`` and ``path + 'index'`` are read, newlines are removed
    from their text, and the values are accumulated across files.

    Args:
        list_files: Iterable of paths to the XML files to parse.
        path: XPath prefix to which the child tag names are appended.

    Returns:
        pandas.DataFrame with the columns "Texts", "Auteurs", "Type",
        "Position" and "Arks". "Arks" holds, for each text, the first
        ``bpt\\w*`` match found in the file path, or None when the path
        contains no such identifier.

    Raises:
        ValueError: Raised by pandas if the columns end up with different
            lengths.
    """
    def clean_values(tree, tag_name):
        # An element without text has ``.text is None``; fall back to an
        # empty string so re.sub does not raise TypeError.
        return [
            re.sub(r'\n', r'', tag.text or '')
            for tag in tree.xpath(path + tag_name)
        ]

    all_textes = []
    all_arks = []
    all_auteurs = []
    all_typ = []
    all_index = []

    for xml in list_files:
        # Recompute the identifier for every file. Previously a path with
        # no match either raised NameError (first file) or silently reused
        # the identifier of the previous file.
        match = re.search(r"(bpt\w*)", xml)
        ark = match.group(1) if match else None

        tree = etree.parse(xml)

        textes = clean_values(tree, 'texte')
        all_textes.extend(textes)
        # One ark entry per extracted text, as the two columns are aligned.
        all_arks.extend([ark] * len(textes))

        # NOTE(review): authors, types and indexes are collected
        # independently of the texts; the columns only line up if every
        # selected rubrique carries exactly one of each - confirm against
        # the XML schema.
        all_auteurs.extend(clean_values(tree, 'auteur'))
        all_typ.extend(clean_values(tree, 'type'))
        all_index.extend(clean_values(tree, 'index'))

    dic = {"Texts":all_textes, "Auteurs":all_auteurs, "Type":all_typ, "Position":all_index, "Arks":all_arks}
    return pd.DataFrame(dic)

---

### Hapax et duplicates

In [7]:
def hapax_counter(freq_tokens):
    """Return the tokens whose frequency is exactly 1.

    Args:
        freq_tokens: Mapping from token to its number of occurrences.

    Returns:
        List of tokens occurring once, in the mapping's iteration order.
    """
    return [token for token, count in freq_tokens.items() if count == 1]

def duplicates_counter(freq_tokens):
    """Return the tokens whose frequency is greater than 1.

    Args:
        freq_tokens: Mapping from token to its number of occurrences.

    Returns:
        List of tokens occurring more than once, in the mapping's
        iteration order.
    """
    return [token for token, count in freq_tokens.items() if count > 1]

---

## Fonctions de création de datasets

### Dataset entier


In [8]:
def full_dataset_selection(list_files, language, rem_stopwords = False):
    """Extract every text of one language and save it as a TSV file.

    Args:
        list_files: Paths of the XML files to read.
        language: Language code accepted by ``language_selection``
            ('co', 'fr' or 'it').
        rem_stopwords: When False, the raw table is written. Any other
            value sends the "Texts" column through
            ``stopwords_processing.delete_stopwords_from_dataframe`` first
            and adds "_withoutSW" to the file name.

    Side effects:
        Writes the table under "./Full/" (creating the directory if
        needed) and prints a confirmation message.
    """
    name, path = language_selection(language)

    # Read the files given by the caller; the notebook-level `files`
    # variable was used here before, which made the parameter a no-op.
    df = data_extraction(list_files, path)

    if rem_stopwords is False:
        file_name = "full_" + name + "_dataset.tsv"
    else:
        df = stopwords_processing.delete_stopwords_from_dataframe(table=df, col_name="Texts")
        file_name = "full_" + name + "_withoutSW_dataset.tsv"

    os.makedirs("./Full", exist_ok=True)
    # `sep` is passed by keyword: positional arguments after the path are
    # deprecated in recent pandas releases.
    df.to_csv("./Full/" + file_name, sep='\t')
    print(file_name + " has been created with success.")


In [9]:
# Build the full (stopwords kept) dataset for each language.
for lang_code in ('fr', 'co', 'it'):
    full_dataset_selection(files, lang_code)

full_french_dataset.tsv has been created with success.
full_corsican_dataset.tsv has been created with success.
full_italian_dataset.tsv has been created with success.


In [10]:
# Build the full dataset with stopwords removed for each language.
for lang_code in ('co', 'fr', 'it'):
    full_dataset_selection(files, lang_code, rem_stopwords=True)

Object -> dataframe.
Done.
full_corsican_withoutSW_dataset.tsv has been created with success.
Object -> dataframe.
Done.
full_french_withoutSW_dataset.tsv has been created with success.
Object -> dataframe.
Done.
full_italian_withoutSW_dataset.tsv has been created with success.


---

### Datasets d'entraînement et de test (corse, français, italien)

In [11]:
def train_test_datasets(list_files, language, threshold, rem_stopwords = False):
    """Split the texts of one language into test and train TSV files.

    The first ``threshold`` rows of the extracted table form the test set
    and all remaining rows form the train set.

    Args:
        list_files: Paths of the XML files to read.
        language: Language code accepted by ``language_selection``
            ('co', 'fr' or 'it').
        threshold: Number of leading rows assigned to the test set.
        rem_stopwords: When False, the raw tables are written. Any other
            value sends the "Texts" column of both tables through
            ``stopwords_processing.delete_stopwords_from_dataframe`` first
            and adds "_withoutSW" to the file names.

    Side effects:
        Writes one file under "./Test/" and one under "./Train/"
        (creating the directories if needed) and prints a confirmation
        message.
    """
    name, path = language_selection(language)

    # Read the files given by the caller rather than the notebook-level
    # `files` variable.
    df = data_extraction(list_files, path)

    # The train set starts exactly where the test set stops. It previously
    # started at `threshold + 1`, which dropped row `threshold` from both.
    df_test = df.iloc[:threshold].copy()
    df_train = df.iloc[threshold:].copy()

    if rem_stopwords is False:
        suffix = "_dataset.tsv"
        message = "Train and test dataset for " + name + " created with success."
    else:
        df_test = stopwords_processing.delete_stopwords_from_dataframe(df_test, col_name="Texts")
        df_train = stopwords_processing.delete_stopwords_from_dataframe(df_train, col_name="Texts")
        suffix = "_withoutSW_dataset.tsv"
        message = "Train and test dataset for " + name + ", without stopwords, created with success."

    os.makedirs("./Test", exist_ok=True)
    os.makedirs("./Train", exist_ok=True)
    # `sep` is passed by keyword: positional arguments after the path are
    # deprecated in recent pandas releases.
    df_test.to_csv("./Test/test_" + name + suffix, sep='\t')
    df_train.to_csv("./Train/train_" + name + suffix, sep='\t')

    print(message)

In [12]:
# (language code, number of leading rows reserved for the test set)
for lang_code, n_test in (('co', 400), ('fr', 90), ('it', 70)):
    train_test_datasets(files, lang_code, n_test)

Train and test dataset for corsican created with success.
Train and test dataset for french created with success.
Train and test dataset for italian created with success.


In [13]:
# Same splits as above, with stopwords removed from the texts.
for lang_code, n_test in (('co', 400), ('fr', 90), ('it', 70)):
    train_test_datasets(files, lang_code, n_test, rem_stopwords=True)

Object -> dataframe.
Done.
Object -> dataframe.
Done.
Train and test dataset for corsican, without stopwords, created with success.
Object -> dataframe.
Done.
Object -> dataframe.
Done.
Train and test dataset for french, without stopwords, created with success.
Object -> dataframe.
Done.
Object -> dataframe.
Done.
Train and test dataset for italian, without stopwords, created with success.


---

### Fréquences de mots

In [14]:
def word_frequency(list_files, language, rem_stopwords=False):
    """Count the occurrences of every word of one language and save them.

    All texts are joined with spaces, tokenized on ``\\w+`` and counted
    with ``nltk.FreqDist``. Tokens are counted as they appear (no case
    folding is applied here).

    Args:
        list_files: Paths of the XML files to read.
        language: Language code accepted by ``language_selection``
            ('co', 'fr' or 'it').
        rem_stopwords: When True, the "Texts" column goes through
            ``stopwords_processing.delete_stopwords_from_dataframe``
            before counting. When False, the plain file name is used; any
            other value selects the "_withoutSW" file name.

    Side effects:
        Writes a TSV file under "./Frequency/" (creating the directory if
        needed) and prints a confirmation message.
    """
    name, path = language_selection(language)

    # Read the files given by the caller rather than the notebook-level
    # `files` variable.
    df = data_extraction(list_files, path)

    if rem_stopwords is True:
        df = stopwords_processing.delete_stopwords_from_dataframe(table=df, col_name="Texts")

    corpus = ' '.join(df['Texts'].tolist())

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(corpus)

    frequence = dict(nltk.FreqDist(tokens))

    # One row labelled "Frequency" with one column per word, then
    # transposed to one row per word. The "Word" name is set before the
    # transpose, so it ends up on the column axis, not on the word index.
    df = pd.DataFrame(frequence, index=["Frequency"])
    df.index.name = "Word"
    df = df.T

    if rem_stopwords is False:
        file_name = "WordFrequency_" + name + "_dataset.tsv"
    else:
        file_name = "WordFrequency_" + name + "_withoutSW_dataset.tsv"

    os.makedirs("./Frequency", exist_ok=True)
    # `sep` is passed by keyword: positional arguments after the path are
    # deprecated in recent pandas releases.
    df.to_csv("./Frequency/" + file_name, sep='\t')
    print(file_name + " created with success.")

In [15]:
# Word frequencies for each language, stopwords kept.
for lang_code in ('co', 'fr', 'it'):
    word_frequency(files, lang_code)

WordFrequency_corsican_dataset.tsv created with success.
WordFrequency_french_dataset.tsv created with success.
WordFrequency_italian_dataset.tsv created with success.


In [16]:
# Word frequencies for each language, stopwords removed.
for lang_code in ('co', 'fr', 'it'):
    word_frequency(files, lang_code, rem_stopwords=True)

Object -> dataframe.
Done.
WordFrequency_corsican_withoutSW_dataset.tsv created with success.
Object -> dataframe.
Done.
WordFrequency_french_withoutSW_dataset.tsv created with success.
Object -> dataframe.
Done.
WordFrequency_italian_withoutSW_dataset.tsv created with success.


---
### Vocabulaire

In [17]:
def vocabulary_counter(list_files, language, rem_stopwords=False):
    """Write the vocabulary (distinct tokens) of one language to a text file.

    All texts are joined with spaces and tokenized on ``\\w+``. The file
    lists the tokens occurring more than once first, followed by the
    hapax (tokens occurring exactly once), one token per line.

    Args:
        list_files: Paths of the XML files to read.
        language: Language code accepted by ``language_selection``
            ('co', 'fr' or 'it').
        rem_stopwords: When True, the "Texts" column goes through
            ``stopwords_processing.delete_stopwords_from_dataframe``
            before tokenizing. When False, the plain file name is used;
            any other value selects the "_withoutSW" file name.

    Side effects:
        Writes a text file under "./Vocabulary/" (creating the directory
        if needed) and prints a confirmation message.
    """
    name, path = language_selection(language)

    # Read the files given by the caller rather than the notebook-level
    # `files` variable.
    df = data_extraction(list_files, path)

    if rem_stopwords is True:
        df = stopwords_processing.delete_stopwords_from_dataframe(table=df, col_name="Texts")

    corpus = ' '.join(df['Texts'].tolist())

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(corpus)

    frequence = dict(nltk.FreqDist(tokens))
    hapax = hapax_counter(frequence)
    duplicates = duplicates_counter(frequence)
    vocabulary = duplicates + hapax

    if rem_stopwords is False:
        out_path = "./Vocabulary/vocabulary_" + name + ".txt"
        message = "Vocabulary for " + name + " created with success."
    else:
        out_path = "./Vocabulary/vocabulary_" + name + "_withoutSW.txt"
        message = "Vocabulary for " + name + ", without stopwords, created with success."

    os.makedirs("./Vocabulary", exist_ok=True)
    with open(out_path, "w") as f:
        for word in vocabulary:
            f.write(word + '\n')
    print(message)


In [18]:
# Vocabulary files for each language, stopwords kept.
for lang_code in ('co', 'fr', 'it'):
    vocabulary_counter(files, lang_code)

Vocabulary for corsican created with success.
Vocabulary for french created with success.
Vocabulary for italian created with success.


In [19]:
# Vocabulary files for each language, stopwords removed.
for lang_code in ('co', 'fr', 'it'):
    vocabulary_counter(files, lang_code, rem_stopwords=True)

Object -> dataframe.
Done.
Vocabulary for corsican, without stopwords, created with success.
Object -> dataframe.
Done.
Vocabulary for french, without stopwords, created with success.
Object -> dataframe.
Done.
Vocabulary for italian, without stopwords, created with success.


---

### Vocabulaire avec suppression des hapax et mots peu fréquents

In [24]:
def mfw_vocabulary(list_files, language, threshold, rem_stopwords = False):
    """Write the most frequent words of one language to a text file.

    All texts are joined with spaces, tokenized on ``\\w+`` and counted.
    Only tokens occurring at least ``threshold`` times are kept; they are
    written one per line.

    Args:
        list_files: Paths of the XML files to read.
        language: Language code accepted by ``language_selection``
            ('co', 'fr' or 'it').
        threshold: Minimum number of occurrences for a token to be kept.
            It is also embedded in the output file name.
        rem_stopwords: When True, the "Texts" column goes through
            ``stopwords_processing.delete_stopwords_from_dataframe``
            before counting. When False, the plain file name is used; any
            other value selects the "_withoutSW" file name.

    Side effects:
        Prints the size of the restricted and of the full vocabulary,
        writes a text file under "./MFW/" (creating the directory if
        needed) and prints a confirmation message.
    """
    name, path = language_selection(language)

    # Read the files given by the caller rather than the notebook-level
    # `files` variable.
    df = data_extraction(list_files, path)

    if rem_stopwords is True:
        df = stopwords_processing.delete_stopwords_from_dataframe(table=df, col_name="Texts")

    corpus = ' '.join(df['Texts'].tolist())

    # Raw string: '\w' in a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(corpus)

    freqdist = dict(nltk.FreqDist(tokens))
    mfw = {key:val for key, val in freqdist.items() if val >= threshold}
    print("Le vocabulaire restreint contient dorénavant " + str(len(mfw)) + " au lieu de " + str(len(freqdist)))

    if rem_stopwords is False:
        out_path = "./MFW/MFWVocabulary_" + str(threshold) + name + ".txt"
        message = "Vocabulary for " + name + " created with success."
    else:
        out_path = "./MFW/MFWVocabulary_" + str(threshold) + name + "_withoutSW.txt"
        message = "Vocabulary for " + name + ", without stopwords, created with success."

    os.makedirs("./MFW", exist_ok=True)
    with open(out_path, "w") as f:
        for word in mfw:
            f.write(word + '\n')
    print(message)

In [25]:
# Restricted vocabulary (words seen at least 10 times), stopwords kept.
for lang_code in ('co', 'fr', 'it'):
    mfw_vocabulary(files, lang_code, threshold=10)

Le vocabulaire restreint contient dorénavant 7154 au lieu de 86825
Vocabulary for corsican created with success.
Le vocabulaire restreint contient dorénavant 3593 au lieu de 40625
Vocabulary for french created with success.
Le vocabulaire restreint contient dorénavant 2677 au lieu de 38927
Vocabulary for italian created with success.


In [26]:
# Restricted vocabulary (words seen at least 10 times), stopwords removed.
for lang_code in ('co', 'fr', 'it'):
    mfw_vocabulary(files, lang_code, threshold=10, rem_stopwords=True)

Object -> dataframe.
Done.
Le vocabulaire restreint contient dorénavant 6764 au lieu de 86093
Vocabulary for corsican, without stopwords, created with success.
Object -> dataframe.
Done.
Le vocabulaire restreint contient dorénavant 3189 au lieu de 40001
Vocabulary for french, without stopwords, created with success.
Object -> dataframe.
Done.
Le vocabulaire restreint contient dorénavant 2226 au lieu de 38227
Vocabulary for italian, without stopwords, created with success.


---
### Document term matrix

In [74]:
def document_term_matrix(vocabulary, list_files, language, rem_stopwords=False):
    """Count vocabulary terms in every text of one language and save the matrix.

    A ``CountVectorizer`` with default settings is fitted on the lines of
    the vocabulary file and then applied to the extracted texts. The
    saved table is transposed: one row per term, one column per text.

    Args:
        vocabulary: Path of a text file whose lines are used to fit the
            vectorizer.
        list_files: Paths of the XML files to read.
        language: Language code accepted by ``language_selection``
            ('co', 'fr' or 'it').
        rem_stopwords: Only selects the output file name here ("_withoutSW"
            is added unless the value is False); the texts themselves are
            not passed through stopword removal in this function.

    Side effects:
        Writes a TSV file under "./DocumentTermMatrix/" (creating the
        directory if needed) and prints a confirmation message.
    """
    name, path = language_selection(language)

    df = data_extraction(list_files, path)

    corpus = df['Texts'].tolist()

    with open(vocabulary, 'r') as f:
        vocab = f.readlines()

    vectorizer = CountVectorizer()
    # Only the fitted vocabulary is needed; the counts of the vocabulary
    # file itself were computed and discarded before.
    vectorizer.fit(vocab)
    td = vectorizer.transform(corpus)

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); keep a fallback for old versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    df1 = pd.DataFrame(td.toarray(), columns=terms)
    term_document_matrix = df1.T

    if rem_stopwords is False:
        file_name = "DocumentTermMatrix_" + name + "_dataset.tsv"
    else:
        file_name = "DocumentTermMatrix_" + name + "_withoutSW_dataset.tsv"

    os.makedirs("./DocumentTermMatrix", exist_ok=True)
    # `sep` is passed by keyword: positional arguments after the path are
    # deprecated in recent pandas releases.
    term_document_matrix.to_csv("./DocumentTermMatrix/" + file_name, sep='\t')
    print(file_name + " created with success.")

In [76]:
# (vocabulary file, language code) for each matrix to build, stopwords kept.
dtm_jobs = (
    ("./MFW/MFWVocabulary_10corsican.txt", 'co'),
    ("./MFW/MFWVocabulary_10french.txt", 'fr'),
    ("./MFW/MFWVocabulary_10italian.txt", 'it'),
)
for vocab_path, lang_code in dtm_jobs:
    document_term_matrix(vocab_path, files, lang_code)



DocumentTermMatrix_corsican_dataset.tsv created with success.




DocumentTermMatrix_french_dataset.tsv created with success.
DocumentTermMatrix_italian_dataset.tsv created with success.




In [79]:
# (vocabulary file, language code) for each matrix to build, using the
# vocabularies computed without stopwords.
dtm_jobs_without_sw = (
    ("./MFW/MFWVocabulary_10corsican_withoutSW.txt", 'co'),
    ("./MFW/MFWVocabulary_10french_withoutSW.txt", 'fr'),
    ("./MFW/MFWVocabulary_10italian_withoutSW.txt", 'it'),
)
for vocab_path, lang_code in dtm_jobs_without_sw:
    document_term_matrix(vocab_path, files, lang_code, rem_stopwords=True)



DocumentTermMatrix_corsican_withoutSW_dataset.tsv created with success.




DocumentTermMatrix_french_withoutSW_dataset.tsv created with success.
DocumentTermMatrix_italian_withoutSW_dataset.tsv created with success.


