## Création d'un fichier CSV

In [None]:
import os
import pandas as pd
from random import sample
import stanza, spacy

In [None]:
# Loaded NLP pipelines, keyed by model identifier, so that each model is
# loaded at most once per process instead of once per call.
_PIPELINE_CACHE = {}

# spaCy model used for each explicitly supported language code.
_SPACY_MODELS = {"ja": "ja_core_news_sm", "zh": "zh_core_web_sm"}

# spaCy model used for every language code not handled explicitly.
_DEFAULT_SPACY_MODEL = "en_core_web_sm"


def _get_pipeline(language):
    """Return the pipeline for ``language``, loading it on first use only."""
    if language == "ar":
        cache_key = "stanza:ar"
    else:
        cache_key = _SPACY_MODELS.get(language, _DEFAULT_SPACY_MODEL)

    nlp = _PIPELINE_CACHE.get(cache_key)
    if nlp is None:
        if language == "ar":
            # NOTE(review): per the Stanza documentation, tokenize_no_ssplit=True
            # disables Stanza's own sentence splitter and treats blank-line
            # separated chunks as sentences -- confirm this is intended.
            nlp = stanza.Pipeline(
                lang="ar", processors="tokenize", tokenize_no_ssplit=True
            )
        else:
            nlp = spacy.load(cache_key)
        _PIPELINE_CACHE[cache_key] = nlp
    return nlp


def segment_sentences(text, language):
    """Split ``text`` into a list of sentence strings.

    Args:
        text: The raw text to segment.
        language: Language code selecting the pipeline. ``"ar"`` uses a
            Stanza tokenizer, ``"ja"`` and ``"zh"`` use their spaCy models,
            and any other value falls back to the English spaCy model.

    Returns:
        A list of strings, one per sentence. For Arabic each sentence is
        rebuilt by joining its token texts with single spaces; for the other
        languages it is the sentence span's text as produced by spaCy.
    """
    doc = _get_pipeline(language)(text)
    if language == "ar":
        return [
            " ".join(token.text for token in sentence.tokens)
            for sentence in doc.sentences
        ]
    return [sent.text for sent in doc.sents]

In [None]:
def create_csv(input_folder, output_csv):
    """Build a shuffled (label, sentence) CSV from the text files of a folder.

    Every ``.txt`` file in ``input_folder`` whose name contains at least one
    underscore is read as UTF-8. Its language code is taken from the second
    underscore-separated part of the file name, up to that part's first dot.
    The text is split with ``segment_sentences`` and each sentence becomes
    one row labelled with the language code.

    Args:
        input_folder: Directory containing the input ``.txt`` files.
        output_csv: Path of the CSV file to write. Missing parent
            directories are created; a bare file name (no directory part)
            is written to the current working directory.

    Returns:
        None. The CSV (columns ``labels`` and ``text``) is written to
        ``output_csv`` and a confirmation message is printed.
    """
    data = []
    for file_name in os.listdir(input_folder):
        if not file_name.endswith(".txt"):
            continue
        parts = file_name.split("_")
        if len(parts) < 2:
            continue
        # Extract the language code that precedes the first "."
        language = parts[1].split(".")[0]
        with open(os.path.join(input_folder, file_name), "r", encoding="utf-8") as file:
            text = file.read()
        sentences = segment_sentences(text, language)
        data.extend((language, sentence) for sentence in sentences)

    df = pd.DataFrame(data, columns=["labels", "text"])
    # Shuffle the rows of the DataFrame
    df = df.sample(frac=1).reset_index(drop=True)

    # Create the output directory if needed. os.path.dirname returns "" for a
    # bare file name, and os.makedirs("") raises FileNotFoundError, so only
    # call it when there is an actual directory component.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Save the CSV file
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Le fichier de sortie CSV est bien généré : {output_csv}")

In [None]:
def main():
    """Run the CSV generation with the project's default paths."""
    create_csv(
        input_folder="./results/fichiers_clean/",
        output_csv="./results/CSV/result.csv",
    )

In [None]:
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()