## Création d'un fichier CSV

In [1]:
import os
import pandas as pd
from random import sample
import stanza, spacy



In [2]:
def segment_sentences(text, language):
    if language == "ar":
        nlp = stanza.Pipeline(lang="ar", processors="tokenize", tokenize_no_ssplit=True)
        doc = nlp(text)
        sentences = [" ".join([token.text for token in sentence.tokens]) for sentence in doc.sentences]
    elif language == "ja":
        nlp = spacy.load("ja_core_news_sm")
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
    elif language == "zh":
        nlp = spacy.load("zh_core_web_sm")
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
    else:
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
    return sentences

In [3]:
def create_csv(input_folder, output_csv):
    data = []
    for file_name in os.listdir(input_folder):
        if file_name.endswith(".txt"):
            parts = file_name.split("_")
            if len(parts) >= 2:
                language = parts[1].split(".")[0]  # Extraire la langue avant le "."
                with open(os.path.join(input_folder, file_name), "r", encoding="utf-8") as file:
                    text = file.read()
                sentences = segment_sentences(text, language)
                data.extend([(language, sentence) for sentence in sentences])

    df = pd.DataFrame(data, columns=["labels", "text"])
    # Mélanger les lignes du DataFrame
    df = df.sample(frac=1).reset_index(drop=True)
    
    # Créer les répertoires nécessaires s'ils n'existent pas déjà
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    # Sauvegarder le fichier CSV
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Le fichier de sortie CSV est bien généré : {output_csv}")

In [4]:
def main():
    input_folder = "./results/fichiers_clean/"
    output_csv = "./results/CSV/result.csv"
    create_csv(input_folder, output_csv)

In [5]:
if __name__ == "__main__":
    main()

2024-05-12 00:55:22 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:55:22 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:55:22 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:55:22 INFO: Using device: cpu
2024-05-12 00:55:22 INFO: Loading: tokenize
2024-05-12 00:55:23 INFO: Loading: mwt
2024-05-12 00:55:23 INFO: Done loading processors!
2024-05-12 00:55:32 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:55:32 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:55:32 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:55:32 INFO: Using device: cpu
2024-05-12 00:55:32 INFO: Loading: tokenize
2024-05-12 00:55:32 INFO: Loading: mwt
2024-05-12 00:55:32 INFO: Done loading processors!
2024-05-12 00:55:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:55:40 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:55:40 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:55:40 INFO: Using device: cpu
2024-05-12 00:55:40 INFO: Loading: tokenize
2024-05-12 00:55:40 INFO: Loading: mwt
2024-05-12 00:55:40 INFO: Done loading processors!
2024-05-12 00:55:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:55:46 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:55:46 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:55:46 INFO: Using device: cpu
2024-05-12 00:55:46 INFO: Loading: tokenize
2024-05-12 00:55:46 INFO: Loading: mwt
2024-05-12 00:55:46 INFO: Done loading processors!
2024-05-12 00:55:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:55:54 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:55:54 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:55:54 INFO: Using device: cpu
2024-05-12 00:55:54 INFO: Loading: tokenize
2024-05-12 00:55:54 INFO: Loading: mwt
2024-05-12 00:55:54 INFO: Done loading processors!
2024-05-12 00:56:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:56:07 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:56:07 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:56:07 INFO: Using device: cpu
2024-05-12 00:56:07 INFO: Loading: tokenize
2024-05-12 00:56:07 INFO: Loading: mwt
2024-05-12 00:56:07 INFO: Done loading processors!
2024-05-12 00:56:29 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:56:29 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:56:29 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:56:29 INFO: Using device: cpu
2024-05-12 00:56:29 INFO: Loading: tokenize
2024-05-12 00:56:29 INFO: Loading: mwt
2024-05-12 00:56:29 INFO: Done loading processors!
2024-05-12 00:56:29 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:56:29 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:56:29 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:56:29 INFO: Using device: cpu
2024-05-12 00:56:29 INFO: Loading: tokenize
2024-05-12 00:56:29 INFO: Loading: mwt
2024-05-12 00:56:29 INFO: Done loading processors!
2024-05-12 00:56:42 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:56:42 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:56:42 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:56:42 INFO: Using device: cpu
2024-05-12 00:56:42 INFO: Loading: tokenize
2024-05-12 00:56:42 INFO: Loading: mwt
2024-05-12 00:56:42 INFO: Done loading processors!
2024-05-12 00:56:49 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:56:49 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:56:49 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:56:49 INFO: Using device: cpu
2024-05-12 00:56:49 INFO: Loading: tokenize
2024-05-12 00:56:49 INFO: Loading: mwt
2024-05-12 00:56:49 INFO: Done loading processors!
2024-05-12 00:57:05 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:57:06 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:57:06 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:57:06 INFO: Using device: cpu
2024-05-12 00:57:06 INFO: Loading: tokenize
2024-05-12 00:57:06 INFO: Loading: mwt
2024-05-12 00:57:06 INFO: Done loading processors!
2024-05-12 00:57:09 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:57:09 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:57:09 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:57:09 INFO: Using device: cpu
2024-05-12 00:57:09 INFO: Loading: tokenize
2024-05-12 00:57:09 INFO: Loading: mwt
2024-05-12 00:57:09 INFO: Done loading processors!
2024-05-12 00:57:13 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:57:13 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:57:13 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:57:13 INFO: Using device: cpu
2024-05-12 00:57:13 INFO: Loading: tokenize
2024-05-12 00:57:13 INFO: Loading: mwt
2024-05-12 00:57:13 INFO: Done loading processors!
2024-05-12 00:57:33 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:57:33 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:57:33 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:57:33 INFO: Using device: cpu
2024-05-12 00:57:33 INFO: Loading: tokenize
2024-05-12 00:57:33 INFO: Loading: mwt
2024-05-12 00:57:33 INFO: Done loading processors!
2024-05-12 00:57:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:57:34 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:57:34 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:57:34 INFO: Using device: cpu
2024-05-12 00:57:34 INFO: Loading: tokenize
2024-05-12 00:57:34 INFO: Loading: mwt
2024-05-12 00:57:35 INFO: Done loading processors!
2024-05-12 00:57:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:57:37 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:57:37 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:57:37 INFO: Using device: cpu
2024-05-12 00:57:37 INFO: Loading: tokenize
2024-05-12 00:57:37 INFO: Loading: mwt
2024-05-12 00:57:37 INFO: Done loading processors!
2024-05-12 00:57:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:57:39 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:57:39 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:57:39 INFO: Using device: cpu
2024-05-12 00:57:39 INFO: Loading: tokenize
2024-05-12 00:57:39 INFO: Loading: mwt
2024-05-12 00:57:39 INFO: Done loading processors!
2024-05-12 00:58:07 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:58:07 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:58:07 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:58:07 INFO: Using device: cpu
2024-05-12 00:58:07 INFO: Loading: tokenize
2024-05-12 00:58:07 INFO: Loading: mwt
2024-05-12 00:58:07 INFO: Done loading processors!
2024-05-12 00:58:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:58:22 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:58:22 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:58:22 INFO: Using device: cpu
2024-05-12 00:58:22 INFO: Loading: tokenize
2024-05-12 00:58:22 INFO: Loading: mwt
2024-05-12 00:58:22 INFO: Done loading processors!
2024-05-12 00:58:22 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-12 00:58:23 INFO: Downloaded file to /home/zia/stanza_resources/resources.json
2024-05-12 00:58:23 INFO: Loading these models for language: ar (Arabic):
| Processor | Package |
-----------------------
| tokenize  | padt    |
| mwt       | padt    |

2024-05-12 00:58:23 INFO: Using device: cpu
2024-05-12 00:58:23 INFO: Loading: tokenize
2024-05-12 00:58:23 INFO: Loading: mwt
2024-05-12 00:58:23 INFO: Done loading processors!


Le fichier de sortie CSV est bien généré : ./results/CSV/result.csv
