### IMPORTATION DES LIBRAIRIES ###

In [None]:
import re
import nltk
import spacy
import gensim
import pandas as pd
from pprint import pprint
from neattext import functions
from wordcloud import WordCloud
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('vader_lexicon')
# ! python -m spacy download fr_core_news_sm
# ! pip install pyLDAvis
# ! pip install bertopic
# ! pip install neattext

### RECUPERATION DES DONNEES ###

In [None]:
# Load the review dataset from CSV; later cells read its "Dates" and "Avis" columns.
dataset = pd.read_csv("./Samples/dataset_police_nationale.csv")

### TRAITEMENT DES DONNEES ###

In [None]:
# Keep only the reviews whose relative date is at most 2 years old.

# Sub-year units (singular or plural): a date expressed with one of them is always recent.
_RECENT_UNIT_RE = re.compile(
    r"\b(?:secondes?|minutes?|heures?|jours?|semaines?|mois)\b", re.IGNORECASE
)
# "<number> an(s)": the number is captured so that "11 ans" is not mistaken for "1 an".
_YEARS_RE = re.compile(r"\b(\d+)\s+ans?\b", re.IGNORECASE)
# The article "un"/"une" as a whole word, so that other words containing "un" are untouched.
_ARTICLE_RE = re.compile(r"\bune?\b", re.IGNORECASE)


def _normalize_date(date):
    """Return `date` with the article "un"/"une" replaced by the digit 1."""
    if not isinstance(date, str):
        return date
    return _ARTICLE_RE.sub("1", date)


def _is_recent(date, max_years):
    """Tell whether a normalised relative date is at most `max_years` years old."""
    if not isinstance(date, str):
        # Missing dates cannot be proven recent, so they are filtered out.
        return False
    if _RECENT_UNIT_RE.search(date):
        return True
    years = _YEARS_RE.search(date)
    return years is not None and int(years.group(1)) <= max_years


def select_dates(data=None, max_years=2):
    """Drop, in place, the rows whose "Dates" value is older than `max_years` years.

    The "Dates" column is expected to hold French relative dates such as
    "il y a 3 jours" or "il y a un an". The column is normalised first
    ("un"/"une" becomes "1"), then every row that is not expressed in a
    sub-year unit or in at most `max_years` years is removed and the index
    is rebuilt from 0.

    Args:
        data: DataFrame to filter. Defaults to the module-level `dataset`.
        max_years: Largest number of years that is still kept (default 2).

    Returns:
        The same DataFrame object that was filtered.
    """
    frame = dataset if data is None else data

    frame["Dates"] = frame["Dates"].apply(_normalize_date)

    # Boolean mask instead of index lists: one pass, no quadratic membership tests.
    recent = frame["Dates"].apply(_is_recent, args=(max_years,)).to_numpy(dtype=bool)

    frame.drop(frame.index[~recent], axis=0, inplace=True)
    frame.reset_index(drop=True, inplace=True)

    return frame

In [None]:
# Review cleaning function

# Stop-word set, built on first use and then shared by every call.
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the French + English stop words plus the project-specific extras."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        words = set(stopwords.words('french'))
        words.update(["tout", "er", "a", "h", "bien"])
        words.update(stopwords.words('english'))
        _STOP_WORDS_CACHE = frozenset(words)
    return _STOP_WORDS_CACHE


def clean_avis(avis):
    """Normalise one review into a lowercase, space-separated string of words.

    Emojis are removed, the text is lowercased, every run of non-word
    characters is replaced by a single space, digits are removed and stop
    words are filtered out.

    Args:
        avis: Raw review text. Any non-string value (for example a missing
            review) is treated as an empty review.

    Returns:
        The cleaned text, or "" when there is nothing to clean.
    """
    if not isinstance(avis, str):
        return ""

    # Loading the corpora once avoids re-reading them for every review.
    stop_words = _get_stop_words()

    cleaned_avis = functions.remove_emojis(avis)
    cleaned_avis = re.sub(r'\W+', ' ', cleaned_avis.lower())  # Non-word runs -> one space, lowercase
    cleaned_avis = re.sub(r'\d+', '', cleaned_avis)  # Remove digits
    return ' '.join(word for word in cleaned_avis.split() if word not in stop_words)

In [None]:
# Keep only the reviews dated at most 2 years ago
dataset = select_dates()

# Clean each review, then tokenise the cleaned text; the resulting token
# lists are stored in the "Processing" column
dataset["Processing"] = dataset["Avis"].apply(
    lambda avis: nltk.word_tokenize(clean_avis(avis))
)

### MODELE CAMEMBERT ###

In [None]:
# CamemBERT sentiment model

SENTIMENT_MODEL = "cmarkea/distilcamembert-base-sentiment"

analyzer = pipeline(
    task='text-classification',
    model=SENTIMENT_MODEL,
    tokenizer=SENTIMENT_MODEL,
)

comments_list = list(dataset["Avis"])

avis_list = []

# Score every review on at most its first 510 characters; slicing leaves
# shorter reviews unchanged
for comment in comments_list:
    phrase = comment[:510]
    avis_list.append(analyzer(phrase, return_all_scores=True))

In [None]:
# Build the sentiment labels: each review gets the sentiment that matches
# its highest-scoring star rating
STAR_TO_SENTIMENT = {
    "1 star": "Négatif",
    "2 stars": "Négatif",
    "3 stars": "Neutre",
    "4 stars": "Positif",
    "5 stars": "Positif",
}

labels_list = []
for review_result in avis_list:
    for candidates in review_result:
        # Highest score among the star ratings proposed for this review
        best_score = max((candidate["score"] for candidate in candidates), default=None)
        for candidate in candidates:
            if candidate["score"] != best_score:
                continue
            sentiment = STAR_TO_SENTIMENT.get(candidate["label"])
            if sentiment is not None:
                labels_list.append(sentiment)

dataset["Labels"] = labels_list

In [None]:
# Keep only the negative reviews

# A boolean mask replaces the two index lists: the former
# "index not in indices_to_keep" test scanned a list for every row.
is_negative = (dataset["Labels"] == "Négatif").to_numpy()

dataset.drop(dataset.index[~is_negative], axis=0, inplace=True)
dataset.reset_index(drop=True, inplace=True)

In [None]:
# Topic generation function
def get_topic(words_list):
    """Build a short topic string from the first four keys of `words_list`.

    The first four keys are split into individual words, and each word is
    kept once, in order of first appearance.

    Args:
        words_list: Mapping whose keys are words or multi-word expressions.

    Returns:
        The distinct words joined by single spaces ("" for an empty mapping).
    """
    leading_keys = list(words_list.keys())[:4]

    # A key may hold several words, so flatten the keys into single words
    words = ' '.join(leading_keys).split()

    # dict.fromkeys keeps the first occurrence of each word, in order
    return ' '.join(dict.fromkeys(words))

In [None]:
# Derive a topic for every review from a WordCloud object

topics_list = []

# The configuration is the same for every review, so one object is reused
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

for tokens in dataset["Processing"]:
    strg = ','.join(tokens)

    try:
        # Compute the word frequencies of this review
        wordcloud.generate(strg)
    except ValueError:
        # WordCloud raises ValueError when the text yields no usable word
        # (for example an empty token list); such a review gets an empty
        # topic instead of aborting the whole loop
        topics_list.append("")
        continue

    # wordcloud.words_ maps each retained word to its relative frequency
    topics_list.append(get_topic(wordcloud.words_))

dataset["Topics"] = topics_list