In [1]:
# Importation des librairies nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import pickle
import tensorflow as tf
import gensim
import nltk


import mlflow
import mlflow.keras
from sklearn.metrics import roc_auc_score, accuracy_score
import pickle
import numpy as np


# Importation de Scikit-learn pour les modèles et métriques
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (accuracy_score, recall_score, f1_score, roc_auc_score, 
                             confusion_matrix, roc_curve)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Importation des modules de traitement de texte NLTK
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Importation de TensorFlow et Keras pour le Deep Learning
from tensorflow.keras import backend as K
from tensorflow.keras import utils, layers, metrics as kmetrics
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Dense, Dropout, Embedding, LSTM, Bidirectional, 
                                     TimeDistributed, Flatten, GlobalAveragePooling1D)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Importation de Gensim pour le traitement des modèles Word2Vec
from gensim.models import Word2Vec

# Importation de XGBoost
from xgboost import XGBClassifier

# Définition du chemin des données
path_data = '/Users/chretien/OpenClassroom/Openclassroom7/'


# 1. Import et traitement 

In [2]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", sep=',', encoding='ISO-8859-1', header=None,names=['target', 'id', 'date', 'flag', 'user', 'text'])
df.head(5)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Garder uniquement colonnes target et text 
df = df[['target', 'text']]

# Remplacer target 4 par 1
df["target"] = df["target"].replace(4, 1)

# Sample
df_sample = df.groupby('target', as_index=False).apply(lambda x : x.sample(frac=0.001))
df_sample

Unnamed: 0,Unnamed: 1,target,text
0,774733,0,me want to watch transformers too
0,455075,0,Wow he said he's retiring! Sad!
0,527542,0,My car is broken - overheating. (Engine smokin...
0,60790,0,end of the festivities.. fuck
0,273179,0,Chris and I are now 520 points behind @binnsy ...
...,...,...,...
1,1362765,1,I enjoy your replies
1,1392443,1,Fun thing for today: Finding saved emails from...
1,1213913,1,@susanam90210 Thanks for liking my Mariachi pi...
1,1337828,1,"has had a bitch of a day. But hey, I'm still s..."


In [4]:
# Tokenizer

def tokenizer_fct(sentence) :
    word_tokens = word_tokenize(sentence)
    # print(word_tokens)
    return word_tokens


# Tokenizer split

def tokenizer_split_fct(sentence) :
    word_tokens = sentence.split(' ')
    # print(word_tokens)
    return word_tokens

# Stop words
from nltk.corpus import stopwords
stop_w = list(set(stopwords.words('english')))

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    # print(filtered_w)    
    return filtered_w

# lower case et alpha (not "@")
def lower_alpha_fct(list_words) :
    fw = [w.lower() for w in list_words if w.isalpha()]
    # print(fw)
    return fw

# lower case et alpha (not "@")
def lower_not_user_fct(list_words) :
    fw = [w.lower() for w in list_words if not w.startswith("@")]
    # print(fw
    return fw




#------------------------------Lemmatizer-----------------------------------


def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w
    
#------------------------------Stemming-----------------------------------


def stemma_fct(list_words) :
    stemming = PorterStemmer()
    stemma_w = [stemming.stem(w) for w in list_words]
    return stemma_w


#-------------------# Fonction de préparation des tweets----------------------------


# Fonction de préparation des questions
def transform_text(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    #lem_w = lemma_fct(lw)
    filtered_w = stop_word_filter_fct(lw)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence


# Fonction de préparation des questions
def transform_text_lem(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    lem_w = lemma_fct(lw)
    filtered_w = stop_word_filter_fct(lem_w)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence


# Fonction de préparation des questions
def transform_text_stemma(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    stemma_w = stemma_fct(lw)
    filtered_w = stop_word_filter_fct(stemma_w)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence



In [5]:
# afficher DataFrame clean 

tweets = pd.DataFrame()
tweets['target'] = df_sample['target']
tweets['text_base'] = df_sample['text'].apply(lambda x : transform_text(x))
tweets['text_lemma'] = df_sample['text'].apply(lambda x : transform_text_lem(x))
tweets['text_stem'] = df_sample['text'].apply(lambda x : transform_text_stemma(x))
tweets

Unnamed: 0,Unnamed: 1,target,text_base,text_lemma,text_stem
0,774733,0,want watch transformers,want watch transformer,want watch transform
0,455075,0,wow said he's retiring! sad!,wow said he's retiring! sad!,wow said he' retiring! sad!
0,527542,0,car broken - overheating. (engine smoking = ba...,car broken - overheating. (engine smoking = ba...,car broken - overheating. (engin smoke = bad) ...
0,60790,0,end festivities.. fuck,end festivities.. fuck,end festivities.. fuck
0,273179,0,chris 520 points behind sair.,chris 520 point behind sair.,chri 520 point behind sair.
...,...,...,...,...,...
1,1362765,1,enjoy replies,enjoy reply,enjoy repli
1,1392443,1,fun thing today: finding saved emails friends ...,fun thing today: finding saved email friend ye...,fun thing today: find save email friend year a...
1,1213913,1,thanks liking mariachi pic. i'll try find anot...,thanks liking mariachi pic. i'll try find anot...,thank like mariachi pic. i'll tri find anoth r...
1,1337828,1,"bitch day. hey, i'm still smiling","bitch day. hey, i'm still smiling","bitch day. hey, i'm still smile"


In [6]:
# Split Train test 
train0, df_test = train_test_split(tweets, test_size=0.2, random_state=42, shuffle=True)

# Split Train Val 
df_train, df_val = train_test_split(train0, test_size=0.25, random_state=42, shuffle=True)


In [7]:
y_train = df_train['target']
y_val = df_val['target']
y_test = df_test['target']

# W2V - Fonction

In [8]:
w2v_size=200 # Chaque mot sera représenté par un vecteur de taille 200
w2v_window=5 # le modèle regarde les 5 mots précédents et les 5 mots suivants
w2v_min_count=1 # tous les mots présents au moins une fois dans le corpus seront pris en compte dans l'apprentissage.
w2v_epochs=100 # Le modèle s'entraîne sur 100 itérations du corpus.
maxlen=200

 # Définition des paramètres en dur
max_sequence_len = 36  # Longueur maximale des séquences après padding



In [9]:
# Transforme notre target(0 ou 1) en un vecteur de 2 dimensions
# Uniquement pour W2V

def label_encode_fct(y_train, y_val, y_test) :
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    #print(y_train)
    num_classes = len(label_encoder.classes_)
    #print(num_classes)
    y_train = utils.to_categorical(y_train, num_classes)
    y_val = utils.to_categorical(y_val, num_classes)
    y_test = utils.to_categorical(y_test, num_classes)
    #print(y_train)

    return y_train, y_val, y_test

y_train, y_val, y_test = label_encode_fct(y_train,y_val, y_test)



In [10]:
import gensim
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_and_tokenize(train, val, test):
    """
    Fonction pour découper les textes en mots, les tokeniser, créer un modèle Word2Vec,
    et retourner le tokenizer, la taille du vocabulaire, les données tokenisées/padées, et les vecteurs Word2Vec.
    
    Args:
        train (pandas DataFrame): DataFrame contenant les textes d'entraînement (colonne 'text').
        val (pandas DataFrame): DataFrame contenant les textes de validation (colonne 'text').
        test (pandas DataFrame): DataFrame contenant les textes de test (colonne 'text').

    Returns:
        tokenizer (Tokenizer): Le tokenizer entraîné sur les textes d'entraînement.
        vocab_size (int): La taille du vocabulaire (nombre total de mots uniques + 1).
        x_train (numpy array): Données d'entraînement tokenisées et padées.
        x_val (numpy array): Données de validation tokenisées et padées.
        x_test (numpy array): Données de test tokenisées et padées.
        model_vectors (KeyedVectors): Vecteurs Word2Vec.
        w2v_words (list): Liste des mots du vocabulaire Word2Vec.
    """

   
    # Découper les textes en mots
    tweet_train_prep = [gensim.utils.simple_preprocess(text) for text in train]
    tweet_val_prep = [gensim.utils.simple_preprocess(text) for text in val]
    tweet_test_prep = [gensim.utils.simple_preprocess(text) for text in test]

    # Création du modèle Word2Vec
    w2v_model = gensim.models.Word2Vec(sentences=tweet_train_prep, vector_size=w2v_size, min_count=1, sg=1)
    model_vectors = w2v_model.wv
    w2v_words = model_vectors.index_to_key

    # Initialisation du tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweet_train_prep)
    
    # Tokenization et padding
    x_train = pad_sequences(tokenizer.texts_to_sequences(tweet_train_prep), maxlen=max_sequence_len, padding='post')
    x_val = pad_sequences(tokenizer.texts_to_sequences(tweet_val_prep), maxlen=max_sequence_len, padding='post')
    x_test = pad_sequences(tokenizer.texts_to_sequences(tweet_test_prep), maxlen=max_sequence_len, padding='post')
    
    # Taille du vocabulaire (+1 pour inclure le token 0 réservé au padding)
    vocab_size = len(tokenizer.word_index) + 1

    return tokenizer, vocab_size, x_train, x_val, x_test, model_vectors


In [11]:
import numpy as np

def create_embedding_matrix(tokenizer, model_vectors):
    """
    Crée une matrice d'embeddings à partir du tokenizer et du modèle de vecteurs de mots.

    Parameters:
    tokenizer (Tokenizer): Un objet Tokenizer de Keras utilisé pour transformer les mots en indices.
    model_vectors (KeyedVectors): Un modèle de vecteurs de mots de Gensim, où les clés sont les mots et les valeurs sont les vecteurs de mots.
    w2v_size (int): La taille des vecteurs d'embeddings.

    Returns:
    tuple: La matrice d'embeddings (numpy array) et le taux de couverture des mots (float).
    """
    
    # Index des mots dans le tokenizer
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1  # +1 pour le padding
    embedding_matrix = np.zeros((vocab_size, w2v_size))

    # Comptage du nombre de mots et de vecteurs trouvés
    num_words_with_vectors = 0
    total_words = len(word_index)
    
    for word, idx in word_index.items():
        if word in model_vectors:
            num_words_with_vectors += 1
            embedding_vector = model_vectors[word]
            if embedding_vector is not None and len(embedding_vector) == w2v_size:
                embedding_matrix[idx] = embedding_vector

    # Calcul du taux de couverture des mots
    word_rate = np.round(num_words_with_vectors / total_words, 4)
    
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    print("Word coverage rate: %.4f" % word_rate)
    
    return embedding_matrix


In [12]:
def build_model_fct(embedding_matrix, vocab_size ) : 
    print("Build Keras model ...")

    dropout_level = 0.2

    k_model = Sequential()
    k_model.add(Embedding(vocab_size,
                        w2v_size,
                        weights=[embedding_matrix],
                        input_length=max_sequence_len,
                        trainable=True))

    k_model.add(Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.2, return_sequences=True)))
    k_model.add(GlobalAveragePooling1D())
    k_model.add(Dense(32, activation='relu'))
    k_model.add(Dropout(dropout_level))
    k_model.add(Dense(2, activation='softmax'))

    k_model.compile(loss='categorical_crossentropy',
                                optimizer='adam',
    #                            metrics=[kmetrics.AUC()])
                            metrics=['accuracy'])
    k_model.build(input_shape=(None, 200))  # None pour le batch size variable

    print(k_model.summary())

    return k_model
 

# MLFLOW - Import

In [13]:
import mlflow

# Ajouter l'URI du serveur de suivi MLflow
mlflow.set_tracking_uri('http://127.0.0.1:5003')

# Nom de l'expérience
experiment_name = "Model_avancee"

# Configurer MLflow pour utiliser l'expérience
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)


<Experiment: artifact_location='mlflow-artifacts:/581637648930145205', creation_time=1730192161956, experiment_id='581637648930145205', last_update_time=1730192161956, lifecycle_stage='active', name='Model_avancee', tags={}>

# W2V - Base

In [14]:
import mlflow
import mlflow.keras
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

# Entraîner et enregistrer le premier modèle
def train_model_base_w2v():
    train = df_train['text_base']
    val = df_val['text_base']
    test = df_test['text_base']

    # Appliquer la fonction de prétraitement
    tokenizer_1, vocab_size_1, x_train, x_val, x_test, model_vectors_1 = preprocess_and_tokenize(train, val, test)

    # Calcul de la matrice d'embedding
    embedding_matrix_1 = create_embedding_matrix(tokenizer_1, model_vectors_1)
    embedding_matrix_1 = np.array(embedding_matrix_1)  # Conversion en tableau Numpy

    # Démarrer un run pour le premier modèle avec MLflow
    with mlflow.start_run(run_name="Model_Base_W2V"):

        # Enregistrer les hyperparamètres
        mlflow.log_param("vocab_size", vocab_size_1)
        mlflow.log_param("epochs", 50)
        mlflow.log_param("batch_size", 128)
        mlflow.log_param("early_stopping_patience", 5)

        # Création du modèle
        model_1 = build_model_fct(embedding_matrix_1, vocab_size_1)

        # Callbacks
        model_save_path = 'path_data/models/Model_base_W2V.weights.h5'
        save_model = ModelCheckpoint(filepath=model_save_path,
                                     save_weights_only=True,
                                     monitor='val_accuracy',
                                     mode='max',
                                     save_best_only=True,
                                     verbose=1)

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max')
        callbacks = [save_model, early_stopping]

        # Entraîner le modèle
        history_1 = model_1.fit(x_train, y_train,
                                epochs=50,
                                verbose=True,
                                validation_data=(x_val, y_val),
                                batch_size=128,
                                callbacks=callbacks)

        # Sauvegarder la configuration du modèle avec pickle
        model_config_1 = model_1.get_config()
        with open('path_data/models/Model_base_W2V_config.pkl', 'wb') as f:
            pickle.dump(model_config_1, f)

        # Charger les meilleurs poids du modèle
        model_1.load_weights(model_save_path)

        # Évaluer les performances sur les données de test
        y_pred_proba_1 = model_1.predict(x_test)

        # Calculer les prédictions binaires
        y_pred_1 = (y_pred_proba_1 > 0.5).astype(int)

        # Calculer les métriques de validation
        auc_score_W2V_base = roc_auc_score(y_test, y_pred_proba_1, average='macro')  # Changez 'macro' selon vos besoins
        accuracy_W2V_base = accuracy_score(y_test, y_pred_1)
        precision_W2V_base = precision_score(y_test, y_pred_1, average='macro')  # Changez 'macro' selon vos besoins
        recall_W2V_base = recall_score(y_test, y_pred_1, average='macro')  # Changez 'macro' selon vos besoins
        f1_W2V_base = f1_score(y_test, y_pred_1, average='macro')  # Changez 'macro' selon vos besoins

        # Loguer les métriques dans MLflow
        mlflow.log_metric("AUC", auc_score_W2V_base)
        mlflow.log_metric("Accuracy", accuracy_W2V_base)
        mlflow.log_metric("Precision", precision_W2V_base)
        mlflow.log_metric("Recall", recall_W2V_base)
        mlflow.log_metric("F1 Score", f1_W2V_base)

        # Enregistrer le modèle avec MLflow
        mlflow.keras.log_model(model_1, "Model_Base_W2V")

# Exécuter la fonction pour entraîner le premier modèle
train_model_base_w2v()


Embedding matrix shape: (2981, 200)
Word coverage rate: 1.0000
Build Keras model ...




None
Epoch 1/50
[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 80ms/step - accuracy: 0.4938 - loss: 0.6932
Epoch 1: val_accuracy improved from -inf to 0.47813, saving model to path_data/models/Model_base_W2V.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 162ms/step - accuracy: 0.4926 - loss: 0.6933 - val_accuracy: 0.4781 - val_loss: 0.6935
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.5112 - loss: 0.6915
Epoch 2: val_accuracy improved from 0.47813 to 0.52188, saving model to path_data/models/Model_base_W2V.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 87ms/step - accuracy: 0.5113 - loss: 0.6915 - val_accuracy: 0.5219 - val_loss: 0.6911
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.5448 - loss: 0.6892
Epoch 3: val_accuracy improved from 0.52188 to 0.52812, saving model to path_data/models/Model_base_W2V.weights.h5
[1m



# W2V - Lemma 

In [15]:
import mlflow
import mlflow.keras
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

# Entraîner et enregistrer le deuxième modèle
def train_model_lemma_w2v():
    train = df_train['text_lemma']
    val = df_val['text_lemma']
    test = df_test['text_lemma']

    # Appliquer la fonction de prétraitement
    tokenizer_2, vocab_size_2, x_train, x_val, x_test, model_vectors_2 = preprocess_and_tokenize(train, val, test)

    # Calcul de la matrice d'embedding
    embedding_matrix_2 = create_embedding_matrix(tokenizer_2, model_vectors_2)
    embedding_matrix_2 = np.array(embedding_matrix_2)  # Conversion en tableau Numpy

    # Démarrer un run pour le deuxième modèle avec MLflow
    with mlflow.start_run(run_name="Model_Lemma_W2V"):

        # Enregistrer les hyperparamètres
        mlflow.log_param("vocab_size", vocab_size_2)
        mlflow.log_param("epochs", 50)
        mlflow.log_param("batch_size", 128)
        mlflow.log_param("early_stopping_patience", 5)

        # Création du modèle
        model_2 = build_model_fct(embedding_matrix_2, vocab_size_2)

        # Callbacks
        model_save_path = 'path_data/models/Model_lemma_W2V.weights.h5'
        save_model = ModelCheckpoint(filepath=model_save_path,
                                     save_weights_only=True,
                                     monitor='val_accuracy',
                                     mode='max',
                                     save_best_only=True,
                                     verbose=1)

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max')
        callbacks = [save_model, early_stopping]

        # Entraîner le modèle
        history_2 = model_2.fit(x_train, y_train,
                                epochs=50,
                                verbose=True,
                                validation_data=(x_val, y_val),
                                batch_size=128,
                                callbacks=callbacks)

        # Sauvegarder la configuration du modèle avec pickle
        model_config_2 = model_2.get_config()
        with open('path_data/models/Model_lemma_W2V_config.pkl', 'wb') as f:
            pickle.dump(model_config_2, f)

        # Charger les meilleurs poids du modèle
        model_2.load_weights(model_save_path)

        # Évaluer les performances sur les données de test
        y_pred_proba_2 = model_2.predict(x_test)

        # Calculer les prédictions binaires
        y_pred_2 = (y_pred_proba_2 > 0.5).astype(int)

        # Calculer les métriques de validation
        auc_score_W2V_lemma = roc_auc_score(y_test, y_pred_proba_2, average='macro')  # Changez 'macro' selon vos besoins
        accuracy_W2V_lemma = accuracy_score(y_test, y_pred_2)
        precision_W2V_lemma = precision_score(y_test, y_pred_2, average='macro')  # Changez 'macro' selon vos besoins
        recall_W2V_lemma = recall_score(y_test, y_pred_2, average='macro')  # Changez 'macro' selon vos besoins
        f1_W2V_lemma = f1_score(y_test, y_pred_2, average='macro')  # Changez 'macro' selon vos besoins

        # Loguer les métriques dans MLflow
        mlflow.log_metric("AUC", auc_score_W2V_lemma)
        mlflow.log_metric("Accuracy", accuracy_W2V_lemma)
        mlflow.log_metric("Precision", precision_W2V_lemma)
        mlflow.log_metric("Recall", recall_W2V_lemma)
        mlflow.log_metric("F1 Score", f1_W2V_lemma)

        # Enregistrer le modèle avec MLflow
        mlflow.keras.log_model(model_2, "Model_Lemma_W2V")

# Exécuter la fonction pour entraîner le deuxième modèle
train_model_lemma_w2v()


Embedding matrix shape: (2900, 200)
Word coverage rate: 1.0000
Build Keras model ...




None
Epoch 1/50
[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 71ms/step - accuracy: 0.4890 - loss: 0.6931
Epoch 1: val_accuracy improved from -inf to 0.52188, saving model to path_data/models/Model_lemma_W2V.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 115ms/step - accuracy: 0.4914 - loss: 0.6933 - val_accuracy: 0.5219 - val_loss: 0.6921
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 0.5075 - loss: 0.6923
Epoch 2: val_accuracy did not improve from 0.52188
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 102ms/step - accuracy: 0.5054 - loss: 0.6925 - val_accuracy: 0.5219 - val_loss: 0.6919
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.5356 - loss: 0.6918
Epoch 3: val_accuracy did not improve from 0.52188
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - accuracy: 0.5360 - loss: 0.6918 - val_accuracy: 0.4781 



# W2v - Stemm

In [16]:
import mlflow
import mlflow.keras
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

# Entraîner et enregistrer le troisième modèle
def train_model_stemm_w2v():
    train = df_train['text_stem']
    val = df_val['text_stem']
    test = df_test['text_stem']

    # Appliquer la fonction de prétraitement
    tokenizer_3, vocab_size_3, x_train, x_val, x_test, model_vectors_3 = preprocess_and_tokenize(train, val, test)

    # Calcul de la matrice d'embedding
    embedding_matrix_3 = create_embedding_matrix(tokenizer_3, model_vectors_3)
    embedding_matrix_3 = np.array(embedding_matrix_3)  # Conversion en tableau Numpy

    # Démarrer un run pour le troisième modèle avec MLflow
    with mlflow.start_run(run_name="Model_Stemm_W2V"):

        # Enregistrer les hyperparamètres
        mlflow.log_param("vocab_size", vocab_size_3)
        mlflow.log_param("epochs", 50)
        mlflow.log_param("batch_size", 128)
        mlflow.log_param("early_stopping_patience", 5)

        # Création du modèle
        model_3 = build_model_fct(embedding_matrix_3, vocab_size_3)

        # Callbacks
        model_save_path = 'path_data/models/Model_stemm_W2V.weights.h5'
        save_model = ModelCheckpoint(filepath=model_save_path,
                                     save_weights_only=True,
                                     monitor='val_accuracy',
                                     mode='max',
                                     save_best_only=True,
                                     verbose=1)

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max')
        callbacks = [save_model, early_stopping]

        # Entraîner le modèle
        history_3 = model_3.fit(x_train, y_train,
                                epochs=50,
                                verbose=True,
                                validation_data=(x_val, y_val),
                                batch_size=128,
                                callbacks=callbacks)

        # Sauvegarder la configuration du modèle avec pickle
        model_config_3 = model_3.get_config()
        with open('path_data/models/Model_stemm_W2V_config.pkl', 'wb') as f:
            pickle.dump(model_config_3, f)

        # Charger les meilleurs poids du modèle
        model_3.load_weights(model_save_path)

        # Évaluer les performances sur les données de test
        y_pred_proba_3 = model_3.predict(x_test)

        # Calculer les prédictions binaires
        y_pred_3 = (y_pred_proba_3 > 0.5).astype(int)

        # Calculer les métriques de validation
        auc_score_W2V_stemm = roc_auc_score(y_test, y_pred_proba_3, average='macro')  # Changez 'macro' selon vos besoins
        accuracy_W2V_stemm = accuracy_score(y_test, y_pred_3)
        precision_W2V_stemm = precision_score(y_test, y_pred_3, average='macro')  # Changez 'macro' selon vos besoins
        recall_W2V_stemm = recall_score(y_test, y_pred_3, average='macro')  # Changez 'macro' selon vos besoins
        f1_W2V_stemm = f1_score(y_test, y_pred_3, average='macro')  # Changez 'macro' selon vos besoins

        # Loguer les métriques dans MLflow
        mlflow.log_metric("AUC", auc_score_W2V_stemm)
        mlflow.log_metric("Accuracy", accuracy_W2V_stemm)
        mlflow.log_metric("Precision", precision_W2V_stemm)
        mlflow.log_metric("Recall", recall_W2V_stemm)
        mlflow.log_metric("F1 Score", f1_W2V_stemm)

        # Enregistrer le modèle avec MLflow
        mlflow.keras.log_model(model_3, "Model_Stemm_W2V")

# Exécuter la fonction pour entraîner le troisième modèle
train_model_stemm_w2v()


Embedding matrix shape: (2799, 200)
Word coverage rate: 1.0000
Build Keras model ...




None
Epoch 1/50
[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 68ms/step - accuracy: 0.4986 - loss: 0.6943
Epoch 1: val_accuracy improved from -inf to 0.52188, saving model to path_data/models/Model_stemm_W2V.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - accuracy: 0.4963 - loss: 0.6942 - val_accuracy: 0.5219 - val_loss: 0.6925
Epoch 2/50
[1m7/8[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 80ms/step - accuracy: 0.5084 - loss: 0.6932
Epoch 2: val_accuracy did not improve from 0.52188
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 86ms/step - accuracy: 0.5107 - loss: 0.6932 - val_accuracy: 0.5219 - val_loss: 0.6927
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.4924 - loss: 0.6926
Epoch 3: val_accuracy did not improve from 0.52188
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - accuracy: 0.4919 - loss: 0.6926 - val_accuracy: 0.5219 -



# Glove - Fonction 

In [17]:
y_train = df_train['target']
y_val = df_val['target']
y_test = df_test['target']


num_words=40000
maxlen=200
embedding_dim = 200
filepath = '/Users/chretien/Desktop/OC7/glove.twitter.27B.200d.txt'

In [18]:

def preprocess_texts(train, val, test):
    """
    Prétraite les textes en les transformant en vecteurs de séquences et en les padant.

    Args:
        train_df (pandas DataFrame): DataFrame contenant les textes d'entraînement (colonne 'text').
        val_df (pandas DataFrame): DataFrame contenant les textes de validation (colonne 'text').
        test_df (pandas DataFrame): DataFrame contenant les textes de test (colonne 'text').
        num_words (int): Nombre maximum de mots à utiliser dans le Tokenizer.
        maxlen (int): Longueur maximale des séquences après padding.

    Returns:
        tokenizer (Tokenizer): Le tokenizer entraîné sur les textes d'entraînement.
        vocab_size (int): La taille du vocabulaire (nombre total de mots uniques + 1).
        X_train (numpy array): Données d'entraînement tokenisées et padées.
        X_val (numpy array): Données de validation tokenisées et padées.
        X_test (numpy array): Données de test tokenisées et padées.
    """
    # Initialiser le Tokenizer
    tokenizer = Tokenizer(num_words=num_words)
    
    # Entraîner le Tokenizer sur les textes d'entraînement
    tokenizer.fit_on_texts(train)
    
    # Convertir les textes en séquences d'entiers (tokenization)
    X_train = tokenizer.texts_to_sequences(train)
    X_val = tokenizer.texts_to_sequences(val)
    X_test = tokenizer.texts_to_sequences(test)
    
    # Ajouter 1 pour inclure le token 0 réservé au padding
    vocab_size = len(tokenizer.word_index) + 1
    print("Vocabulary size:", vocab_size)
    
    # Padding des séquences pour avoir une longueur fixe
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
    
    return tokenizer, vocab_size, X_train, X_val, X_test



In [19]:
import numpy as np

def create_embedding_matrix(filepath,vocab_size, word_index):
    """
    Crée une matrice d'embeddings pour les mots en utilisant un fichier de vecteurs de mots.

    Args:
        filepath (str): Chemin vers le fichier contenant les vecteurs de mots (format texte).
        word_index (dict): Dictionnaire où les clés sont les mots et les valeurs sont les indices de ces mots dans le tokenizer.
        embedding_dim (int): La taille des vecteurs d'embeddings.

    Returns:
        np.ndarray: La matrice d'embeddings, où chaque ligne correspond à un vecteur de mot.
    """
    vocab_size = len(word_index) + 1  # Ajouter 1 pour inclure l'index 0 réservé au padding
    embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Initialiser la matrice avec des zéros

    # Comptage pour le taux de couverture des mots
    i = 0
    j = 0

    # Lire le fichier de vecteurs de mots
    with open(filepath, encoding='utf8', errors='ignore') as f:
        for line in f:
            i += 1
            word, *vector = line.split()
            if word in word_index:
                j += 1
                idx = word_index[word]
                # Assigner le vecteur au mot correspondant
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    
    # Calculer le taux de couverture des mots
    word_rate = np.round((j / i) * 100, 4)
    print("Total number of lines read:", i)
    print("Number of words found in the word_index:", j)
    print("Word embedding rate: {}%".format(word_rate))
    
    return embedding_matrix



In [20]:
from keras.models import Sequential
from keras import layers
from keras.layers import Bidirectional, LSTM, GlobalAveragePooling1D, Dense, Dropout

def create_model(vocab_size, embedding_matrix):
    """
    Crée et compile un modèle Keras pour la classification binaire du texte.

    Args:
        vocab_size (int): La taille du vocabulaire (nombre total de mots uniques + 1).
        embedding_dim (int): La taille des vecteurs d'embeddings.
        embedding_matrix (numpy array): La matrice d'embeddings à utiliser pour la couche d'embeddings.
        maxlen (int): Longueur maximale des séquences après padding.

    Returns:
        model (Sequential): Le modèle Keras construit et compilé.
    """
    print("Building Keras model...")

    # Initialisation du modèle séquentiel
    model = Sequential()

    # Couche d'Embedding
    model.add(layers.Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               weights=[embedding_matrix],
                               input_length=maxlen,
                               trainable=True))

    # Couche LSTM bidirectionnelle
    model.add(Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.2, return_sequences=True)))
    
    # Couche de pooling global
    model.add(GlobalAveragePooling1D())
    
    # Couche Dense
    model.add(Dense(32, activation='relu'))
    
    # Couche Dropout
    model.add(Dropout(0.75))
    
    # Couche de sortie avec activation sigmoïde
    model.add(Dense(1, activation='sigmoid'))

    # Compilation du modèle
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Construction du modèle avec une forme d'entrée variable pour le batch size
    model.build(input_shape=(None, maxlen))  # None pour le batch size variable

    # Affichage du résumé du modèle
    print(model.summary())

    return model



# Glove - Base

In [21]:
import mlflow
import mlflow.keras
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

# Entraîner et enregistrer le quatrième modèle
def train_model_base_glove():
    train = df_train['text_base']
    val = df_val['text_base']
    test = df_test['text_base']

    # Prétraiter les données
    tokenizer_4, vocab_size_4, X_train, X_val, X_test = preprocess_texts(train, val, test)

    # Créer la matrice d'embedding
    embedding_matrix_4 = create_embedding_matrix(filepath, vocab_size_4, tokenizer_4.word_index)

    # Démarrer un run pour le quatrième modèle avec MLflow
    with mlflow.start_run(run_name="Model_Base_Glove"):

        # Enregistrer les hyperparamètres
        mlflow.log_param("vocab_size", vocab_size_4)
        mlflow.log_param("epochs", 50)
        mlflow.log_param("batch_size", 128)
        mlflow.log_param("early_stopping_patience", 5)

        # Création du modèle
        model_4 = create_model(vocab_size_4, embedding_matrix_4)

        # Callbacks
        model_save_path = 'path_data/models/Model_base_Glove.weights.h5'
        save_model = ModelCheckpoint(filepath=model_save_path,
                                     save_weights_only=True,
                                     monitor='val_accuracy',
                                     mode='max',
                                     save_best_only=True,
                                     verbose=1)

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max')
        callbacks = [save_model, early_stopping]

        # Entraîner le modèle
        history_4 = model_4.fit(X_train, y_train,
                                epochs=50,
                                verbose=True,
                                validation_data=(X_val, y_val),
                                batch_size=128,
                                callbacks=callbacks)

        # Sauvegarder la configuration du modèle avec pickle
        model_config_4 = model_4.get_config()
        with open('path_data/models/Model_base_Glove_config.pkl', 'wb') as f:
            pickle.dump(model_config_4, f)

        # Charger les meilleurs poids du modèle
        model_4.load_weights(model_save_path)

        # Évaluer les performances sur les données de test
        y_pred_proba_4 = model_4.predict(X_test)

        # Calculer les prédictions binaires
        y_pred_4 = (y_pred_proba_4 > 0.5).astype(int)

        # Calculer les métriques de validation
        auc_score_Glove_base = roc_auc_score(y_test, y_pred_proba_4, average='macro')  # Changez 'macro' selon vos besoins
        accuracy_Glove_base = accuracy_score(y_test, y_pred_4)
        precision_Glove_base = precision_score(y_test, y_pred_4, average='macro')  # Changez 'macro' selon vos besoins
        recall_Glove_base = recall_score(y_test, y_pred_4, average='macro')  # Changez 'macro' selon vos besoins
        f1_Glove_base = f1_score(y_test, y_pred_4, average='macro')  # Changez 'macro' selon vos besoins

        # Loguer les métriques dans MLflow
        mlflow.log_metric("AUC", auc_score_Glove_base)
        mlflow.log_metric("Accuracy", accuracy_Glove_base)
        mlflow.log_metric("Precision", precision_Glove_base)
        mlflow.log_metric("Recall", recall_Glove_base)
        mlflow.log_metric("F1 Score", f1_Glove_base)

        # Enregistrer le modèle avec MLflow
        mlflow.keras.log_model(model_4, "Model_Base_Glove")

# Exécuter la fonction pour entraîner le quatrième modèle
train_model_base_glove()


Vocabulary size: 3121
Total number of lines read: 1193514
Number of words found in the word_index: 2784
Word embedding rate: 0.2333%
Building Keras model...




None
Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488ms/step - accuracy: 0.5076 - loss: 0.6977
Epoch 1: val_accuracy improved from -inf to 0.54062, saving model to path_data/models/Model_base_Glove.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 588ms/step - accuracy: 0.5056 - loss: 0.6980 - val_accuracy: 0.5406 - val_loss: 0.6929
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 671ms/step - accuracy: 0.4909 - loss: 0.6965
Epoch 2: val_accuracy did not improve from 0.54062
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 777ms/step - accuracy: 0.4901 - loss: 0.6965 - val_accuracy: 0.5219 - val_loss: 0.6918
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 685ms/step - accuracy: 0.4992 - loss: 0.6971
Epoch 3: val_accuracy did not improve from 0.54062
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 804ms/step - accuracy: 0.5001 - loss: 0.6970 - val_accuracy: 0.52



# Glove - lemma 

In [22]:
import mlflow
import mlflow.keras
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

# Entraîner et enregistrer le cinquième modèle
def train_model_lemma_glove():
    train = df_train['text_lemma']
    val = df_val['text_lemma']
    test = df_test['text_lemma']

    # Prétraiter les données
    tokenizer_5, vocab_size_5, X_train, X_val, X_test = preprocess_texts(train, val, test)

    # Créer la matrice d'embedding
    embedding_matrix_5 = create_embedding_matrix(filepath, vocab_size_5, tokenizer_5.word_index)

    # Démarrer un run pour le cinquième modèle avec MLflow
    with mlflow.start_run(run_name="Model_Lemma_Glove"):

        # Enregistrer les hyperparamètres
        mlflow.log_param("vocab_size", vocab_size_5)
        mlflow.log_param("epochs", 50)
        mlflow.log_param("batch_size", 128)
        mlflow.log_param("early_stopping_patience", 5)

        # Création du modèle
        model_5 = create_model(vocab_size_5, embedding_matrix_5)

        # Callbacks
        model_save_path = 'path_data/models/Model_lemma_Glove.weights.h5'
        save_model = ModelCheckpoint(filepath=model_save_path,
                                     save_weights_only=True,
                                     monitor='val_accuracy',
                                     mode='max',
                                     save_best_only=True,
                                     verbose=1)

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max')
        callbacks = [save_model, early_stopping]

        # Entraîner le modèle
        history_5 = model_5.fit(X_train, y_train,
                                epochs=50,
                                verbose=True,
                                validation_data=(X_val, y_val),
                                batch_size=128,
                                callbacks=callbacks)

        # Sauvegarder la configuration du modèle avec pickle
        model_config_5 = model_5.get_config()
        with open('path_data/models/Model_lemma_Glove_config.pkl', 'wb') as f:
            pickle.dump(model_config_5, f)

        # Charger les meilleurs poids du modèle
        model_5.load_weights(model_save_path)

        # Évaluer les performances sur les données de test
        y_pred_proba_5 = model_5.predict(X_test)

        # Calculer les prédictions binaires
        y_pred_5 = (y_pred_proba_5 > 0.5).astype(int)

        # Calculer les métriques de validation
        auc_score_Glove_lemma = roc_auc_score(y_test, y_pred_proba_5, average='macro')  # Changez 'macro' selon vos besoins
        accuracy_Glove_lemma = accuracy_score(y_test, y_pred_5)
        precision_Glove_lemma = precision_score(y_test, y_pred_5, average='macro')  # Changez 'macro' selon vos besoins
        recall_Glove_lemma = recall_score(y_test, y_pred_5, average='macro')  # Changez 'macro' selon vos besoins
        f1_Glove_lemma = f1_score(y_test, y_pred_5, average='macro')  # Changez 'macro' selon vos besoins

        # Loguer les métriques dans MLflow
        mlflow.log_metric("AUC", auc_score_Glove_lemma)
        mlflow.log_metric("Accuracy", accuracy_Glove_lemma)
        mlflow.log_metric("Precision", precision_Glove_lemma)
        mlflow.log_metric("Recall", recall_Glove_lemma)
        mlflow.log_metric("F1 Score", f1_Glove_lemma)

        # Enregistrer le modèle avec MLflow
        mlflow.keras.log_model(model_5, "Model_Lemma_Glove")

# Exécuter la fonction pour entraîner le cinquième modèle
train_model_lemma_glove()


Vocabulary size: 3038
Total number of lines read: 1193514
Number of words found in the word_index: 2704
Word embedding rate: 0.2266%
Building Keras model...




None
Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386ms/step - accuracy: 0.4771 - loss: 0.6948
Epoch 1: val_accuracy improved from -inf to 0.47813, saving model to path_data/models/Model_lemma_Glove.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 479ms/step - accuracy: 0.4772 - loss: 0.6949 - val_accuracy: 0.4781 - val_loss: 0.6945
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 667ms/step - accuracy: 0.5142 - loss: 0.6949
Epoch 2: val_accuracy did not improve from 0.47813
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 779ms/step - accuracy: 0.5133 - loss: 0.6950 - val_accuracy: 0.4781 - val_loss: 0.6933
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 752ms/step - accuracy: 0.4736 - loss: 0.6948
Epoch 3: val_accuracy improved from 0.47813 to 0.52188, saving model to path_data/models/Model_lemma_Glove.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[



# Glove - stemm

In [23]:
import mlflow
import mlflow.keras
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

# Entraîner et enregistrer le sixième modèle
def train_model_stemm_glove():
    train = df_train['text_stem']
    val = df_val['text_stem']
    test = df_test['text_stem']

    # Prétraiter les données
    tokenizer_6, vocab_size_6, X_train, X_val, X_test = preprocess_texts(train, val, test)

    # Créer la matrice d'embedding
    embedding_matrix_6 = create_embedding_matrix(filepath, vocab_size_6, tokenizer_6.word_index)

    # Démarrer un run pour le sixième modèle avec MLflow
    with mlflow.start_run(run_name="Model_Stemm_Glove"):

        # Enregistrer les hyperparamètres
        mlflow.log_param("vocab_size", vocab_size_6)
        mlflow.log_param("epochs", 50)
        mlflow.log_param("batch_size", 128)
        mlflow.log_param("early_stopping_patience", 5)

        # Création du modèle
        model_6 = create_model(vocab_size_6, embedding_matrix_6)

        # Callbacks
        model_save_path = 'path_data/models/Model_stemm_Glove.weights.h5'
        save_model = ModelCheckpoint(filepath=model_save_path,
                                     save_weights_only=True,
                                     monitor='val_accuracy',
                                     mode='max',
                                     save_best_only=True,
                                     verbose=1)

        early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max')
        callbacks = [save_model, early_stopping]

        # Entraîner le modèle
        history_6 = model_6.fit(X_train, y_train,
                                epochs=50,
                                verbose=True,
                                validation_data=(X_val, y_val),
                                batch_size=128,
                                callbacks=callbacks)

        # Sauvegarder la configuration du modèle avec pickle
        model_config_6 = model_6.get_config()
        with open('path_data/models/Model_stemm_Glove_config.pkl', 'wb') as f:
            pickle.dump(model_config_6, f)

        # Charger les meilleurs poids du modèle
        model_6.load_weights(model_save_path)

        # Évaluer les performances sur les données de test
        y_pred_proba_6 = model_6.predict(X_test)

        # Calculer les prédictions binaires
        y_pred_6 = (y_pred_proba_6 > 0.5).astype(int)

        # Calculer les métriques de validation
        auc_score_Glove_stemm = roc_auc_score(y_test, y_pred_proba_6, average='macro')  # Changez 'macro' selon vos besoins
        accuracy_Glove_stemm = accuracy_score(y_test, y_pred_6)
        precision_Glove_stemm = precision_score(y_test, y_pred_6, average='macro')  # Changez 'macro' selon vos besoins
        recall_Glove_stemm = recall_score(y_test, y_pred_6, average='macro')  # Changez 'macro' selon vos besoins
        f1_Glove_stemm = f1_score(y_test, y_pred_6, average='macro')  # Changez 'macro' selon vos besoins

        # Loguer les métriques dans MLflow
        mlflow.log_metric("AUC", auc_score_Glove_stemm)
        mlflow.log_metric("Accuracy", accuracy_Glove_stemm)
        mlflow.log_metric("Precision", precision_Glove_stemm)
        mlflow.log_metric("Recall", recall_Glove_stemm)
        mlflow.log_metric("F1 Score", f1_Glove_stemm)

        # Enregistrer le modèle avec MLflow
        mlflow.keras.log_model(model_6, "Model_Stemm_Glove")

# Exécuter la fonction pour entraîner le sixième modèle
train_model_stemm_glove()


Vocabulary size: 2941
Total number of lines read: 1193514
Number of words found in the word_index: 2504
Word embedding rate: 0.2098%
Building Keras model...




None
Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 524ms/step - accuracy: 0.5284 - loss: 0.6973
Epoch 1: val_accuracy improved from -inf to 0.47813, saving model to path_data/models/Model_stemm_Glove.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 626ms/step - accuracy: 0.5262 - loss: 0.6977 - val_accuracy: 0.4781 - val_loss: 0.6954
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 660ms/step - accuracy: 0.4977 - loss: 0.6959
Epoch 2: val_accuracy improved from 0.47813 to 0.52188, saving model to path_data/models/Model_stemm_Glove.weights.h5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 780ms/step - accuracy: 0.4971 - loss: 0.6962 - val_accuracy: 0.5219 - val_loss: 0.6918
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 605ms/step - accuracy: 0.5046 - loss: 0.6929
Epoch 3: val_accuracy did not improve from 0.52188
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# test mlflow

!mlflow ui

# Sauvegarder mes transformers et model pour le best model 