# Modèle sur mesure avancé — TensorFlow / Keras

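Ce notebook entraîne un modèle de deep learning (Embedding + LSTM bidirectionnel) à partir des données préparées :

- `data/processed/train.csv` (entraînement)
- `data/processed/val.csv` (validation)
- `data/processed/test.csv` (test)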
Les runs sont enregistrés dans la même expérience MLflow que le modèle baseline.

In [1]:
import os
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd

import mlflow
import mlflow.tensorflow

import tensorflow as tf
from tensorflow.keras import layers

from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    classification_report
)

# Locations of the prepared splits and the names of the columns read from them.
TRAIN_PATH = "data/processed/train.csv"
VAL_PATH = "data/processed/val.csv"
TEST_PATH = "data/processed/test.csv"

TEXT_COL = "text"
LABEL_COL = "label"

# Single seed reused for the TensorFlow/Keras RNG and the dataset shuffle.
RANDOM_STATE = 42

# MLflow: same local folder and same experiment as the previous notebook,
# so that runs from both notebooks end up side by side.
tracking_path = Path("mlruns").resolve()
tracking_path.mkdir(parents=True, exist_ok=True)
mlflow.set_tracking_uri(tracking_path.as_uri())
mlflow.set_experiment("AirParadis_Sentiment")

# Quick sanity report: where runs are stored and whether each split is on disk.
print("Tracking URI:", mlflow.get_tracking_uri())
for _caption, _csv_path in (
    ("Train exists :", TRAIN_PATH),
    ("Val exists   :", VAL_PATH),
    ("Test exists  :", TEST_PATH),
):
    print(_caption, Path(_csv_path).exists())

tf.keras.utils.set_random_seed(RANDOM_STATE)

  import pkg_resources  # noqa: TID251



Tracking URI: file:///C:/Users/Jeremy/IA/sentiment_tri/mlruns
Train exists : True
Val exists   : True
Test exists  : True


## Chargement des données

In [2]:
def _split_arrays(df):
    """Extract the model inputs from one prepared split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``TEXT_COL`` and ``LABEL_COL`` columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(texts, labels)`` where ``texts`` holds ``str`` values and
        ``labels`` holds ``int`` values.
    """
    # Missing texts (e.g. empty CSV cells, which pandas reads as NaN) are
    # replaced by "" first; a bare astype(str) would turn them into the
    # literal word "nan", which the tokenizer would then learn as a token.
    texts = df[TEXT_COL].fillna("").astype(str).values
    labels = df[LABEL_COL].astype(int).values
    return texts, labels


train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

X_train, y_train = _split_arrays(train_df)
X_val, y_val = _split_arrays(val_df)
X_test, y_test = _split_arrays(test_df)

print("Train:", train_df.shape)
print("Val  :", val_df.shape)
print("Test :", test_df.shape)

Train: (960000, 2)
Val  : (320000, 2)
Test : (320000, 2)


## Fonctions métriques

In [3]:
def compute_metrics(y_true, y_proba, threshold=0.5):
    """Compute binary-classification metrics from predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; cast to ``int`` and flattened.
    y_proba : array-like
        Predicted scores for the positive class; cast to ``float`` and
        flattened, so a ``(n, 1)`` model output is accepted as well as ``(n,)``.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted as class 1.

    Returns
    -------
    dict
        Keys ``accuracy``, ``f1``, ``precision``, ``recall`` (computed on the
        thresholded predictions) and ``roc_auc`` (computed on the raw scores).
    """
    y_true = np.asarray(y_true).astype(int).ravel()
    y_proba = np.asarray(y_proba).astype(float).ravel()
    y_pred = (y_proba >= threshold).astype(int)

    # zero_division=0 is applied to f1 as well as precision/recall so that the
    # three threshold-based scores behave the same way when no positive is
    # predicted (0 is reported, without a warning for one of them only).
    # NOTE(review): roc_auc is undefined when y_true holds a single class; how
    # scikit-learn reports that depends on its version — confirm if relevant.
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_true, y_proba),
    }

## Tokenisation et padding

In [4]:
# Preprocessing and training hyper-parameters shared by the cells below.
VOCAB_SIZE, SEQ_LEN = 50_000, 64   # tokenizer num_words / padded sequence length
BATCH_SIZE, EPOCHS = 256, 3        # mini-batch size / number of training epochs

# Echo the configuration so it is visible in the executed notebook.
for _caption, _setting in (
    ("VOCAB_SIZE:", VOCAB_SIZE),
    ("SEQ_LEN:", SEQ_LEN),
    ("BATCH_SIZE:", BATCH_SIZE),
    ("EPOCHS:", EPOCHS),
):
    print(_caption, _setting)

VOCAB_SIZE: 50000
SEQ_LEN: 64
BATCH_SIZE: 256
EPOCHS: 3


In [5]:
# Tokenizer limited to VOCAB_SIZE word indices, with a dedicated "<OOV>"
# token for words outside that vocabulary.
tokenizer_options = {"num_words": VOCAB_SIZE, "oov_token": "<OOV>"}
tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_options)

# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train)

def texts_to_padded(texts):
    """Turn raw texts into fixed-length integer sequences.

    Texts are encoded with the module-level ``tokenizer`` and then padded or
    truncated to ``SEQ_LEN`` entries, both applied at the end of the sequence
    (``"post"``).

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.

    Returns
    -------
    The array produced by ``pad_sequences`` for the encoded texts.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    padded = pad_sequences(
        token_ids,
        maxlen=SEQ_LEN,
        padding="post",
        truncating="post",
    )
    return padded

# Encode the three splits in the same order as before: train, val, test.
X_train_pad, X_val_pad, X_test_pad = (
    texts_to_padded(_split) for _split in (X_train, X_val, X_test)
)

# Report the resulting matrix shapes.
for _caption, _matrix in (
    ("X_train_pad:", X_train_pad),
    ("X_val_pad  :", X_val_pad),
    ("X_test_pad :", X_test_pad),
):
    print(_caption, _matrix.shape)

X_train_pad: (960000, 64)
X_val_pad  : (320000, 64)
X_test_pad : (320000, 64)


## Datasets TensorFlow

In [6]:
def _batched_dataset(features, labels, shuffle=False):
    """Build a batched, prefetched ``tf.data`` pipeline over in-memory arrays.

    Parameters
    ----------
    features, labels : array-like
        Aligned inputs and targets, sliced along their first axis.
    shuffle : bool, default False
        When True, shuffle with a 50_000-element buffer seeded with
        ``RANDOM_STATE`` before batching.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # NOTE(review): the buffer is smaller than the training set shown in
        # the load cell output, so shuffling is only local — confirm the CSV
        # is not ordered by label.
        pipeline = pipeline.shuffle(50_000, seed=RANDOM_STATE)
    return pipeline.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Only the training stream is shuffled; validation and test keep file order.
train_ds = _batched_dataset(X_train_pad, y_train, shuffle=True)
val_ds = _batched_dataset(X_val_pad, y_val)
test_ds = _batched_dataset(X_test_pad, y_test)

train_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 64), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

## Modèle Keras

In [7]:
EMBED_DIM = 128

# Architecture: token embedding -> bidirectional LSTM (last state only)
# -> dense head with dropout -> single sigmoid unit for the binary label.
# NOTE(review): the Embedding is created without mask_zero, so padded
# positions are processed by the LSTM like real tokens — confirm intended.
layer_stack = [
    layers.Input(shape=(SEQ_LEN,)),
    layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM),
    layers.Bidirectional(layers.LSTM(64, return_sequences=False)),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid"),
]
model = tf.keras.Sequential(layer_stack)

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 128)           6400000   
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                      

## Entraînement et évaluation

In [8]:
# Train the model inside a single MLflow run, then evaluate it on the
# validation and test splits and attach everything needed to reuse it.
with mlflow.start_run(run_name="sur_mesure_avance_keras"):

    # Hyper-parameters are logged in one call; the seed is recorded too so
    # the run can be reproduced.
    mlflow.log_params({
        "model_type": "sur_mesure_avance_keras",
        "vocab_size": VOCAB_SIZE,
        "seq_len": SEQ_LEN,
        "embed_dim": EMBED_DIM,
        "batch_size": BATCH_SIZE,
        "epochs": EPOCHS,
        "random_state": RANDOM_STATE,
    })

    # perf_counter is monotonic, so the duration is not affected by system
    # clock adjustments during a long training.
    t0 = time.perf_counter()
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        verbose=1
    )
    mlflow.log_metric("train_time_sec", time.perf_counter() - t0)

    # Per-epoch learning curves. The "epoch_" prefix keeps them separate from
    # the final "val_*" / "test_*" metrics logged below (Keras also names its
    # validation curves "val_...").
    for curve_name, curve_values in history.history.items():
        for epoch, value in enumerate(curve_values, start=1):
            mlflow.log_metric("epoch_" + curve_name, float(value), step=epoch)

    # Predict on the padded arrays; ravel() flattens the (n, 1) model output.
    val_proba = model.predict(X_val_pad, batch_size=BATCH_SIZE).ravel()
    test_proba = model.predict(X_test_pad, batch_size=BATCH_SIZE).ravel()

    val_metrics = compute_metrics(y_val, val_proba)
    test_metrics = compute_metrics(y_test, test_proba)

    mlflow.log_metrics({"val_" + k: float(v) for k, v in val_metrics.items()})
    mlflow.log_metrics({"test_" + k: float(v) for k, v in test_metrics.items()})

    # The model consumes token ids, so the fitted tokenizer is stored with the
    # run: without it the logged model cannot be applied to raw text.
    mlflow.log_text(tokenizer.to_json(), "preprocessing/tokenizer.json")
    mlflow.tensorflow.log_model(model, "model")

print("Validation metrics:", val_metrics)
print("Test metrics:", test_metrics)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Epoch 1/3


Epoch 2/3
Epoch 3/3




INFO:tensorflow:Assets written to: C:\Users\Jeremy\AppData\Local\Temp\tmpvtsuofky\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\Jeremy\AppData\Local\Temp\tmpvtsuofky\model\data\model\assets


Validation metrics: {'accuracy': 0.82160625, 'f1': 0.822288218959742, 'precision': 0.8191567221574416, 'recall': 0.82544375, 'roc_auc': 0.8993071121875}
Test metrics: {'accuracy': 0.8199625, 'f1': 0.8203677951621654, 'precision': 0.8185251552369931, 'recall': 0.82221875, 'roc_auc': 0.8984801896484375}




## Analyse rapide (validation)

In [9]:
# Threshold the validation probabilities at 0.5 to obtain hard 0/1 labels,
# then print the per-class precision / recall / f1 table.
is_positive = val_proba >= 0.5
val_pred = is_positive.astype(int)
val_report = classification_report(y_val, val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82    160000
           1       0.82      0.83      0.82    160000

    accuracy                           0.82    320000
   macro avg       0.82      0.82      0.82    320000
weighted avg       0.82      0.82      0.82    320000



## Export du modèle

In [10]:
export_dir = "exported_model"

# Remove any previous export so the directory only contains the new model.
if os.path.exists(export_dir):
    import shutil
    shutil.rmtree(export_dir)
model.save(export_dir)

# Persist the validation and test metrics next to the other artifacts.
os.makedirs("artifacts", exist_ok=True)
metrics_payload = {"val": val_metrics, "test": test_metrics}
with open("artifacts/sur_mesure_avance_keras_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics_payload, f, indent=2, ensure_ascii=False)

for _line in ("Saved:", "- " + export_dir, "- artifacts/sur_mesure_avance_keras_metrics.json"):
    print(_line)

INFO:tensorflow:Assets written to: exported_model\assets


INFO:tensorflow:Assets written to: exported_model\assets


Saved:
- exported_model
- artifacts/sur_mesure_avance_keras_metrics.json


In [11]:
# Export what is needed to reproduce the text preprocessing outside this
# notebook: the fitted tokenizer and the padding configuration.
# (os and json are already imported in the first cell.)
os.makedirs("artifacts", exist_ok=True)

with open("artifacts/tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())

# "padding" and "truncating" mirror the values hard-coded in texts_to_padded;
# they are exported so inference code pads exactly as training did.
# Keep them in sync if texts_to_padded changes.
preprocess_config = {
    "seq_len": SEQ_LEN,
    "padding": "post",
    "truncating": "post",
}
with open("artifacts/preprocess_config.json", "w", encoding="utf-8") as f:
    json.dump(preprocess_config, f, indent=2, ensure_ascii=False)

print("OK - fichiers créés :")
print("- artifacts/tokenizer.json")
print("- artifacts/preprocess_config.json")

OK - fichiers créés :
- artifacts/tokenizer.json
- artifacts/preprocess_config.json
