# Configuration

Définition des chemins, des colonnes et des paramètres principaux, puis configuration du suivi MLflow et vérification de la présence des fichiers de données.

In [1]:
import os
import json
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

import mlflow
import mlflow.sklearn

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    classification_report
)

# Dataset locations, relative to the notebook's working directory.
TRAIN_PATH = "data/processed/train.csv"
VAL_PATH = "data/processed/val.csv"
TEST_PATH = "data/processed/test.csv"

# Column names read from every split.
TEXT_COL = "text"
LABEL_COL = "label"

# Seed handed to the estimators that accept one.
RANDOM_STATE = 42

# MLflow local file store. The directory is resolved to an absolute path and
# converted to a file:// URI so the tracking URI is also valid on Windows
# (file:///C:/.../mlruns).
tracking_path = Path("mlruns").resolve()
tracking_path.mkdir(parents=True, exist_ok=True)
mlflow.set_tracking_uri(tracking_path.as_uri())
mlflow.set_experiment("AirParadis_Sentiment")

# Quick sanity report: where runs are stored and whether each split is present.
print("Tracking URI:", mlflow.get_tracking_uri())
for _caption, _csv_path in (
    ("Train exists :", TRAIN_PATH),
    ("Val exists   :", VAL_PATH),
    ("Test exists  :", TEST_PATH),
):
    print(_caption, Path(_csv_path).exists())

  import pkg_resources  # noqa: TID251
2026/02/22 10:56:59 INFO mlflow.tracking.fluent: Experiment with name 'AirParadis_Sentiment' does not exist. Creating a new experiment.


Tracking URI: file:///C:/Users/Jeremy/IA/sentiment_tri/mlruns
Train exists : True
Val exists   : True
Test exists  : True


## Chargement des données

In [2]:
def _load_split(csv_path):
    """Read one split CSV and return ``(frame, texts, labels)``.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file to read with ``pd.read_csv``.

    Returns
    -------
    tuple
        ``frame`` is the full DataFrame, ``texts`` is the ``TEXT_COL`` column
        cast to ``str`` and ``labels`` is the ``LABEL_COL`` column cast to
        ``int``; both columns are returned as NumPy arrays.
    """
    frame = pd.read_csv(csv_path)
    # NOTE(review): astype(str) may turn missing text into the literal string
    # "nan" (depending on the pandas version) -- confirm the processed CSVs
    # contain no empty/null text rows.
    texts = frame[TEXT_COL].astype(str).values
    labels = frame[LABEL_COL].astype(int).values
    return frame, texts, labels


# One call per split instead of three copy-pasted read/cast sequences.
train_df, X_train, y_train = _load_split(TRAIN_PATH)
val_df, X_val, y_val = _load_split(VAL_PATH)
test_df, X_test, y_test = _load_split(TEST_PATH)

print("Train:", train_df.shape)
print("Val  :", val_df.shape)
print("Test :", test_df.shape)

Train: (960000, 2)
Val  : (320000, 2)
Test : (320000, 2)


In [3]:
def compute_metrics(y_true, y_proba, threshold=0.5):
    """Compute binary-classification metrics from predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; cast to ``int``.
    y_proba : array-like
        Predicted scores for class 1; cast to ``float``.
    threshold : float, default 0.5
        A sample is predicted as class 1 when its score is ``>= threshold``.

    Returns
    -------
    dict
        Plain Python floats under the keys ``accuracy``, ``f1``,
        ``precision``, ``recall`` (all computed on the thresholded
        predictions) and ``roc_auc`` (computed on the raw scores).

    Raises
    ------
    Whatever the underlying scikit-learn metric functions raise for invalid
    input (e.g. mismatched lengths).
    """
    y_true = np.asarray(y_true).astype(int)
    y_proba = np.asarray(y_proba).astype(float)
    y_pred = (y_proba >= threshold).astype(int)

    # zero_division=0 is set on f1 as well as precision/recall so that a
    # degenerate prediction (no sample assigned to class 1) is handled the
    # same way by all three metrics instead of only f1 emitting a warning.
    # float(...) keeps the values as plain Python floats so they print and
    # serialise to JSON identically whatever scalar type sklearn returns.
    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_true, y_proba)),
    }

def log_metrics(metrics, prefix=""):
    """Send every entry of ``metrics`` to MLflow via ``mlflow.log_metric``.

    Each metric is logged under the key ``prefix + name`` and its value is
    converted to ``float`` before logging.
    """
    for metric_name, metric_value in metrics.items():
        full_key = prefix + metric_name
        mlflow.log_metric(full_key, float(metric_value))

## Définition du pipeline

In [4]:
# Text features: TF-IDF over unigrams and bigrams, vocabulary capped at
# 50,000 terms.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)

# Classifier: logistic regression with balanced class weights and a fixed
# seed taken from the configuration cell.
logreg_step = LogisticRegression(
    random_state=RANDOM_STATE,
    class_weight="balanced",
    max_iter=200,
)

pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", logreg_step)])

pipe

## Entraînement et évaluation

In [5]:
# Fit the baseline inside a single MLflow run, then score it on the
# validation and test splits and store the fitted pipeline with the run.
tfidf_step = pipe.named_steps["tfidf"]
clf_step = pipe.named_steps["clf"]

with mlflow.start_run(run_name="baseline_tfidf_logreg"):

    # Hyper-parameters are read back from the pipeline itself rather than
    # retyped here, so the values recorded in MLflow cannot drift from the
    # model that is actually trained.
    run_params = {
        "model_type": "baseline_tfidf_logreg",
        "tfidf_max_features": tfidf_step.max_features,
        "tfidf_ngram_range": "_".join(str(n) for n in tfidf_step.ngram_range),
        "logreg_max_iter": clf_step.max_iter,
        "logreg_class_weight": clf_step.class_weight,
        "logreg_random_state": clf_step.random_state,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # perf_counter is a monotonic clock, so the measured duration is not
    # affected by system clock adjustments during the fit.
    t0 = time.perf_counter()
    pipe.fit(X_train, y_train)
    mlflow.log_metric("train_time_sec", time.perf_counter() - t0)

    # Column 1 of predict_proba holds the score for class 1.
    val_proba = pipe.predict_proba(X_val)[:, 1]
    test_proba = pipe.predict_proba(X_test)[:, 1]

    val_metrics = compute_metrics(y_val, val_proba)
    test_metrics = compute_metrics(y_test, test_proba)

    log_metrics(val_metrics, "val_")
    log_metrics(test_metrics, "test_")

    mlflow.sklearn.log_model(pipe, "model")

print("Validation metrics:", val_metrics)
print("Test metrics:", test_metrics)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Validation metrics: {'accuracy': 0.81863125, 'f1': 0.8167031967508227, 'precision': 0.8254785040284995, 'recall': 0.8081125, 'roc_auc': 0.8979740389453126}
Test metrics: {'accuracy': 0.81783125, 'f1': 0.8155755079313858, 'precision': 0.825801160898479, 'recall': 0.8056, 'roc_auc': 0.89764619484375}


## Analyse rapide (validation)

In [6]:
# Re-score the validation split and threshold the class-1 scores at 0.5.
val_proba = pipe.predict_proba(X_val)[:, 1]
above_threshold = val_proba >= 0.5
val_pred = above_threshold.astype(int)

# Share of samples predicted as class 1, plus the spread of the raw scores.
# NOTE(review): the caption below calls class 1 "negative" -- confirm that
# label 1 encodes negative sentiment in the processed data.
predicted_one_share = val_pred.mean()
proba_low, proba_avg, proba_high = val_proba.min(), val_proba.mean(), val_proba.max()

print("Predicted negative rate (val):", predicted_one_share)
print("Proba min / mean / max:", proba_low, proba_avg, proba_high)
print("\nClassification report (validation):")
print(classification_report(y_val, val_pred))

Predicted negative rate (val): 0.48948125
Proba min / mean / max: 0.00016878173138408495 0.5000276247223342 0.9999998947773963

Classification report (validation):
              precision    recall  f1-score   support

           0       0.81      0.83      0.82    160000
           1       0.83      0.81      0.82    160000

    accuracy                           0.82    320000
   macro avg       0.82      0.82      0.82    320000
weighted avg       0.82      0.82      0.82    320000



## Export du modèle

In [7]:
# Persist the fitted pipeline and its validation/test metrics under
# ./artifacts, creating the directory on first use.
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(pipe, artifacts_dir / "baseline_tfidf_logreg.joblib")

# Metrics are written as UTF-8 JSON, one top-level key per split.
metrics_path = artifacts_dir / "baseline_tfidf_logreg_metrics.json"
with metrics_path.open("w", encoding="utf-8") as f:
    json.dump({"val": val_metrics, "test": test_metrics}, f, indent=2, ensure_ascii=False)

print("Modèle et métriques sauvegardés.")

Modèle et métriques sauvegardés.
