# Optimisation du seuil métier

Nous optimisons le seuil de décision du modèle LightGBM en nous basant sur des probabilités out-of-fold (OOF) obtenues par validation croisée sur le jeu d’entraînement. Cette approche garantit que chaque probabilité utilisée pour choisir le seuil provient d’un modèle qui n’a pas vu l’observation correspondante, ce qui limite le risque de fuite et d’optimisme. Pour chaque seuil, nous calculons les erreurs (FP/FN) puis un coût métier défini comme 10×FN + 1×FP, cohérent avec le contexte de scoring crédit où rater un défaut est plus coûteux qu’un faux refus. Le seuil retenu est celui qui minimise ce coût sur OOF TRAIN, puis il est validé sur le jeu VALID indépendant pour vérifier la généralisation. Une fois les hyperparamètres et le seuil fixés, nous ré-entraînons un modèle final sur TRAIN+VALID et l’enregistrons dans MLflow avec le seuil métier associé, afin de préparer l’évaluation finale sur TEST et le déploiement.

## Imports + chemins + mlflow

In [1]:
import os
import sys
from pathlib import Path


# Resolve the project layout from the notebook's working directory.
# The project root is taken to be two levels above the current directory
# (the notebook lives in <project>/notebooks/<stage>/).
CWD = Path.cwd()
PROJECT_ROOT = CWD.parent.parent
DB_PATH = PROJECT_ROOT.joinpath("mlflow.db").resolve()
ARTIFACT_ROOT = PROJECT_ROOT.joinpath("artifacts").resolve()
FEATURE_REDUCTION_DIR = PROJECT_ROOT.joinpath("reports", "feature_reduction")

# Create the output folders up front so later cells can write into them.
ARTIFACT_ROOT.mkdir(parents=True, exist_ok=True)
FEATURE_REDUCTION_DIR.mkdir(parents=True, exist_ok=True)

# Publish the SQLite tracking store and the artifact root through the
# environment; the tracking URI is read back from there just below.
os.environ.update(
    {
        "MLFLOW_TRACKING_URI": f"sqlite:///{DB_PATH.as_posix()}",
        "MLFLOW_ARTIFACT_URI": ARTIFACT_ROOT.as_uri(),
    }
)

# Make the project's `src` package importable from this notebook.
sys.path.append(str(PROJECT_ROOT))

import mlflow

mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])

# Echo the effective configuration so the run is self-documenting.
print(f"CWD = {CWD}")
print(f"Tracking URI = {mlflow.get_tracking_uri()}")
print(f"Artifacts root (env) = {os.environ['MLFLOW_ARTIFACT_URI']}")

CWD = c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\notebooks\03_modeling
Tracking URI = sqlite:///C:/Users/yoann/Documents/open classrooms/projet 8/livrables/pret a dépenser/mlflow.db
Artifacts root (env) = file:///C:/Users/yoann/Documents/open%20classrooms/projet%208/livrables/pret%20a%20d%C3%A9penser/artifacts


In [2]:
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import numpy as np
from src.modeling.train import train_with_cv
from src.modeling.prepare_for_model import prepare_application_for_model
from src.tracking import mlflow_tracking
import  json
from datetime import datetime
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score,
    fbeta_score,
)

# MLflow experiment under which every run of this notebook is recorded.
EXPERIMENT_NAME = "home_credit_threshold_optimization"

# Look the experiment up (creating it if needed) with the project artifact
# root, then make it the active experiment for subsequent runs.
exp_id = mlflow_tracking.get_or_create_experiment(
    EXPERIMENT_NAME,
    ARTIFACT_ROOT,
)
# To browse the runs locally: mlflow ui --backend-store-uri sqlite:///mlflow.db
mlflow.set_experiment(EXPERIMENT_NAME)

  from .autonotebook import tqdm as notebook_tqdm


<Experiment: artifact_location='file:///C:/Users/yoann/Documents/open%20classrooms/projet%208/livrables/pret%20a%20d%C3%A9penser/artifacts', creation_time=1771310824645, experiment_id='6', last_update_time=1771310824645, lifecycle_stage='active', name='home_credit_threshold_optimization', tags={}>

## Chargement

In [3]:
import sys

# Keep `<project>/src` on the import path for code that imports `modeling.*`
# directly, but do not stack a new entry every time the cell is re-run.
_src_dir = str((PROJECT_ROOT / "src").as_posix())
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Import through the `src.` package path, the same one used by the imports
# cell: importing the module as `modeling.prepare_for_model` would load a
# second, independent copy of it under a different module name.
from src.modeling.prepare_for_model import prepare_application_for_model

DATA_DIR = PROJECT_ROOT / "data" / "processed"
TRAIN_PATH = DATA_DIR / "train_split.csv"
VALID_PATH = DATA_DIR / "valid_split.csv"
TEST_PATH = DATA_DIR / "test_split.csv"  # declared for reference; not read in this cell

df_train = pd.read_csv(TRAIN_PATH)
df_valid = pd.read_csv(VALID_PATH)

# Build the model-ready design matrices and targets for boosting models.
X_train_full, y_train = prepare_application_for_model(df_train, model_type="boosting")
X_valid_full, y_valid = prepare_application_for_model(df_valid, model_type="boosting")

# Feature subset produced by the feature-reduction step (one name per line).
FEATURE_REDUCTION_DIR = PROJECT_ROOT / "reports" / "feature_reduction"
FEATURE_SET_NAME = "top125_nocorr"
kept_file = FEATURE_REDUCTION_DIR / f"kept_features_{FEATURE_SET_NAME}.txt"

if not kept_file.exists():
    raise FileNotFoundError(f"kept file introuvable: {kept_file}")

# Non-empty, whitespace-stripped feature names, in file order.
requested_features = [
    name.strip()
    for name in kept_file.read_text(encoding="utf-8").splitlines()
    if name.strip()
]

# Features listed in the file but absent from TRAIN are ignored, as before,
# but are now reported instead of disappearing silently.
train_columns = set(X_train_full.columns)
missing_in_train = [c for c in requested_features if c not in train_columns]
if missing_in_train:
    print(
        f"ATTENTION: {len(missing_in_train)} feature(s) de {kept_file.name} "
        f"absentes de X_train_full et ignorées: {missing_in_train[:10]}"
    )
kept_features = [c for c in requested_features if c in train_columns]

if len(kept_features) == 0:
    raise ValueError("kept_features vide après intersection avec X_train_full")

# Selecting columns that VALID lacks would raise a KeyError from pandas
# anyway; raise it here with a message that names the offending columns.
valid_columns = set(X_valid_full.columns)
missing_in_valid = [c for c in kept_features if c not in valid_columns]
if missing_in_valid:
    raise KeyError(f"features absentes de X_valid_full: {missing_in_valid}")

X_train = X_train_full[kept_features].copy()
X_valid = X_valid_full[kept_features].copy()

print("Train:", X_train.shape, "Valid:", X_valid.shape)
print("Feature set:", FEATURE_SET_NAME, "| file:", kept_file)

Train: (215257, 125) Valid: (46127, 125)
Feature set: top125_nocorr | file: c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\reports\feature_reduction\kept_features_top125_nocorr.txt


## Fonctions métriques + optimisation seuil

In [4]:
from src.modeling.metrics import compute_metrics, find_best_threshold

# Business cost weights: a false negative (missed default) is charged ten
# times as much as a false positive, i.e. cost = 10 x FN + 1 x FP.
COST_FN = 10
COST_FP = 1

# Beta passed to the threshold search for the F-beta score reported next to
# the business cost.
FBETA_BETA = 3


In [5]:
# Folder holding the Optuna reports written by the hyper-parameter search.
REPORTS_DIR = PROJECT_ROOT / "reports" / "hyperparameter_optimization"

# The report names follow the active feature set. The original f-strings had
# no placeholder and hard-coded "top125_nocorr", so they would silently keep
# pointing at the old reports if FEATURE_SET_NAME were changed.
BEST_LGB_PATH = REPORTS_DIR / f"optuna_best_lightgbm_{FEATURE_SET_NAME}.json"
BEST_XGB_PATH = REPORTS_DIR / f"optuna_best_xgboost_{FEATURE_SET_NAME}.json"
BEST_CB_PATH = REPORTS_DIR / f"optuna_best_catboost_{FEATURE_SET_NAME}.json"

# Fail early, naming the first missing report, before parsing anything.
for p in [BEST_LGB_PATH, BEST_XGB_PATH, BEST_CB_PATH]:
    if not p.exists():
        raise FileNotFoundError(f"Best params introuvable: {p}")

best_lgb = json.loads(BEST_LGB_PATH.read_text(encoding="utf-8"))
best_xgb = json.loads(BEST_XGB_PATH.read_text(encoding="utf-8"))
best_cb = json.loads(BEST_CB_PATH.read_text(encoding="utf-8"))

# Only the tuned hyper-parameters are needed downstream.
params_lgb = best_lgb["best_params"]
params_xgb = best_xgb["best_params"]
params_cb = best_cb["best_params"]

print("Loaded best params: LGB/XGB/CB OK")

Loaded best params: LGB/XGB/CB OK


## Préparation

### XGBOOST

In [6]:
from src.modeling.prepare_xgboost import prepare_xgb

# Apply the XGBoost-specific preparation to TRAIN first, then to VALID.
# NOTE(review): each split is prepared independently — confirm that
# prepare_xgb yields the same columns/encoding for both.
X_train_xgb, X_valid_xgb = (prepare_xgb(frame) for frame in (X_train, X_valid))

### Catboost

In [8]:
from src.modeling.prepare_catboost import prepare_catboost_with_feature

# The helper returns the prepared frame together with the categorical column
# indices; only the indices computed on TRAIN are kept for fitting.
# NOTE(review): the indices returned for VALID are discarded — presumably
# they match the TRAIN ones; verify.
X_train_cb, cat_idx = prepare_catboost_with_feature(X_train)
X_valid_cb, _ = prepare_catboost_with_feature(X_valid)

print(f"Nb features : {X_train_cb.shape[1]}")
print(f"Nb cat cols : {len(cat_idx)}")

Nb features : 125
Nb cat cols : 6


### Optimisation LightGBM

In [9]:
# Fixed LightGBM settings; the tuned hyper-parameters loaded from the Optuna
# report are merged on top and win on any shared key.
lgb_defaults = dict(
    objective="binary",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)
base_lgb = {**lgb_defaults, **params_lgb}

# Fit on TRAIN and score VALID with the positive-class probability.
model_lgb = LGBMClassifier(**base_lgb)
model_lgb.fit(X_train, y_train)
proba_valid_lgb = model_lgb.predict_proba(X_valid)[:, 1]

# Candidate thresholds 0.01, 0.02, ..., 0.99 (coarse grid, reused by the
# XGBoost and CatBoost cells).
thresholds = np.linspace(0.01, 0.99, 99)  # quick grid

# NOTE(review): the threshold is selected and scored on the same VALID
# predictions, whereas the notebook introduction describes selecting it on
# out-of-fold TRAIN probabilities — confirm which protocol is intended.
best_lgb_thr, df_lgb_thr = find_best_threshold(
    y_valid,
    proba_valid_lgb,
    thresholds,
    cost_fn=COST_FN,
    cost_fp=COST_FP,
    beta=FBETA_BETA,
)

print(f"BEST LGB threshold: {best_lgb_thr}")
display(df_lgb_thr.head(10))

BEST LGB threshold: {'threshold': 0.52, 'business_cost': 22719.0, 'auc': 0.7838913090907957, 'recall': 0.664876476906552, 'precision': 0.1947306331104994, 'f1': 0.30123486830099155, 'fbeta_3': 0.5355713698600506, 'tn': 32164.0, 'fp': 10239.0, 'fn': 1248.0, 'tp': 2476.0}


Unnamed: 0,threshold,business_cost,auc,recall,precision,f1,fbeta_3,tn,fp,fn,tp
0,0.52,22719.0,0.783891,0.664876,0.194731,0.301235,0.535571,32164,10239,1248,2476
1,0.48,22779.0,0.783891,0.712943,0.180073,0.287524,0.550145,30314,12089,1069,2655
2,0.5,22785.0,0.783891,0.687433,0.186793,0.293763,0.542132,31258,11145,1164,2560
3,0.51,22785.0,0.783891,0.675349,0.190386,0.297036,0.538244,31708,10695,1209,2515
4,0.49,22789.0,0.783891,0.699517,0.183399,0.290607,0.545893,30804,11599,1119,2605
5,0.47,22803.0,0.783891,0.726369,0.17659,0.284109,0.553917,29790,12613,1019,2705
6,0.46,22881.0,0.783891,0.73899,0.17294,0.280287,0.556758,29242,13161,972,2752
7,0.54,22896.0,0.783891,0.637218,0.201803,0.30653,0.52413,33017,9386,1351,2373
8,0.55,22913.0,0.783891,0.625671,0.20614,0.310108,0.519869,33430,8973,1394,2330
9,0.53,22948.0,0.783891,0.647422,0.197154,0.302263,0.527052,32585,9818,1313,2411


In [10]:
# Fixed XGBoost settings; the tuned hyper-parameters loaded from the Optuna
# report are merged on top and win on any shared key.
xgb_defaults = dict(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    max_bin=128,
    random_state=42,
    n_jobs=-1,
)
base_xgb = {**xgb_defaults, **params_xgb}

# Fit on the XGBoost-prepared TRAIN matrix and score the prepared VALID
# matrix with the positive-class probability.
model_xgb = XGBClassifier(**base_xgb)
model_xgb.fit(X_train_xgb, y_train, verbose=False)
proba_valid_xgb = model_xgb.predict_proba(X_valid_xgb)[:, 1]

# Same threshold grid and cost weights as the LightGBM search; `thresholds`
# is defined in the LightGBM cell, which must have been run first.
best_xgb_thr, df_xgb_thr = find_best_threshold(
    y_valid,
    proba_valid_xgb,
    thresholds,
    cost_fn=COST_FN,
    cost_fp=COST_FP,
    beta=FBETA_BETA,
)

print(f"BEST XGB threshold: {best_xgb_thr}")
display(df_xgb_thr.head(10))

BEST XGB threshold: {'threshold': 0.46, 'business_cost': 22526.0, 'auc': 0.7881757100865808, 'recall': 0.7432867883995704, 'precision': 0.17592474895131563, 'f1': 0.2845102271559256, 'fbeta_3': 0.5620304568527918, 'tn': 29437.0, 'fp': 12966.0, 'fn': 956.0, 'tp': 2768.0}


Unnamed: 0,threshold,business_cost,auc,recall,precision,f1,fbeta_3,tn,fp,fn,tp
0,0.46,22526.0,0.788176,0.743287,0.175925,0.28451,0.56203,29437,12966,956,2768
1,0.48,22597.0,0.788176,0.71536,0.181707,0.289801,0.552961,30406,11997,1060,2664
2,0.47,22608.0,0.788176,0.728249,0.178421,0.28662,0.556696,29915,12488,1012,2712
3,0.5,22621.0,0.788176,0.688776,0.188658,0.296189,0.544447,31372,11031,1159,2565
4,0.49,22628.0,0.788176,0.700859,0.185133,0.292896,0.548158,30915,11488,1114,2610
5,0.45,22688.0,0.788176,0.752685,0.172164,0.28023,0.562885,28925,13478,921,2803
6,0.52,22714.0,0.788176,0.661117,0.196082,0.302457,0.534381,32309,10094,1262,2462
7,0.51,22744.0,0.788176,0.673201,0.191652,0.298364,0.538017,31829,10574,1217,2507
8,0.54,22745.0,0.788176,0.63695,0.204536,0.30964,0.525791,33178,9225,1352,2372
9,0.53,22769.0,0.788176,0.647959,0.199884,0.30552,0.529306,32744,9659,1311,2413


In [11]:
# Fixed CatBoost settings; the tuned hyper-parameters loaded from the Optuna
# report are merged on top and win on any shared key.
cb_defaults = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=False,
    auto_class_weights="Balanced",
)
base_cb = {**cb_defaults, **params_cb}

model_cb = CatBoostClassifier(**base_cb)

# IMPORTANT: the categorical column indices are passed at fit time.
model_cb.fit(X_train_cb, y_train, cat_features=cat_idx)

proba_valid_cb = model_cb.predict_proba(X_valid_cb)[:, 1]

# Same threshold grid and cost weights as the LightGBM search; `thresholds`
# is defined in the LightGBM cell, which must have been run first.
best_cb_thr, df_cb_thr = find_best_threshold(
    y_valid,
    proba_valid_cb,
    thresholds,
    cost_fn=COST_FN,
    cost_fp=COST_FP,
    beta=FBETA_BETA,
)

print("BEST CATBOOST threshold:", best_cb_thr, sep="\n")
display(df_cb_thr.head(10))

BEST CATBOOST threshold:
{'threshold': 0.5, 'business_cost': 22306.0, 'auc': 0.7874510353357697, 'recall': 0.7000537056928035, 'precision': 0.18969657280069854, 'f1': 0.29850575370699034, 'fbeta_3': 0.5516409572779788, 'tn': 31267.0, 'fp': 11136.0, 'fn': 1117.0, 'tp': 2607.0}


Unnamed: 0,threshold,business_cost,auc,recall,precision,f1,fbeta_3,tn,fp,fn,tp
0,0.5,22306.0,0.787451,0.700054,0.189697,0.298506,0.551641,31267,11136,1117,2607
1,0.51,22330.0,0.787451,0.687433,0.193208,0.301638,0.547406,31713,10690,1164,2560
2,0.52,22411.0,0.787451,0.672932,0.19675,0.304477,0.541803,32172,10231,1218,2506
3,0.53,22440.0,0.787451,0.659237,0.201147,0.308243,0.536952,32653,9750,1269,2455
4,0.49,22458.0,0.787451,0.709989,0.184869,0.293354,0.55293,30745,11658,1080,2644
5,0.48,22492.0,0.787451,0.723684,0.180909,0.289458,0.556669,30201,12202,1029,2695
6,0.54,22534.0,0.787451,0.645005,0.205019,0.31114,0.53104,33089,9314,1322,2402
7,0.47,22704.0,0.787451,0.732009,0.17644,0.284343,0.556713,29679,12724,998,2726
8,0.55,22765.0,0.787451,0.626208,0.208643,0.312999,0.521782,33558,8845,1392,2332
9,0.56,22920.0,0.787451,0.611439,0.212268,0.315134,0.514658,33953,8450,1447,2277


In [12]:
# Destination folder for the threshold curves (CSV) and best-threshold
# summaries (JSON).
OUT_DIR = PROJECT_ROOT / "reports" / "threshold_optimization"
OUT_DIR.mkdir(parents=True, exist_ok=True)

summary = []  # not filled in this cell; kept in case a later cell uses it

from src.tracking.tracking_treshold_log import log_and_save

# Fix: the original calls passed COST_FN twice, so the false-positive cost
# recorded with each result was 10 instead of COST_FP (1), contradicting the
# weights actually used by find_best_threshold.
# NOTE(review): the last three positional arguments are presumably
# (cost_fn, cost_fp, beta), mirroring find_best_threshold — confirm against
# log_and_save's signature. Artefacts saved by earlier runs carry the wrong
# false-positive cost and should be regenerated.

# LightGBM
csv_lgb, json_lgb, pay_lgb = log_and_save(
    "LightGBM", best_lgb_thr, df_lgb_thr,
    FEATURE_SET_NAME, kept_file, OUT_DIR,
    COST_FN, COST_FP, FBETA_BETA,
)
# XGBoost
csv_xgb, json_xgb, pay_xgb = log_and_save(
    "XGBoost", best_xgb_thr, df_xgb_thr,
    FEATURE_SET_NAME, kept_file, OUT_DIR,
    COST_FN, COST_FP, FBETA_BETA,
)
# CatBoost
csv_cb, json_cb, pay_cb = log_and_save(
    "CatBoost", best_cb_thr, df_cb_thr,
    FEATURE_SET_NAME, kept_file, OUT_DIR,
    COST_FN, COST_FP, FBETA_BETA,
)

# List every file written, curve first then summary, model by model.
print("Saved:")
for saved_path in (csv_lgb, json_lgb, csv_xgb, json_xgb, csv_cb, json_cb):
    print(saved_path)

Saved:
c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\reports\threshold_optimization\threshold_curve_lightgbm_top125_nocorr.csv
c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\reports\threshold_optimization\best_threshold_lightgbm_top125_nocorr.json
c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\reports\threshold_optimization\threshold_curve_xgboost_top125_nocorr.csv
c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\reports\threshold_optimization\best_threshold_xgboost_top125_nocorr.json
c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\reports\threshold_optimization\threshold_curve_catboost_top125_nocorr.csv
c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\reports\threshold_optimization\best_threshold_catboost_top125_nocorr.json
