# RANDOM FOREST - Benchmark individuel
Random Forest est un ensemble d’arbres de décision entraînés sur des sous‑échantillons aléatoires des données et des features.  
Le modèle combine ensuite les prédictions de tous les arbres pour obtenir un résultat plus stable et plus robuste qu’un arbre seul.


In [1]:
import os
import sys
from pathlib import Path


CWD = Path.cwd()
PROJECT_ROOT = CWD.parent.parent
DB_PATH = (PROJECT_ROOT / "mlflow.db").resolve()
ARTIFACT_ROOT = (PROJECT_ROOT / "artifacts").resolve()
ARTIFACT_ROOT.mkdir(parents=True, exist_ok=True)


os.environ["MLFLOW_TRACKING_URI"] = f"sqlite:///{DB_PATH.as_posix()}"
os.environ["MLFLOW_ARTIFACT_URI"] = ARTIFACT_ROOT.as_uri()


sys.path.append(str(PROJECT_ROOT))

import mlflow  


mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])

print("CWD =", CWD)
print("Tracking URI =", mlflow.get_tracking_uri())
print("Artifacts root (env) =", os.environ["MLFLOW_ARTIFACT_URI"])

CWD = c:\Users\yoann\Documents\open classrooms\projet 8\livrables\pret a dépenser\notebooks\02_benchmark
Tracking URI = sqlite:///C:/Users/yoann/Documents/open classrooms/projet 8/livrables/pret a dépenser/mlflow.db
Artifacts root (env) = file:///C:/Users/yoann/Documents/open%20classrooms/projet%208/livrables/pret%20a%20d%C3%A9penser/artifacts


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import mlflow
from src.modeling.train import train_with_cv
from src.modeling.prepare_for_model import prepare_application_for_model
from src.tracking import mlflow_tracking
from src.modeling.prepare_for_model import make_preprocessor

EXPERIMENT_NAME = "home_credit_benchmarking"
exp_id = mlflow_tracking.get_or_create_experiment(EXPERIMENT_NAME, ARTIFACT_ROOT)
mlflow.set_experiment(EXPERIMENT_NAME)



<Experiment: artifact_location='file:///C:/Users/yoann/Documents/open%20classrooms/projet%208/livrables/pret%20a%20d%C3%A9penser/artifacts', creation_time=1771138249350, experiment_id='1', last_update_time=1771138249350, lifecycle_stage='active', name='home_credit_benchmarking', tags={}>

In [3]:
df = pd.read_csv(PROJECT_ROOT / "data" / "processed" / "train_split.csv")

X_skl, y = prepare_application_for_model(df, model_type="sklearn")

print("X_skl:", X_skl.shape, "| y:", y.shape)

preprocessor, cols = make_preprocessor(X_skl)

print("Num cols :", len(cols["num"]))
print("Cat cols :", len(cols["cat"]))
print("Bool cols:", len(cols["bool"]))

X_skl: (215257, 1656) | y: (215257,)
Num cols : 1642
Cat cols : 14
Bool cols: 0


In [4]:


params_rf = {
    "n_estimators": 300,
    "max_depth": None,
    "min_samples_leaf": 50,
    "n_jobs": -1,
    "random_state": 42,
}

model_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),   # même preprocessing commun
        ("rf", RandomForestClassifier(**params_rf)),
    ]
)

results_rf = train_with_cv(
    model=model_rf,
    model_name="RandomForest",
    X=X_skl,
    y=y,
    model_type="sklearn",
    threshold=0.5,
    n_splits=5,
    random_state=42,
    log_fold_metrics=True,
    cost_fn=10,
    cost_fp=1,
    fbeta_beta=3,
)

results_rf


===== Entraînement (benchmark CV) : RandomForest =====


                 SimpleImputer(add_indicator=True, strategy='me...' (58329 characters) is truncated to 6000 characters to meet the length limit.



--- Fold 1/5 ---
   → AUC=0.7521 | Recall@0.50=0.0000 | F1@0.50=0.0000 | F3@0.50=0.0000 | Cost=34760
   → TN=39576 FP=0 FN=3476 TP=0 | fit=231.33s | pred=5.18s

--- Fold 2/5 ---
   → AUC=0.7348 | Recall@0.50=0.0000 | F1@0.50=0.0000 | F3@0.50=0.0000 | Cost=34760
   → TN=39576 FP=0 FN=3476 TP=0 | fit=407.30s | pred=5.22s

--- Fold 3/5 ---
   → AUC=0.7431 | Recall@0.50=0.0000 | F1@0.50=0.0000 | F3@0.50=0.0000 | Cost=34750
   → TN=39576 FP=0 FN=3475 TP=0 | fit=249.62s | pred=5.42s

--- Fold 4/5 ---
   → AUC=0.7460 | Recall@0.50=0.0000 | F1@0.50=0.0000 | F3@0.50=0.0000 | Cost=34750
   → TN=39576 FP=0 FN=3475 TP=0 | fit=242.44s | pred=5.18s

--- Fold 5/5 ---
   → AUC=0.7395 | Recall@0.50=0.0000 | F1@0.50=0.0000 | F3@0.50=0.0000 | Cost=34750
   → TN=39576 FP=0 FN=3475 TP=0 | fit=255.17s | pred=5.22s

===== Résultats finaux (CV) =====
AUC                         : 0.7431 ± 0.0058
Recall@0.50              : 0.0000 ± 0.0000
Precision@0.50           : 0.0000 ± 0.0000
F1@0.50                  : 0

{'model': 'RandomForest',
 'auc_mean': 0.7430831375360821,
 'auc_std': 0.0058284250484587664,
 'recall_mean_fixed_threshold': 0.0,
 'recall_std_fixed_threshold': 0.0,
 'precision_mean_fixed_threshold': 0.0,
 'precision_std_fixed_threshold': 0.0,
 'f1_mean_fixed_threshold': 0.0,
 'f1_std_fixed_threshold': 0.0,
 'fbeta_3_mean_fixed_threshold': 0.0,
 'fbeta_3_std_fixed_threshold': 0.0,
 'business_cost_mean_fixed_threshold': 34754.0,
 'business_cost_std_fixed_threshold': 4.898979485566356,
 'threshold': 0.5,
 'time_sec': 1438.2365238666534}