In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# --- 1. LOAD THE FINAL ML DATASET ---
# Location of the ML-ready cohort, relative to the notebook's working directory.
# NOTE(review): the original literal was "..Data/processed/..." (no separator
# after ".."), i.e. a folder literally named "..Data". This is assumed to be a
# typo for the "Data" folder one level up — confirm against the project layout.
DATASET_PATH = "../Data/processed/cohort_FINAL_ML_READY.parquet"

print("Chargement du dataset ML final...")
try:
    df_final_ml = pl.read_parquet(DATASET_PATH)
except FileNotFoundError:
    # Only a missing file gets the "run 01_EDA first" hint. Any other failure
    # (unreadable parquet, permissions, ...) propagates with its own message
    # instead of being reported as "file not found".
    print("ERREUR: Fichier 'cohort_FINAL_ML_READY.parquet' non trouvé.")
    print("Assure-toi d'avoir lancé le notebook 01_EDA.ipynb d'abord.")
    # Re-raise so execution stops here rather than continuing without
    # `df_final_ml` (or with a stale one still living in the kernel).
    raise
print(f"Dataset chargé. Shape: {df_final_ml.shape}")

# --- 2. FEATURE DEFINITION (ITERATION 1) ---
print("Définition des features pour le Modèle A (Démo uniquement)...")

# Name of the label column.
TARGET = "is_failed_in_3y"

# First feature-set iteration: Model A only uses the SIRENE columns below.
DEMO_FEATURES = [
    "categorieJuridiqueUniteLegale",
    "trancheEffectifsUniteLegale",
    "activitePrincipaleUniteLegale",
    "departement",
]
# Columns deliberately left out of this iteration:
#   - "categorieEntreprise": often redundant with trancheEffectifs
#   - "economieSocialeSolidaireUniteLegale": often carries little information
#   - "societeMissionUniteLegale": often carries little information

# Convert from Polars to pandas for scikit-learn.
X = df_final_ml.select(DEMO_FEATURES).to_pandas()
# The single-column frame is squeezed down to a pandas Series.
y = df_final_ml.select(TARGET).to_pandas().squeeze()

print(f"Features (X) sélectionnées: {list(X.columns)}")
print(f"Target (y) sélectionnée: {y.name}")

# --- 3. CLASS IMBALANCE ---
# `scale_pos_weight` is set to (number of 0s / number of 1s).
# The counts are obtained by comparing values rather than by
# `y.value_counts()[0]` / `[1]`: that form looks the counts up by integer label,
# which only works when the target is stored as integers 0/1, and it raises a
# bare KeyError when one class is absent. Comparisons also cover a boolean
# target (False == 0, True == 1).
# Note: the ratio is measured on the full target `y`; the split further down is
# stratified on `y`, so the training set has approximately the same ratio.
n_negative = int((y == 0).sum())
n_positive = int((y == 1).sum())
if n_positive == 0:
    # Fail with an explicit message instead of dividing by zero.
    raise ValueError(
        "Aucune observation positive (classe 1) dans la cible : "
        "impossible de calculer scale_pos_weight."
    )
scale_pos_weight = n_negative / n_positive
print(f"Ratio de déséquilibre (scale_pos_weight): {scale_pos_weight:.2f}")

# --- 4. PREPROCESSING ---
# Every selected feature is categorical, so all of them are one-hot encoded.
categorical_features = DEMO_FEATURES
categorical_transformer = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore",
)

# Route the categorical columns through the encoder; any column not listed
# (none at the moment) is passed through untouched.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[("cat", categorical_transformer, categorical_features)],
)

# --- 5. TRAIN / TEST SPLIT ---
# 20% of the rows are held out for testing; the split is stratified on the
# target and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# --- 6. MODELLING PIPELINE ---
print("Création de la pipeline (Preprocessor + XGBoost)...")

# Model A = one-hot preprocessing followed by an XGBoost classifier.
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# option that current releases no longer use and only warn about.
model_A = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        scale_pos_weight=scale_pos_weight,  # imbalance ratio computed in section 3
        eval_metric='logloss',
        random_state=42,
        enable_categorical=False,  # disabled: categories are one-hot encoded upstream
    )),
])

# --- 7. TRAINING (baseline) ---
# Fits the whole pipeline (preprocessor + classifier) on the training split.
print("Entraînement du Modèle A (Baseline)...")
model_A.fit(X=X_train, y=y_train)

# --- 8. EVALUATION (baseline score) ---
print("Évaluation du Modèle A...")
# Probability of the positive class (column 1) and hard class predictions.
y_pred_proba = model_A.predict_proba(X_test)[:, 1]
y_pred = model_A.predict(X_test)

# Metrics: ROC-AUC from the probabilities, per-class report from the labels.
auc_score = roc_auc_score(y_test, y_pred_proba)
report = classification_report(
    y_test,
    y_pred,
    target_names=["Survivant (0)", "Faillite (1)"],
)

# One print call with a newline separator emits the same lines as printing
# each item on its own.
print(
    "---",
    "--- RÉSULTATS DU MODÈLE A (BASELINE 'DÉMO') ---",
    f"Score ROC-AUC (Baseline): {auc_score:.4f}",
    "---",
    "Rapport de Classification (Baseline):",
    report,
    "---",
    sep="\n",
)

# Confusion matrix plot.
print("Matrice de Confusion (Baseline):")
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=["Survivant", "Faillite"])
disp.plot(cmap=plt.cm.Blues)
plt.show()

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/ugo/Documents/Cours/Master/Cours/M1/Supervised Learning/Final Project/.venv/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <B111F8D5-6AC6-3245-A6B5-94693F6992AB> /Users/ugo/Documents/Cours/Master/Cours/M1/Supervised Learning/Final Project/.venv/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]
