In [4]:
import polars as pl
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import numpy as np
import warnings
from typing import Dict, Any

warnings.filterwarnings('ignore')

# --- 0. Chargement et Préparation ---
FILE_PATH_ML = "../Data/processed/sirene_bilan_ML_prets.parquet" 
cible_col = "cible_HN_RésultatNet_T_plus_1"

try:
    df_ml = pl.read_parquet(FILE_PATH_ML)
except Exception as e:
    print(f"ERREUR : Impossible de charger le fichier ML. Vérifiez le chemin : {FILE_PATH_ML}")
    raise

df_ml_pd = df_ml.to_pandas()

# Les ensembles sont les données complètes (T=2017)
X_full = df_ml_pd.drop(columns=[cible_col, 'AnneeClotureExercice', 'siren']).fillna(0)
y_full = df_ml_pd[cible_col]

# Configuration de la Cross-Validation (Exigence du Professeur)
# Nous utilisons 5 splits car nous n'avons pas assez de périodes pour TimeSeriesSplit.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Définition des scoreurs
scorer_rmse = make_scorer(mean_squared_error, squared=False)
scorer_mae = make_scorer(mean_absolute_error)

# Dictionnaire de suivi
tracking_results: Dict[str, Dict[str, Any]] = {}

print(f"Jeu de données complet (T=2017 -> Y=2018) chargé. Taille: {X_full.shape[0]} observations.")

Jeu de données complet (T=2017 -> Y=2018) chargé. Taille: 29960 observations.


In [5]:
# --- 1. BASELINE (Linear Regression - Ratios T) ---
features_baseline = [
    "ratio_rentabilite_nette",
    "ratio_endettement",
    "ratio_marge_brute"
]

X_baseline = X_full[features_baseline]
model_baseline = LinearRegression()

# Calcul des scores par CV
rmse_scores = cross_val_score(model_baseline, X_baseline, y_full, scoring=scorer_rmse, cv=kf)
mae_scores = cross_val_score(model_baseline, X_baseline, y_full, scoring=scorer_mae, cv=kf)

rmse_cv_baseline = np.mean(rmse_scores)
mae_cv_baseline = np.mean(mae_scores)

tracking_results['BASELINE'] = {
    'Modèle': 'Linear Regression',
    'Features': 'Ratios T seulement',
    'RMSE_CV_Moyenne': rmse_cv_baseline,
    'MAE_CV_Moyenne': mae_cv_baseline
}

print(f"\n--- 1. BASELINE (CV) --- RMSE: {rmse_cv_baseline:,.2f} | MAE: {mae_cv_baseline:,.2f}")


--- 1. BASELINE (CV) --- RMSE: nan | MAE: 643,319.40


In [6]:
# --- 2. ITÉRATION 1 (Linear Regression - Ratios + Deltas) ---
features_iteration_1 = features_baseline + [
    "delta_ResultatNet_1an",
    "delta_CA_1an",
    "ResultatNet_T_moins_1",
    "CA_T_moins_1"
]

X_iter_1 = X_full[features_iteration_1]
model_iter_1 = LinearRegression()

rmse_scores = cross_val_score(model_iter_1, X_iter_1, y_full, scoring=scorer_rmse, cv=kf)
mae_scores = cross_val_score(model_iter_1, X_iter_1, y_full, scoring=scorer_mae, cv=kf)

rmse_cv_iter_1 = np.mean(rmse_scores)
mae_cv_iter_1 = np.mean(mae_scores)

tracking_results['ITER_1'] = {
    'Modèle': 'Linear Regression',
    'Features': 'Ratios T + Deltas T-1',
    'RMSE_CV_Moyenne': rmse_cv_iter_1,
    'MAE_CV_Moyenne': mae_cv_iter_1
}

print(f"--- 2. ITÉRATION 1 (CV) --- RMSE: {rmse_cv_iter_1:,.2f} | MAE: {mae_cv_iter_1:,.2f}")

--- 2. ITÉRATION 1 (CV) --- RMSE: nan | MAE: 585,098.62


In [7]:
# --- 3. ITÉRATION 2 (Linear Regression - Scaled Data) ---
X_iter_2 = X_full[features_iteration_1]

# La mise à l'échelle doit être intégrée dans un Pipeline pour la CV pour éviter le data leakage
from sklearn.pipeline import Pipeline

pipeline_iter_2 = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

rmse_scores = cross_val_score(pipeline_iter_2, X_iter_2, y_full, scoring=scorer_rmse, cv=kf)
mae_scores = cross_val_score(pipeline_iter_2, X_iter_2, y_full, scoring=scorer_mae, cv=kf)

rmse_cv_iter_2 = np.mean(rmse_scores)
mae_cv_iter_2 = np.mean(mae_scores)

tracking_results['ITER_2'] = {
    'Modèle': 'Linear Regression + Scaled',
    'Features': 'Ratios T + Deltas T-1 (SCALÉS)',
    'RMSE_CV_Moyenne': rmse_cv_iter_2,
    'MAE_CV_Moyenne': mae_cv_iter_2
}

print(f"--- 3. ITÉRATION 2 (CV) --- RMSE: {rmse_cv_iter_2:,.2f} | MAE: {mae_cv_iter_2:,.2f}")

--- 3. ITÉRATION 2 (CV) --- RMSE: nan | MAE: 585,098.62


In [8]:
# --- 4. ITÉRATION 3 (Random Forest Regressor) ---
X_iter_3 = X_full[features_iteration_1]

# Hyperparamètres simples (pas besoin de scaler)
model_iter_3 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)

rmse_scores = cross_val_score(model_iter_3, X_iter_3, y_full, scoring=scorer_rmse, cv=kf)
mae_scores = cross_val_score(model_iter_3, X_iter_3, y_full, scoring=scorer_mae, cv=kf)

rmse_cv_iter_3 = np.mean(rmse_scores)
mae_cv_iter_3 = np.mean(mae_scores)

tracking_results['ITER_3'] = {
    'Modèle': 'Random Forest Regressor',
    'Features': 'Ratios T + Deltas T-1',
    'RMSE_CV_Moyenne': rmse_cv_iter_3,
    'MAE_CV_Moyenne': mae_cv_iter_3
}

print(f"--- 4. ITÉRATION 3 (CV) --- RMSE: {rmse_cv_iter_3:,.2f} | MAE: {mae_cv_iter_3:,.2f}")

--- 4. ITÉRATION 3 (CV) --- RMSE: nan | MAE: 563,600.62


In [9]:
print("\n" + "="*50)
print("Synthèse des Expérimentations (Experimentation Tracking)")
print("="*50)

df_tracking = pd.DataFrame(tracking_results).T
df_tracking['RMSE_CV_Moyenne'] = df_tracking['RMSE_CV_Moyenne'].apply(lambda x: f'{x:,.2f}')
df_tracking['MAE_CV_Moyenne'] = df_tracking['MAE_CV_Moyenne'].apply(lambda x: f'{x:,.2f}')

print(df_tracking.to_markdown())

# Le meilleur modèle est celui avec le MAE le plus faible.
best_model_row = df_tracking['MAE_CV_Moyenne'].astype(str).str.replace(',', '').astype(float).idxmin()
print(f"\nConclusion: Le meilleur modèle (basé sur le MAE) est l'{best_model_row}, qui est un modèle non-linéaire adapté à l'asymétrie des données financières.")


Synthèse des Expérimentations (Experimentation Tracking)
|          | Modèle                     | Features                       |   RMSE_CV_Moyenne | MAE_CV_Moyenne   |
|:---------|:---------------------------|:-------------------------------|------------------:|:-----------------|
| BASELINE | Linear Regression          | Ratios T seulement             |               nan | 643,319.40       |
| ITER_1   | Linear Regression          | Ratios T + Deltas T-1          |               nan | 585,098.62       |
| ITER_2   | Linear Regression + Scaled | Ratios T + Deltas T-1 (SCALÉS) |               nan | 585,098.62       |
| ITER_3   | Random Forest Regressor    | Ratios T + Deltas T-1          |               nan | 563,600.62       |

Conclusion: Le meilleur modèle (basé sur le MAE) est l'ITER_3, qui est un modèle non-linéaire adapté à l'asymétrie des données financières.


In [10]:
df_ml.schema

Schema([('siren', String),
        ('date_cloture_exercice', Date),
        ('CJCK_TotalActifBrut', Int32),
        ('EG_ImpotsTaxes', Int32),
        ('FJ_ResultatFinancier', Int32),
        ('FA_ChiffreAffairesVentes', Int32),
        ('HN_RésultatNet', Int32),
        ('DA_TresorerieActive', Int32),
        ('DL_DettesCourtTerme', Int32),
        ('FB_AchatsMarchandises', Int32),
        ('FR_ResultatExceptionnel', Int32),
        ('DF_CapitauxPropres', Int32),
        ('DM_DettesLongTerme', Int32),
        ('AnneeClotureExercice', Int32),
        ('ratio_rentabilite_nette', Float64),
        ('ratio_endettement', Float64),
        ('ratio_marge_brute', Float64),
        ('ratio_capitaux_propres', Float64),
        ('ratio_tresorerie', Float64),
        ('ratio_resultat_financier', Float64),
        ('ratio_resultat_exceptionnel', Float64),
        ('cible_HN_RésultatNet_T_plus_1', Int32),
        ('ResultatNet_T_moins_1', Int32),
        ('CA_T_moins_1', Int32),
        ('ResultatN

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score
import numpy as np

# --- 1. Définition du Jeu de Features Maximal (CORRIGÉ) ---
# Liste exhaustive des features d'entrée (X) présentes dans le DataFrame X_full.
# On exclut toutes les colonnes d'identifiant, de temps, et la cible qui ne sont plus dans X_full
features_maximales = [
    'CJCK_TotalActifBrut', 'EG_ImpotsTaxes', 'FJ_ResultatFinancier', 
    'FA_ChiffreAffairesVentes', 'HN_RésultatNet', 'DA_TresorerieActive', 
    'DL_DettesCourtTerme', 'FB_AchatsMarchandises', 'FR_ResultatExceptionnel', 
    'DF_CapitauxPropres', 'DM_DettesLongTerme', 
    
    # Ratios T
    'ratio_rentabilite_nette', 'ratio_endettement', 'ratio_marge_brute', 
    'ratio_capitaux_propres', 'ratio_tresorerie', 'ratio_resultat_financier', 
    'ratio_resultat_exceptionnel', 
    
    # Features T-1 et T-2
    'ResultatNet_T_moins_1', 'CA_T_moins_1', 'ResultatNet_T_moins_2', 'CA_T_moins_2', 
    
    # Deltas
    'delta_ResultatNet_1an', 'delta_CA_1an', 'delta_ResultatNet_2ans', 
    'delta_CA_2ans'
]

# On filtre l'ensemble X_full sur cette liste.
# NOTE: Si X_full contient encore d'autres colonnes non listées, cette ligne peut échouer.
X_full_max = X_full[features_maximales] 


# --- 2. Entraînement Initial pour l'Importance (Modèle de référence) ---
model_importance = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)

# Entraînement simple (sans CV) pour obtenir l'importance des features
# On utilise ici l'ensemble X_full (T=2017) comme ensemble d'entraînement unique
model_importance.fit(X_full_max, y_full)

# --- 3. Sélection des Top 5 Features ---
importances = model_importance.feature_importances_
feature_series = pd.Series(importances, index=X_full_max.columns).sort_values(ascending=False)

top_n = 5
features_selectionnees = feature_series.head(top_n).index.tolist()

print("\n--- 3.1 Importance des Features (Top 5) ---")
print(feature_series.head(top_n))
print("-" * 35)


# --- 4. Nouvel Entraînement avec les Features Sélectionnées (Itération 4) ---
X_iter_4 = X_full[features_selectionnees] # Utilisation du sous-ensemble sélectionné

# Modèle Random Forest final
model_iter_4 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)

# Calcul des scores par Cross-Validation (CV)
rmse_scores = cross_val_score(model_iter_4, X_iter_4, y_full, scoring=make_scorer(mean_squared_error, squared=False), cv=kf)
mae_scores = cross_val_score(model_iter_4, X_iter_4, y_full, scoring=make_scorer(mean_absolute_error), cv=kf)

rmse_cv_iter_4 = np.mean(rmse_scores)
mae_cv_iter_4 = np.mean(mae_scores)

# Documentation pour l'Experimentation Tracking
tracking_results['ITER_4'] = {
    'Modèle': 'Random Forest Regressor',
    'Features': f'Sélection Top {top_n} : {features_selectionnees}',
    'RMSE_CV_Moyenne': rmse_cv_iter_4,
    'MAE_CV_Moyenne': mae_cv_iter_4
}

print("\n--- 3.2 ITÉRATION 4 (Sélection de Features par RF) ---")
print(f"Features retenues : {features_selectionnees}")
print(f"RMSE (Moyenne CV, n=5) : {rmse_cv_iter_4:,.2f}")
print(f"MAE (Moyenne CV, n=5) : {mae_cv_iter_4:,.2f}")


--- 3.1 Importance des Features (Top 5) ---
HN_RésultatNet             0.199127
ratio_rentabilite_nette    0.183555
ResultatNet_T_moins_1      0.173644
ResultatNet_T_moins_2      0.155053
delta_ResultatNet_2ans     0.092726
dtype: float64
-----------------------------------

--- 3.2 ITÉRATION 4 (Sélection de Features par RF) ---
Features retenues : ['HN_RésultatNet', 'ratio_rentabilite_nette', 'ResultatNet_T_moins_1', 'ResultatNet_T_moins_2', 'delta_ResultatNet_2ans']
RMSE (Moyenne CV, n=5) : nan
MAE (Moyenne CV, n=5) : 527,764.89


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import xgboost as xgb # Nouvelle librairie !

# Assurez-vous d'avoir installé xgboost : pip install xgboost

# --- 1. Définition des Features Finales (Top 5 de l'Itération 4) ---
features_selectionnees = ['HN_RésultatNet', 'ratio_rentabilite_nette', 'ResultatNet_T_moins_1', 'ResultatNet_T_moins_2', 'delta_ResultatNet_2ans']

X_iter_5 = X_full[features_selectionnees] # Utilisation du sous-ensemble sélectionné
y_full = y_full.astype(np.float64) # Conversion de la cible pour stabiliser le calcul

# --- 2. Définition du Modèle et Hyperparamètres (XGBoost) ---
# Modèle plus puissant
model_iter_5 = xgb.XGBRegressor(
    n_estimators=200,          # Plus d'arbres que le Random Forest
    learning_rate=0.05,        # Taux d'apprentissage plus faible pour la précision
    max_depth=6,               # Profondeur limitée
    random_state=42,
    n_jobs=-1
)

# --- 3. Cross-Validation (CV) ---

# Définition des scoreurs (conversion RMSE en float64 pour éviter le nan)
# Nous convertissons les labels (y_full) et les prédictions dans le scoreur
def safe_rmse_scorer(y_true, y_pred):
    return mean_squared_error(y_true.astype(np.float64), y_pred.astype(np.float64), squared=False)

scorer_rmse = make_scorer(safe_rmse_scorer)
scorer_mae = make_scorer(mean_absolute_error)

# Calcul des scores par CV
rmse_scores = cross_val_score(model_iter_5, X_iter_5, y_full, scoring=scorer_rmse, cv=kf)
mae_scores = cross_val_score(model_iter_5, X_iter_5, y_full, scoring=scorer_mae, cv=kf)

rmse_cv_iter_5 = np.mean(rmse_scores)
mae_cv_iter_5 = np.mean(mae_scores)

# Documentation pour l'Experimentation Tracking
tracking_results['ITER_5'] = {
    'Modèle': 'XGBoost Regressor',
    'Features': f'Top 5 : {features_selectionnees}',
    'RMSE_CV_Moyenne': rmse_cv_iter_5,
    'MAE_CV_Moyenne': mae_cv_iter_5
}

print("\n--- ITÉRATION 5 (XGBoost - Sélection de Features) ---")
print(f"Features retenues : {features_selectionnees}")
print(f"RMSE (Moyenne CV, n=5) : {rmse_cv_iter_5:,.2f}")
print(f"MAE (Moyenne CV, n=5) : {mae_cv_iter_5:,.2f}")


--- ITÉRATION 5 (XGBoost - Sélection de Features) ---
Features retenues : ['HN_RésultatNet', 'ratio_rentabilite_nette', 'ResultatNet_T_moins_1', 'ResultatNet_T_moins_2', 'delta_ResultatNet_2ans']
RMSE (Moyenne CV, n=5) : nan
MAE (Moyenne CV, n=5) : 569,748.17


In [15]:
df_ml.describe()

statistic,siren,date_cloture_exercice,CJCK_TotalActifBrut,EG_ImpotsTaxes,FJ_ResultatFinancier,FA_ChiffreAffairesVentes,HN_RésultatNet,DA_TresorerieActive,DL_DettesCourtTerme,FB_AchatsMarchandises,FR_ResultatExceptionnel,DF_CapitauxPropres,DM_DettesLongTerme,AnneeClotureExercice,ratio_rentabilite_nette,ratio_endettement,ratio_marge_brute,ratio_capitaux_propres,ratio_tresorerie,ratio_resultat_financier,ratio_resultat_exceptionnel,cible_HN_RésultatNet_T_plus_1,ResultatNet_T_moins_1,CA_T_moins_1,ResultatNet_T_moins_2,CA_T_moins_2,delta_ResultatNet_1an,delta_CA_1an,delta_ResultatNet_2ans,delta_CA_2ans
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""29960""","""29960""",29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,"""2017-11-29 11:45:23.311081""",3700300.0,1221700.0,4084200.0,1882200.0,307176.811048,1384300.0,3183500.0,209267.54002,6233600.0,47519.787517,35374.448097,2017.0,237440000000.0,224560000000.0,-27344000000.0,3116900.0,72426000000.0,1384100000000.0,3227300000000.0,317851.048131,282025.301802,1834400.0,90660.980808,1839000.0,25151.509246,47803.595728,216515.83024,43184.551035
"""std""",,,37960000.0,14049000.0,44543000.0,35209000.0,14429000.0,27779000.0,52402000.0,6491900.0,61211000.0,4926200.0,3399000.0,0.0,14219000000000.0,19155000000000.0,2382600000000.0,534110000.0,6628900000000.0,18111000000000.0,44390000000000.0,11499000.0,14822000.0,33521000.0,18062000.0,33461000.0,9841800.0,7763900.0,21025000.0,7787600.0
"""min""","""005450119""","""2017-01-01""",-664170.0,-1256488.0,-1139370.0,-78658.0,-281860000.0,-2147300000.0,-2147500000.0,-111906.0,-1832600000.0,0.0,0.0,2017.0,-281860000000000.0,-799000000000000.0,-398290000000000.0,0.0,-929.977761,-1139400000000.0,-1248000000000000.0,-267550000.0,-969400000.0,-543000.0,-1855700000.0,-7975916.0,-556410000.0,-303650000.0,-1062400000.0,-253830000.0
"""25%""",,"""2017-12-31""",120419.0,0.0,0.0,0.0,0.0,7622.0,38102.0,0.0,0.0,0.0,0.0,2017.0,0.0,0.200536,0.0,0.0,0.023534,0.0,0.0,0.0,0.0,0.0,-1134.0,0.0,-17059.0,0.0,-29337.0,0.0
"""50%""",,"""2017-12-31""",449449.0,114133.0,145561.0,0.0,6111.0,30500.0,241807.0,0.0,285295.0,0.0,0.0,2017.0,0.060985,0.55947,0.0,0.0,0.081759,1.295581,1940000000.0,3583.0,7239.0,0.0,10486.0,0.0,0.0,0.0,0.0,0.0
"""75%""",,"""2017-12-31""",1374439.0,537141.0,1112941.0,0.0,73391.0,152400.0,823346.0,0.0,1514840.0,0.0,0.0,2017.0,29446000000.0,0.99239,0.0,0.0,0.308944,260640000000.0,523510000000.0,69854.0,69804.0,51.0,72542.0,10066.0,24948.0,0.0,38269.0,0.0
"""max""","""998620116""","""2017-12-31""",2147500000.0,1507700000.0,2147500000.0,2147500000.0,1924000000.0,1564200000.0,2147500000.0,632981128.0,2147500000.0,829010242.0,394980000.0,2017.0,1924000000000000.0,2147500000000000.0,31421000000.0,92447000000.0,980000000000000.0,1687800000000000.0,2147500000000000.0,1591000000.0,2147500000.0,2147500000.0,1057400000.0,2147500000.0,1093900000.0,1040500000.0,1935900000.0,1030000000.0


In [None]:
import polars as pl
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import numpy as np
import warnings
from typing import Dict, Any

warnings.filterwarnings('ignore')

# --- 0. Chargement des Données et Préparation ---
FILE_PATH_ML = "../Data/processed/sirene_bilan_ML_prets.parquet" 
cible_col = "cible_HN_RésultatNet_T_plus_1"

try:
    df_ml = pl.read_parquet(FILE_PATH_ML)
except Exception as e:
    print(f"ERREUR : Impossible de charger le fichier ML. Vérifiez le chemin : {FILE_PATH_ML}")
    raise

df_ml_pd = df_ml.to_pandas()

# Définition de l'ensemble X et Y
# On utilise la cible comme float64 pour éviter les erreurs d'overflow RMSE
y_full = df_ml_pd[cible_col].astype(np.float64) 

# Liste exhaustive des features d'entrée (X)
features_maximales = [
    'CJCK_TotalActifBrut', 'EG_ImpotsTaxes', 'FJ_ResultatFinancier', 'FA_ChiffreAffairesVentes', 
    'HN_RésultatNet', 'DA_TresorerieActive', 'DL_DettesCourtTerme', 'FB_AchatsMarchandises', 
    'FR_ResultatExceptionnel', 'DF_CapitauxPropres', 'DM_DettesLongTerme', 'AnneeClotureExercice', 
    'ratio_rentabilite_nette', 'ratio_endettement', 'ratio_marge_brute', 'ratio_capitaux_propres', 
    'ratio_tresorerie', 'ratio_resultat_financier', 'ratio_resultat_exceptionnel', 
    'ResultatNet_T_moins_1', 'CA_T_moins_1','delta_ResultatNet_1an', 'delta_CA_1an'
]
X_full = df_ml_pd[features_maximales].fillna(0) # Imputation simple des NaNs avant le modèle

# --- 1. Configuration de la Cross-Validation (CV) ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Définition des scoreurs (avec correction du type pour le RMSE)
def safe_rmse_scorer(y_true, y_pred):
    return mean_squared_error(y_true.astype(np.float64), y_pred.astype(np.float64), squared=False)

scorer_rmse = make_scorer(safe_rmse_scorer)
scorer_mae = make_scorer(mean_absolute_error)

# --- 2. Modèle XGBoost (Hyperparamètres agressifs) ---
model_xgb = xgb.XGBRegressor(
    n_estimators=300,            # Beaucoup d'arbres
    learning_rate=0.03,          # Petit pas d'apprentissage pour plus de précision
    max_depth=7,                 # Profondeur suffisante
    subsample=0.7,               # Utiliser un sous-échantillon des lignes (pour la robustesse)
    colsample_bytree=0.7,        # Utiliser un sous-échantillon des colonnes (pour la robustesse)
    random_state=42,
    n_jobs=-1,
    tree_method='hist'           # Améliore la vitesse
)

# --- 3. Calcul des Scores par Cross-Validation (CV) ---
print(f"Démarrage de l'Itération 1 (XGBoost / {len(features_maximales)} features) avec Cross-Validation...")

rmse_scores = cross_val_score(model_xgb, X_full, y_full, scoring=scorer_rmse, cv=kf)
mae_scores = cross_val_score(model_xgb, X_full, y_full, scoring=scorer_mae, cv=kf)

rmse_cv_iter_1 = np.mean(rmse_scores)
mae_cv_iter_1 = np.mean(mae_scores)

# --- 4. Affichage du Résultat ---
tracking_results = {
    'ITER_1_AGRESSIVE': {
        'Modèle': 'XGBoost Regressor',
        'Features': f'Maximales (N={len(features_maximales)} features)',
        'RMSE_CV_Moyenne': rmse_cv_iter_1,
        'MAE_CV_Moyenne': mae_cv_iter_1
    }
}

print("\n--- ITÉRATION 1 (XGBoost Max Features) ---")
print(f"RMSE (Moyenne CV, n=5) : {rmse_cv_iter_1:,.2f}")
print(f"MAE (Moyenne CV, n=5) : {mae_cv_iter_1:,.2f}")

Démarrage de l'Itération 1 (XGBoost / 23 features) avec Cross-Validation...


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=0.

In [17]:
df_ml_pd.describe()

Unnamed: 0,date_cloture_exercice,CJCK_TotalActifBrut,EG_ImpotsTaxes,FJ_ResultatFinancier,FA_ChiffreAffairesVentes,HN_RésultatNet,DA_TresorerieActive,DL_DettesCourtTerme,FB_AchatsMarchandises,FR_ResultatExceptionnel,...,ratio_resultat_exceptionnel,cible_HN_RésultatNet_T_plus_1,ResultatNet_T_moins_1,CA_T_moins_1,ResultatNet_T_moins_2,CA_T_moins_2,delta_ResultatNet_1an,delta_CA_1an,delta_ResultatNet_2ans,delta_CA_2ans
count,29960,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,...,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0,29960.0
mean,2017-11-29 11:45:23.311000,3700343.0,1221733.0,4084171.0,1882187.0,307176.8,1384349.0,3183492.0,209267.5,6233579.0,...,3227332000000.0,317851.0,282025.3,1834384.0,90660.98,1839003.0,25151.51,47803.6,216515.8,43184.55
min,2017-01-01 00:00:00,-664170.0,-1256488.0,-1139370.0,-78658.0,-281863900.0,-2147319000.0,-2147474000.0,-111906.0,-1832627000.0,...,-1248038000000000.0,-267553000.0,-969403000.0,-543000.0,-1855684000.0,-7975916.0,-556405700.0,-303651100.0,-1062435000.0,-253831000.0
25%,2017-12-31 00:00:00,120417.8,0.0,0.0,0.0,0.0,7622.0,38099.25,0.0,0.0,...,0.0,0.0,0.0,0.0,-1135.75,0.0,-17059.5,0.0,-29338.25,0.0
50%,2017-12-31 00:00:00,449434.0,114116.5,145550.0,0.0,6111.0,30500.0,241774.5,0.0,285292.0,...,1922000000.0,3582.5,7237.0,0.0,10484.0,0.0,0.0,0.0,0.0,0.0
75%,2017-12-31 00:00:00,1374461.0,537169.8,1112943.0,0.0,73393.25,152400.0,823358.8,0.0,1515148.0,...,523520500000.0,69861.75,69804.5,51.5,72542.75,10067.25,24948.0,0.0,38273.0,0.0
max,2017-12-31 00:00:00,2147484000.0,1507690000.0,2147484000.0,2147484000.0,1924000000.0,1564155000.0,2147484000.0,632981100.0,2147484000.0,...,2147484000000000.0,1591000000.0,2147484000.0,2147484000.0,1057350000.0,2147484000.0,1093915000.0,1040540000.0,1935943000.0,1030014000.0
std,,37960470.0,14049400.0,44542650.0,35209020.0,14429250.0,27778790.0,52402470.0,6491862.0,61211420.0,...,44390130000000.0,11499180.0,14821550.0,33520610.0,18061970.0,33461050.0,9841849.0,7763925.0,21025460.0,7787572.0


In [21]:
import polars as pl
import pandas as pd
import numpy as np
import xgboost as xgb
import time
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

# Configuration et fonctions (omises ici pour la concision)

# --- NOUVEAUX PARAMÈTRES POUR FORCER UN APPRENTISSAGE PROFOND ---
# Ces paramètres vont créer un modèle beaucoup plus lourd et précis.
N_ESTIMATORS_FINAL = 1000
MAX_DEPTH_FINAL = 8 

# --- 1. CHARGEMENT ET PRÉPARATION DES DONNÉES (Identique) ---
FILE_PATH_ML = "../Data/processed/sirene_bilan_ML_prets.parquet" 
cible_col = "cible_HN_RésultatNet_T_plus_1"

# [Code de chargement et de définition des ensembles X_full, Y_full, FEATURES_DELTAS]
# ... (Assurez-vous que ces variables sont bien définies dans votre notebook) ...

# Préparation X et Y (Conversion des types pour la robustesse numérique)
df_ml = pl.read_parquet(FILE_PATH_ML)
df_ml_pd = df_ml.to_pandas()
Y_full = df_ml_pd[cible_col].astype(np.float64) 
X_full = df_ml_pd[[col for col in FEATURES_DELTAS if col in df_ml_pd.columns]].fillna(0).astype(np.float64)


# --- 2. PIPELINE DE MODÉLISATION (Modèle 'Monstre') ---
print("--- Démarrage de l'Entraînement Profond (Modèle Génie) ---")
start_time = time.time()

# Définition du pré-traitement (StandardScaler)
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), FEATURES_DELTAS)],
    remainder='passthrough'
)

# Modèle XGBoost avec des paramètres maximisés
model_xgb_monstre = xgb.XGBRegressor(
    n_estimators=N_ESTIMATORS_FINAL,        # 5x plus d'arbres
    max_depth=MAX_DEPTH_FINAL,              # Arbres plus complexes
    learning_rate=0.03,                     # Taux d'apprentissage très fin
    random_state=42, 
    n_jobs=-1,
    objective='reg:squarederror'
)

# Pipeline complet
pipeline_monstre = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model_xgb_monstre)
])

# --- 3. ÉVALUATION FINALE (5-Fold CV) ---
rmse_scores = cross_val_score(pipeline_monstre, X_full, Y_full, scoring=scorer_rmse, cv=kf)
mae_scores = cross_val_score(pipeline_monstre, X_full, Y_full, scoring=scorer_mae, cv=kf)

end_time = time.time()
training_time = end_time - start_time

# Calcul des moyennes finales
rmse_cv_mean = np.mean(rmse_scores) * -1
mae_cv_mean = np.mean(mae_scores) * -1

print("\n=============================================")
print("✅ Modèle Final Entraîné (Modèle Monstre XGBoost)")
print(f"TEMPS D'ENTRAÎNEMENT RÉEL : {training_time:.2f} secondes")
print(f"  (Ce temps est le minimum pour {N_ESTIMATORS_FINAL} arbres)")
print(f"  > MAE (Moyenne 5-Fold CV) : {mae_cv_mean:,.2f}")
print(f"  > RMSE (Moyenne 5-Fold CV) : {rmse_cv_mean:,.2f}")
print("=============================================")

--- Démarrage de l'Entraînement Profond (Modèle Génie) ---

✅ Modèle Final Entraîné (Modèle Monstre XGBoost)
TEMPS D'ENTRAÎNEMENT RÉEL : 23.13 secondes
  (Ce temps est le minimum pour 1000 arbres)
  > MAE (Moyenne 5-Fold CV) : 600,246.20
  > RMSE (Moyenne 5-Fold CV) : 10,050,285.38


In [26]:
import polars as pl
import pandas as pd
import numpy as np
import xgboost as xgb
import time
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

# --- 0. DÉFINITION GLOBALE DES FONCTIONS ROBUSTES (CORRIGÉES) ---

# Fonction de transformation robuste de la cible (ARCSINH STANDARD)
def arcsinh_transform_safe(y):
    # Utilisation directe de np.arcsinh(y) pour la stabilité sur les grandes valeurs et les négatifs.
    # On ajoute np.finfo(float).eps pour assurer la robustesse près de zéro avant transformation.
    return np.arcsinh(y + np.finfo(float).eps)

# Fonction d'inverse transformation
def inv_arcsinh_transform_safe(y_pred_arcsinh):
    # Inverse de arcsinh : np.sinh(x)
    return np.sinh(y_pred_arcsinh) - np.finfo(float).eps

scorer_mae = make_scorer(mean_absolute_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 1. CHARGEMENT ET PRÉPARATION (Application de la transformation arcsinh SÛRE) ---
FILE_PATH_ML = "../Data/processed/sirene_bilan_ML_prets.parquet" 
cible_col = "cible_HN_RésultatNet_T_plus_1"

try:
    df_ml = pl.read_parquet(FILE_PATH_ML)
except Exception:
    raise RuntimeError("Erreur de chargement du Parquet. Vérifiez le chemin.")

df_ml_pd = df_ml.to_pandas()

# Application de la transformation ARCSINH SÛRE à la Cible (Y)
Y_full_arcsinh = arcsinh_transform_safe(df_ml_pd[cible_col].astype(np.float64))

# Définition des Features (Ratios T + Deltas T-1)
FEATURES_FINALES = [
    'ratio_rentabilite_nette', 'ratio_endettement', 'ratio_marge_brute', 
    'HN_RésultatNet', 'FA_ChiffreAffairesVentes', 
    'delta_ResultatNet_1an', 'delta_CA_1an', 'ResultatNet_T_moins_1', 'CA_T_moins_1'
]
X_full = df_ml_pd[FEATURES_FINALES].fillna(0).astype(np.float64) 


# --- 2. DÉFINITION DU PIPELINE ET GRIDSEARCH ---

preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), FEATURES_FINALES)],
    remainder='passthrough'
)

model_xgb = xgb.XGBRegressor(
    random_state=42, n_jobs=-1, objective='reg:squarederror'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model_xgb)
])

# Grille d'hyperparamètres (Conçue pour prendre du temps)
param_grid = {
    'regressor__n_estimators': [500, 1000],  
    'regressor__max_depth': [6, 10],        
    'regressor__learning_rate': [0.03, 0.05] 
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer_mae,
    cv=kf,
    verbose=2,
    n_jobs=-1  
)


# --- 3. EXÉCUTION DE L'ENTRAÎNEMENT (LE LONG PROCESSUS) ---
print("\n--- DÉMARRAGE DE L'OPTIMISATION PAR GRILLE (LE MONSTRE APPREND) ---")
print(f"Nombre de modèles à tester : {len(param_grid['regressor__n_estimators']) * len(param_grid['regressor__max_depth']) * len(param_grid['regressor__learning_rate']) * kf.get_n_splits()} fits.")
start_time = time.time()

# Entraînement sur la cible ARCSINH-TRANSFORMÉE
grid_search.fit(X_full, Y_full_arcsinh)

end_time = time.time()
training_time = end_time - start_time


# --- 4. ÉVALUATION FINALE ET INVERSE-TRANSFORMATION ---
best_model = grid_search.best_estimator_

# Prédiction sur l'ensemble complet (Arcsih-transformé)
Y_pred_arcsinh = best_model.predict(X_full)

# Inverse Transformation des résultats
Y_pred_final_unscaled = inv_arcsinh_transform_safe(Y_pred_arcsinh)
Y_true_unscaled = inv_arcsinh_transform_safe(Y_full_arcsinh)

# Calcul des métriques sur les valeurs ORIGINALES
final_mae = mean_absolute_error(Y_true_unscaled, Y_pred_final_unscaled)
final_rmse = root_mean_squared_error(Y_true_unscaled, Y_pred_final_unscaled)


print("\n=============================================")
print("🏆 MODÈLE MONSTRE FINAL : RÉSULTATS")
print(f"TEMPS TOTAL D'OPTIMISATION : {training_time/60:.2f} minutes")
print(f"MEILLEURS HYPERPARAMÈTRES : {grid_search.best_params_}")
print("-" * 45)
print(f"  > MAE FINAL (Erreur Absolue Moyenne) : {final_mae:,.2f}")
print(f"  > RMSE FINAL : {final_rmse:,.2f}")
print("=============================================")


--- DÉMARRAGE DE L'OPTIMISATION PAR GRILLE (LE MONSTRE APPREND) ---
Nombre de modèles à tester : 40 fits.
Fitting 5 folds for each of 8 candidates, totalling 40 fits


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=0.