# Import des modules


In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter
from sklearn.pipeline import Pipeline

#Selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance

#Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MultiLabelBinarizer, MinMaxScaler

# Modèles
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

#Metriques
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


In [100]:
# Load the feature-engineered dataset (CSV expected in the working directory).
fc = pd.read_csv('fc_after_feature_engineering.csv')

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print(): that would append a stray "None" line.
fc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 40 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   a_quitte_l_entreprise                      1470 non-null   bool   
 1   age                                        1470 non-null   int64  
 2   annees_dans_l_entreprise                   1470 non-null   int64  
 3   annees_dans_le_poste_actuel                1470 non-null   int64  
 4   annees_depuis_la_derniere_promotion        1470 non-null   int64  
 5   annees_experience_totale                   1470 non-null   int64  
 6   annes_sous_responsable_actuel              1470 non-null   int64  
 7   augmentation_salaire_precedente            1470 non-null   int64  
 8   distance_domicile_travail                  1470 non-null   int64  
 9   domaine_etude_Entrepreunariat              1470 non-null   float64
 10  domaine_etude_Infra & Cl

# Dataframes
- Un DataFrame contenant les features => X
- Un Pandas Series contenant la colonne cible => y

In [101]:
# Employee profile that motivated the feature choice: male, single, aged 30 to 40,
# consultant, 1 to 7 years in the company, monthly income between 2500€ and 6000€,
# "Infra & Cloud" field of study, fewer than 5 years under the current manager,
# home-to-work distance between 3 and 17 km.

# Explicitly listed feature columns.
columns_base = [
    'genre',
    'statut_marital',
    'age',
    'annees_dans_l_entreprise',
    'revenu_mensuel',
    'distance_domicile_travail',
    'satisfaction_globale',
]

# Column families picked up by name prefix.
columns_domaine_etude = list(filter(lambda name: name.startswith('domaine_etude'), fc.columns))
columns_poste = list(filter(lambda name: name.startswith('poste'), fc.columns))

all_columns = [*columns_base, *columns_domaine_etude, *columns_poste]

# X: DataFrame of features, y: Series holding the target column.
X = fc[all_columns]
y = fc['a_quitte_l_entreprise']

# No split is done here (an earlier 70/30 split with random_state=666 was disabled);
# the comparison functions below split the data themselves.

# Séparation Train Test
- Des métriques d’évaluation calculées pour chaque modèle, sur le jeu d’apprentissage et le jeu de test.

In [102]:
def train_test_evaluation(model, X_train, X_test, y_train, y_test, model_name):
    """
    Fit a model on the training split and score it on both splits.

    Args:
        model: Estimator exposing fit(X, y) and predict(X); it is fitted in place.
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test labels.
        model_name: Display name stored in the result row (string).

    Returns:
        tuple: (results, report_test, cm) where
            - results is a flat dict of train/test metrics (accuracy and macro
              averaged f1/precision/recall), the train-minus-test gaps, the
              'OUI'/'NON' overfitting flag and the test confusion matrix as a string;
            - report_test is the classification_report dict for the test split;
            - cm is the confusion matrix computed on the test split.
    """
    model.fit(X_train, y_train)

    # Score the training split first, then the held-out split.
    train_report = classification_report(y_train, model.predict(X_train), output_dict=True)
    test_predictions = model.predict(X_test)
    test_report = classification_report(y_test, test_predictions, output_dict=True)

    # The confusion matrix is only computed on the test split.
    test_cm = confusion_matrix(y_test, test_predictions)

    train_macro = train_report['macro avg']
    test_macro = test_report['macro avg']

    # Train-minus-test gaps: a large positive gap suggests overfitting.
    gap_accuracy = train_report['accuracy'] - test_report['accuracy']
    gap_f1 = train_macro['f1-score'] - test_macro['f1-score']

    # Flag raised as soon as either gap exceeds 0.05.
    if gap_accuracy > 0.05 or gap_f1 > 0.05:
        overfit_label = 'OUI'
    else:
        overfit_label = 'NON'

    summary = {'model': model_name, 'method': 'train_test'}
    for split, report, macro in (('train', train_report, train_macro),
                                 ('test', test_report, test_macro)):
        summary[f'{split}_accuracy'] = report['accuracy']
        summary[f'{split}_f1_macro'] = macro['f1-score']
        summary[f'{split}_precision_macro'] = macro['precision']
        summary[f'{split}_recall_macro'] = macro['recall']
    summary['accuracy_gap'] = gap_accuracy
    summary['f1_gap'] = gap_f1
    summary['overfitting'] = overfit_label
    summary['confusion_matrix'] = str(test_cm.tolist())

    return summary, test_report, test_cm

# Validation croisée
- cross_validate

In [103]:
def cross_validation_evaluation(model, X, y, model_name, cv=5):
    """
    Cross-validate a model and summarise train/test scores across folds.

    Args:
        model: Estimator to evaluate.
        X: Full feature set.
        y: Full label set.
        model_name: Display name stored in the result row (string).
        cv: Value forwarded to cross_validate's cv argument (default: 5).

    Returns:
        dict: Fold-averaged train and test metrics (plus the standard deviation
        of accuracy and macro F1), the train-minus-test gaps, the 'OUI'/'NON'
        overfitting flag, and 'N/A' as confusion-matrix placeholder.
    """
    metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

    # return_train_score=True exposes the per-fold train scores needed for the gaps.
    fold_scores = cross_validate(model, X, y, cv=cv, scoring=metric_names,
                                 return_train_score=True)

    def fold_mean(key):
        return fold_scores[key].mean()

    def fold_std(key):
        return fold_scores[key].std()

    # Mean train score minus mean test score.
    gap_accuracy = fold_mean('train_accuracy') - fold_mean('test_accuracy')
    gap_f1 = fold_mean('train_f1_macro') - fold_mean('test_f1_macro')

    # Flag raised as soon as either gap exceeds 0.05.
    if gap_accuracy > 0.05 or gap_f1 > 0.05:
        overfit_label = 'OUI'
    else:
        overfit_label = 'NON'

    summary = {'model': model_name, 'method': 'cross_validation'}
    for split in ('train', 'test'):
        summary[f'{split}_accuracy'] = fold_mean(f'{split}_accuracy')
        summary[f'{split}_accuracy_std'] = fold_std(f'{split}_accuracy')
        summary[f'{split}_f1_macro'] = fold_mean(f'{split}_f1_macro')
        summary[f'{split}_f1_macro_std'] = fold_std(f'{split}_f1_macro')
        summary[f'{split}_precision_macro'] = fold_mean(f'{split}_precision_macro')
        summary[f'{split}_recall_macro'] = fold_mean(f'{split}_recall_macro')
    summary['accuracy_gap'] = gap_accuracy
    summary['f1_gap'] = gap_f1
    summary['overfitting'] = overfit_label
    summary['confusion_matrix'] = 'N/A'

    return summary

# Comparaison des modèles

- DummyClassifier : strategy='most_frequent'
- RandomForestClassifier : n_estimators=100,max_depth=10
- XGBClassifier : eval_metric='logloss'
- CatBoostClassifier

In [104]:
def compare_models(X, y, test_size=0.2, random_state=666, cv_folds=5):
    """
    Evaluate every candidate classifier with a hold-out split and cross-validation.

    For each model, two result rows are produced: one from
    train_test_evaluation (stratified hold-out split) and one from
    cross_validation_evaluation (run on the full X, y). Nothing is written
    to disk here.

    Args:
        X: Features (DataFrame or array).
        y: Labels (Series or array).
        test_size: Proportion of the data held out for testing (default: 0.2).
        random_state: Seed used for the split and for every model (default: 666).
        cv_folds: Number of folds for the cross-validation (default: 5).

    Returns:
        tuple: (results_df, detailed_reports) where results_df holds one row per
        (model, method) and detailed_reports is a list of dicts carrying the
        test classification report and confusion matrix of each model.
    """
    # Hold-out split, stratified on y.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Candidate estimators, evaluated in insertion order.
    models = {}
    models['DummyClassifier'] = DummyClassifier(
        strategy='most_frequent',
        random_state=random_state,
    )
    models['RandomForestClassifier'] = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=random_state,
    )
    models['XGBClassifier'] = XGBClassifier(
        max_depth=6,
        learning_rate=0.01,     # smaller value = slower learning
        subsample=0.8,          # 80% of the rows per tree
        colsample_bytree=0.8,   # 80% of the features per tree
        reg_alpha=0.1,          # L1 regularisation
        reg_lambda=1.0,         # L2 regularisation
        min_child_weight=5,
        eval_metric='logloss',
        random_state=random_state,
    )
    models['CatBoostClassifier'] = CatBoostClassifier(
        depth=6,
        learning_rate=0.03,
        l2_leaf_reg=3.0,        # regularisation
        subsample=0.8,
        verbose=False,
        random_state=random_state,
    )

    banner = "=" * 70
    print(banner)
    print("COMPARAISON DES MOD√àLES DE CLASSIFICATION")
    print(banner)

    all_results, detailed_reports = [], []
    for model_name, model in models.items():
        print(f"\n>>> √âvaluation de {model_name}...")

        # Hold-out evaluation.
        print("  - Train/Test split...")
        tt_results, report, cm = train_test_evaluation(
            model, X_train, X_test, y_train, y_test, model_name
        )
        all_results.append(tt_results)

        # Keep the full test report and confusion matrix for the detailed exports.
        detailed_reports.append(dict(
            model=model_name,
            method='train_test',
            report=report,
            confusion_matrix=cm,
        ))

        # Cross-validation on the complete dataset.
        print(f"  - Validation crois√©e ({cv_folds} folds)...")
        all_results.append(
            cross_validation_evaluation(model, X, y, model_name, cv=cv_folds)
        )

        print(f"  ‚úì {model_name} termin√©")

    results_df = pd.DataFrame(all_results)

    # Preferred column layout: headline metrics, then any *_std columns,
    # then the confusion matrix.
    preferred = ['model', 'method',
                 'train_accuracy', 'test_accuracy', 'accuracy_gap',
                 'train_f1_macro', 'test_f1_macro', 'f1_gap',
                 'overfitting',
                 'train_precision_macro', 'test_precision_macro',
                 'train_recall_macro', 'test_recall_macro']
    preferred += [name for name in results_df.columns if '_std' in name]
    preferred.append('confusion_matrix')

    # Only keep the columns that actually exist in the frame.
    present = [name for name in preferred if name in results_df.columns]
    results_df = results_df[present]

    return results_df, detailed_reports

# Sauvegarde des résultats
- classification_results_by_class.csv => Rapport de classification par classe
- classification_results_confusion_matrices.csv => Matrices de confusion
- classification_results_summary.csv => Tous les scores
- classification_results_overfitting_analysis.csv => Analyse spécifique de l'overfitting

In [105]:
def save_results(results_df, detailed_reports, output_prefix='classification_results'):
    """
    Write the comparison results to four CSV files and print an overfitting recap.

    Files written (all prefixed by output_prefix):
        _summary.csv, _by_class.csv, _confusion_matrices.csv,
        _overfitting_analysis.csv

    Args:
        results_df: DataFrame holding every result row.
        detailed_reports: List of dicts with 'model', 'report'
            (classification report dict) and 'confusion_matrix'
            (object exposing .tolist()).
        output_prefix: Prefix of the output files.
    """
    summary_path = f'{output_prefix}_summary.csv'
    by_class_path = f'{output_prefix}_by_class.csv'
    matrices_path = f'{output_prefix}_confusion_matrices.csv'
    overfit_path = f'{output_prefix}_overfitting_analysis.csv'

    # 1. Main file with every metric.
    results_df.to_csv(summary_path, index=False)
    print(f"\n‚úì R√©sum√© sauvegard√©: {output_prefix}_summary.csv")

    # 2. Per-class metrics: every report entry except the aggregate ones.
    aggregate_keys = ['accuracy', 'macro avg', 'weighted avg']
    per_class_rows = [
        {
            'model': entry['model'],
            'class': label,
            'precision': scores['precision'],
            'recall': scores['recall'],
            'f1-score': scores['f1-score'],
            'support': scores['support'],
        }
        for entry in detailed_reports
        for label, scores in entry['report'].items()
        if label not in aggregate_keys
    ]
    pd.DataFrame(per_class_rows).to_csv(by_class_path, index=False)
    print(f"‚úì R√©sultats par classe sauvegard√©s: {output_prefix}_by_class.csv")

    # 3. Confusion matrices, serialised as nested-list strings.
    matrix_rows = [
        {'model': entry['model'],
         'confusion_matrix': str(entry['confusion_matrix'].tolist())}
        for entry in detailed_reports
    ]
    pd.DataFrame(matrix_rows).to_csv(matrices_path, index=False)
    print(f"‚úì Matrices de confusion sauvegard√©es: {output_prefix}_confusion_matrices.csv")

    # 4. Columns dedicated to the overfitting analysis.
    overfit_columns = ['model', 'method', 'train_accuracy', 'test_accuracy',
                       'accuracy_gap', 'train_f1_macro', 'test_f1_macro',
                       'f1_gap', 'overfitting']
    overfitting_data = results_df[overfit_columns].copy()
    overfitting_data.to_csv(overfit_path, index=False)
    print(f"‚úì Analyse d'overfitting sauvegard√©e: {output_prefix}_overfitting_analysis.csv")

    print("\n" + "=" * 70)
    print("TOUS LES R√âSULTATS ONT √âT√â SAUVEGARD√âS")
    print("=" * 70)

    # Console recap: one status line and one gap line per result row.
    rule = "-" * 70
    print("\nüìä R√âSUM√â DE L'OVERFITTING:")
    print(rule)
    for _, row in overfitting_data.iterrows():
        if row['overfitting'] == 'OUI':
            status = "‚ö†Ô∏è  OVERFITTING D√âTECT√â"
        else:
            status = "‚úì  Pas d'overfitting"
        print(f"{row['model']:25s} ({row['method']:17s}): {status}")
        print(f"  ‚Üí √âcart accuracy: {row['accuracy_gap']:+.4f} | √âcart F1: {row['f1_gap']:+.4f}")
    print(rule)

# Comparaison des modèles

In [106]:
# Benchmark every model with a 15% hold-out split and 5-fold cross-validation.
results_df, detailed_reports = compare_models(X=X, y=y, test_size=0.15, cv_folds=5)

# Write the result CSV files and print the overfitting recap.
save_results(
    results_df=results_df,
    detailed_reports=detailed_reports,
    output_prefix='classification_results',
)


COMPARAISON DES MOD√àLES DE CLASSIFICATION

>>> √âvaluation de DummyClassifier...
  - Train/Test split...
  - Validation crois√©e (5 folds)...
  ‚úì DummyClassifier termin√©

>>> √âvaluation de RandomForestClassifier...
  - Train/Test split...
  - Validation crois√©e (5 folds)...
  ‚úì RandomForestClassifier termin√©

>>> √âvaluation de XGBClassifier...
  - Train/Test split...
  - Validation crois√©e (5 folds)...
  ‚úì XGBClassifier termin√©

>>> √âvaluation de CatBoostClassifier...
  - Train/Test split...
  - Validation crois√©e (5 folds)...
  ‚úì CatBoostClassifier termin√©

‚úì R√©sum√© sauvegard√©: classification_results_summary.csv
‚úì R√©sultats par classe sauvegard√©s: classification_results_by_class.csv
‚úì Matrices de confusion sauvegard√©es: classification_results_confusion_matrices.csv
‚úì Analyse d'overfitting sauvegard√©e: classification_results_overfitting_analysis.csv

TOUS LES R√âSULTATS ONT √âT√â SAUVEGARD√âS

üìä R√âSUM√â DE L'OVERFITTING:
---------------------------

# Modèle DUMMY
- DummyClassifier

# Modèle LINÉAIRE

# Modèle NON LINÉAIRE
- RandomForest, XGBoost ou CatBoost
- Métriques d’évaluation en classification : matrice de confusion, rappel et précision.
- Scores (présence d’overfit ou non, capacité d’éviter les faux positifs ou faux négatifs)

# Amélioration de la classification
- Demandez-vous si éviter des faux positifs est plus important qu’éviter des faux négatifs.

# OPTIMISATION DES HYPER-PARAMETRES

In [107]:
from sklearn.model_selection import StratifiedKFold
from datetime import time


def get_param_grids():
    """
    Build the full hyper-parameter grids, one per model name.

    Returns:
        dict: Maps each model name to its parameter grid
        (parameter name -> list of candidate values). A new dict is
        built on every call.
    """
    dummy_grid = dict(
        strategy=['most_frequent', 'stratified', 'uniform'],
    )

    forest_grid = dict(
        n_estimators=[50, 100, 200],
        max_depth=[5, 10, 15, None],
        min_samples_split=[2, 10, 20],
        min_samples_leaf=[1, 5, 10],
        max_features=['sqrt', 'log2'],
    )

    xgb_grid = dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1, 0.3],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
        reg_alpha=[0, 0.1, 1],
        reg_lambda=[1, 2, 5],
    )

    catboost_grid = dict(
        iterations=[50, 100, 200],
        depth=[4, 6, 8],
        learning_rate=[0.01, 0.05, 0.1],
        l2_leaf_reg=[1, 3, 5],
        subsample=[0.8, 1.0],
    )

    return {
        'DummyClassifier': dummy_grid,
        'RandomForestClassifier': forest_grid,
        'XGBClassifier': xgb_grid,
        'CatBoostClassifier': catboost_grid,
    }


def get_small_param_grids():
    """
    Build reduced hyper-parameter grids intended for quick runs.

    Returns:
        dict: Maps each model name to a smaller parameter grid
        (parameter name -> list of candidate values). A new dict is
        built on every call.
    """
    dummy_grid = dict(
        strategy=['most_frequent', 'stratified'],
    )

    forest_grid = dict(
        n_estimators=[50, 100],
        max_depth=[5, 10, None],
        min_samples_split=[2, 10],
        max_features=['sqrt'],
    )

    xgb_grid = dict(
        n_estimators=[50, 100],
        max_depth=[3, 5],
        learning_rate=[0.01, 0.1],
        subsample=[0.8, 1.0],
        reg_lambda=[1, 2],
    )

    catboost_grid = dict(
        iterations=[50, 100],
        depth=[4, 6],
        learning_rate=[0.01, 0.1],
        l2_leaf_reg=[1, 3],
    )

    return {
        'DummyClassifier': dummy_grid,
        'RandomForestClassifier': forest_grid,
        'XGBClassifier': xgb_grid,
        'CatBoostClassifier': catboost_grid,
    }


def perform_grid_search(model, param_grid, X_train, y_train, model_name,
                       cv=5, scoring='f1_macro', n_jobs=-1, random_state=42):
    """
    Run a GridSearchCV for one model and report the outcome.

    Args:
        model: Estimator to tune.
        param_grid: Parameter grid (dict: parameter name -> candidate values).
        X_train, y_train: Training data the search is fitted on.
        model_name: Display name of the model.
        cv: Number of folds of the stratified, shuffled cross-validation.
        scoring: Metric optimised by the search.
        n_jobs: Number of parallel jobs given to GridSearchCV.
        random_state: Seed of the StratifiedKFold shuffling (default: 42,
            the value that was previously hard-coded).

    Returns:
        tuple: (fitted GridSearchCV object, results dict with the model name,
        best score, best parameters as a string, number of combinations
        evaluated and elapsed time in seconds).
    """
    # Imported locally under a non-clashing name: this notebook also does
    # `from datetime import time`, which makes a module-level `time.time()`
    # raise AttributeError (datetime.time has no `time` attribute).
    from time import perf_counter

    print(f"\n{'='*70}")
    print(f"üîç Grid Search pour {model_name}")
    print(f"{'='*70}")
    print(f"Nombre de combinaisons √† tester: {np.prod([len(v) for v in param_grid.values()])}")
    print(f"M√©trique d'optimisation: {scoring}")
    print(f"Validation crois√©e: {cv} folds")

    start_time = perf_counter()

    # Stratified, shuffled folds with a fixed seed so runs are repeatable.
    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv_strategy,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=0,
        return_train_score=True
    )

    # Fit every parameter combination on every fold.
    grid_search.fit(X_train, y_train)

    elapsed_time = perf_counter() - start_time

    results = {
        'model': model_name,
        'best_score': grid_search.best_score_,
        'best_params': str(grid_search.best_params_),
        'n_combinations': len(grid_search.cv_results_['params']),
        'time_seconds': elapsed_time
    }

    print(f"‚úì Termin√© en {elapsed_time:.2f} secondes")
    print(f"üìä Meilleur score ({scoring}): {grid_search.best_score_:.4f}")
    print(f"üèÜ Meilleurs param√®tres:")
    for param, value in grid_search.best_params_.items():
        print(f"   - {param}: {value}")

    return grid_search, results


def evaluate_best_model(grid_search, X_train, X_test, y_train, y_test, model_name):
    """
    Score the best estimator of a fitted grid search on the train and test splits.

    Args:
        grid_search: Fitted GridSearchCV object (its best_estimator_ is used).
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test labels.
        model_name: Display name stored in the result row.

    Returns:
        tuple: (results, report_test, cm) where results is a flat dict of
        train/test metrics, gaps, the 'OUI'/'NON' overfitting flag and the
        test confusion matrix as a string; report_test is the test
        classification report dict; cm is the test confusion matrix.
    """
    tuned_model = grid_search.best_estimator_

    train_predictions = tuned_model.predict(X_train)
    test_predictions = tuned_model.predict(X_test)

    train_report = classification_report(y_train, train_predictions, output_dict=True)
    test_report = classification_report(y_test, test_predictions, output_dict=True)

    # Confusion matrix on the test split only.
    test_cm = confusion_matrix(y_test, test_predictions)

    train_macro = train_report['macro avg']
    test_macro = test_report['macro avg']

    # Train-minus-test gaps; either one above 0.05 raises the overfitting flag.
    gap_accuracy = train_report['accuracy'] - test_report['accuracy']
    gap_f1 = train_macro['f1-score'] - test_macro['f1-score']
    if gap_accuracy > 0.05 or gap_f1 > 0.05:
        overfit_label = 'OUI'
    else:
        overfit_label = 'NON'

    summary = {
        'model': model_name,
        'train_accuracy': train_report['accuracy'],
        'test_accuracy': test_report['accuracy'],
    }
    for column_suffix, report_key in (('f1_macro', 'f1-score'),
                                      ('precision', 'precision'),
                                      ('recall', 'recall')):
        summary[f'train_{column_suffix}'] = train_macro[report_key]
        summary[f'test_{column_suffix}'] = test_macro[report_key]
    summary['accuracy_gap'] = gap_accuracy
    summary['f1_gap'] = gap_f1
    summary['overfitting'] = overfit_label
    summary['confusion_matrix'] = str(test_cm.tolist())

    print(f"\nüìà √âvaluation sur le jeu de test:")
    print(f"   Train accuracy: {train_report['accuracy']:.4f}")
    print(f"   Test accuracy:  {test_report['accuracy']:.4f}")
    print(f"   √âcart accuracy: {gap_accuracy:+.4f}")
    print(f"   Overfitting:    {overfit_label}")

    return summary, test_report, test_cm


def compare_models_gridsearch(X, y, param_grids='full', test_size=0.2,
                              cv=5, scoring='f1_macro', random_state=42):
    """
    Tune every model with GridSearchCV and evaluate each best estimator.

    Args:
        X: Features.
        y: Labels.
        param_grids: 'full', 'small', or a custom dict keyed by model name.
        test_size: Proportion of the data held out for testing.
        cv: Number of cross-validation folds.
        scoring: Metric optimised by the searches.
        random_state: Seed for the split and the base models.

    Returns:
        tuple: (grid_df, eval_df, grid_objects, detailed_reports) where
        grid_df summarises each search, eval_df holds the train/test
        evaluation of each best estimator, grid_objects maps model names to
        fitted GridSearchCV objects, and detailed_reports lists the test
        report and confusion matrix per model.
    """
    # Hold-out split, stratified on y.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    rule = f"{'='*70}"
    print(f"\n{rule}")
    print(f"GRIDSEARCH - COMPARAISON DES MOD√àLES")
    print(rule)
    print(f"Taille du dataset: {len(X)} √©chantillons")
    print(f"Train: {len(X_train)} | Test: {len(X_test)}")
    print(f"Distribution des classes: {dict(pd.Series(y).value_counts())}")

    # Resolve the grids: named preset, or the caller-supplied mapping as is.
    grids = (
        get_param_grids() if param_grids == 'full'
        else get_small_param_grids() if param_grids == 'small'
        else param_grids
    )

    # Untuned estimators handed to the searches, in evaluation order.
    base_models = {}
    base_models['DummyClassifier'] = DummyClassifier(random_state=random_state)
    base_models['RandomForestClassifier'] = RandomForestClassifier(random_state=random_state)
    base_models['XGBClassifier'] = XGBClassifier(random_state=random_state, eval_metric='logloss')
    base_models['CatBoostClassifier'] = CatBoostClassifier(random_state=random_state, verbose=0)

    grid_results, eval_results, detailed_reports = [], [], []
    grid_objects = {}

    for model_name, base_model in base_models.items():
        # Search over this model's grid on the training split.
        grid_search, grid_res = perform_grid_search(
            base_model, grids[model_name], X_train, y_train,
            model_name, cv=cv, scoring=scoring
        )
        grid_results.append(grid_res)
        grid_objects[model_name] = grid_search

        # Evaluate the best estimator on both splits.
        eval_res, report, cm = evaluate_best_model(
            grid_search, X_train, X_test, y_train, y_test, model_name
        )
        eval_results.append(eval_res)
        detailed_reports.append(dict(
            model=model_name,
            report=report,
            confusion_matrix=cm,
        ))

    return (
        pd.DataFrame(grid_results),
        pd.DataFrame(eval_results),
        grid_objects,
        detailed_reports,
    )


def save_gridsearch_results(grid_df, eval_df, grid_objects, detailed_reports,
                            output_prefix='gridsearch_results'):
    """
    Write every grid-search result to CSV and print a ranking and overfitting recap.

    Files written (all prefixed by output_prefix):
        _grid_summary.csv, _evaluation.csv, _full_cv_results.csv,
        _best_params.csv, _confusion_matrices.csv, _overfitting.csv

    Args:
        grid_df: DataFrame with one summary row per search
            (needs 'model', 'best_score', 'time_seconds').
        eval_df: DataFrame with the final train/test evaluation per model.
        grid_objects: Dict mapping model names to fitted GridSearchCV objects
            (cv_results_, best_params_ and scoring are read).
        detailed_reports: List of dicts with 'model' and 'confusion_matrix'
            (object exposing .tolist()).
        output_prefix: Prefix of the output files.
    """
    # 1. Grid-search summary.
    grid_df.to_csv(f'{output_prefix}_grid_summary.csv', index=False)
    print(f"\n‚úì R√©sum√© GridSearch: {output_prefix}_grid_summary.csv")

    # 2. Final evaluation.
    eval_df.to_csv(f'{output_prefix}_evaluation.csv', index=False)
    print(f"‚úì √âvaluation finale: {output_prefix}_evaluation.csv")

    # 3. Complete cv_results_ of every search, tagged with the model name.
    all_cv_results = []
    for model_name, grid_search in grid_objects.items():
        cv_res = pd.DataFrame(grid_search.cv_results_)
        cv_res.insert(0, 'model', model_name)
        all_cv_results.append(cv_res)

    full_cv_df = pd.concat(all_cv_results, ignore_index=True)
    full_cv_df.to_csv(f'{output_prefix}_full_cv_results.csv', index=False)
    print(f"‚úì R√©sultats CV complets: {output_prefix}_full_cv_results.csv")

    # 4. Best parameters, one row per (model, parameter).
    best_params_data = [
        {'model': model_name, 'parameter': param, 'value': str(value)}
        for model_name, grid_search in grid_objects.items()
        for param, value in grid_search.best_params_.items()
    ]
    best_params_df = pd.DataFrame(best_params_data)
    best_params_df.to_csv(f'{output_prefix}_best_params.csv', index=False)
    print(f"‚úì Meilleurs param√®tres: {output_prefix}_best_params.csv")

    # 5. Confusion matrices, serialised as nested-list strings.
    cm_data = [
        {'model': item['model'],
         'confusion_matrix': str(item['confusion_matrix'].tolist())}
        for item in detailed_reports
    ]
    cm_df = pd.DataFrame(cm_data)
    cm_df.to_csv(f'{output_prefix}_confusion_matrices.csv', index=False)
    print(f"‚úì Matrices de confusion: {output_prefix}_confusion_matrices.csv")

    # 6. Overfitting analysis.
    overfitting_cols = ['model', 'train_accuracy', 'test_accuracy', 'accuracy_gap',
                       'train_f1_macro', 'test_f1_macro', 'f1_gap', 'overfitting']
    overfitting_df = eval_df[overfitting_cols]
    overfitting_df.to_csv(f'{output_prefix}_overfitting.csv', index=False)
    print(f"‚úì Analyse overfitting: {output_prefix}_overfitting.csv")

    print(f"\n{'='*70}")
    print("TOUS LES R√âSULTATS ONT √âT√â SAUVEGARD√âS")
    print(f"{'='*70}")

    # Ranking header: the scoring metric is read from the first search object.
    first_search = next(iter(grid_objects.values()))
    print(f"\nüèÜ CLASSEMENT DES MOD√àLES (par {first_search.scoring}):")
    print("-" * 70)
    ranking = grid_df.sort_values('best_score', ascending=False)
    # sort_values keeps the original index labels, so the displayed position
    # must come from enumerate, not from the row label.
    for rank, (_, row) in enumerate(ranking.iterrows(), start=1):
        print(f"{rank}. {row['model']:25s} - Score: {row['best_score']:.4f} - Temps: {row['time_seconds']:.1f}s")

    print("\nüìä D√âTECTION D'OVERFITTING:")
    print("-" * 70)
    for _, row in eval_df.iterrows():
        status = "‚ö†Ô∏è  OVERFITTING" if row['overfitting'] == 'OUI' else "‚úì  Pas d'overfitting"
        print(f"{row['model']:25s}: {status} (√©cart: {row['accuracy_gap']:+.4f})")


# Grid search with the reduced grids (fast, meant for a quick check).
# Pass param_grids='full' for the exhaustive search.
grid_df, eval_df, grid_objects, detailed_reports = compare_models_gridsearch(
    X=X,
    y=y,
    param_grids='small',  # 'small', 'full', or a custom dict
    test_size=0.2,
    cv=3,  # 3 for a quick run, 5 recommended
    scoring='f1_macro',
    random_state=42,
)

# Write the CSV files and print the ranking / overfitting recap.
save_gridsearch_results(
    grid_df=grid_df,
    eval_df=eval_df,
    grid_objects=grid_objects,
    detailed_reports=detailed_reports,
    output_prefix='gridsearch_results',
)

# Overview table of the best score found for each model.
print("\n" + "="*70, "APER√áU DES MEILLEURS R√âSULTATS", "="*70, sep="\n")
print(grid_df.round(4).to_string(index=False))


GRIDSEARCH - COMPARAISON DES MOD√àLES
Taille du dataset: 1470 √©chantillons
Train: 1176 | Test: 294
Distribution des classes: {False: np.int64(1233), True: np.int64(237)}

üîç Grid Search pour DummyClassifier
Nombre de combinaisons √† tester: 2
M√©trique d'optimisation: f1_macro
Validation crois√©e: 3 folds
‚úì Termin√© en 1.07 secondes
üìä Meilleur score (f1_macro): 0.4853
üèÜ Meilleurs param√®tres:
   - strategy: stratified

üìà √âvaluation sur le jeu de test:
   Train accuracy: 0.7168
   Test accuracy:  0.7143
   √âcart accuracy: +0.0026
   Overfitting:    NON

üîç Grid Search pour RandomForestClassifier
Nombre de combinaisons √† tester: 12
M√©trique d'optimisation: f1_macro
Validation crois√©e: 3 folds
‚úì Termin√© en 1.16 secondes
üìä Meilleur score (f1_macro): 0.6145
üèÜ Meilleurs param√®tres:
   - max_depth: 10
   - max_features: sqrt
   - min_samples_split: 2
   - n_estimators: 100

üìà √âvaluation sur le jeu de test:
   Train accuracy: 0.9379
   Test accuracy:  0.8367