## Imports

In [None]:
import pandas as pd

In [None]:
# Location of the training feature table, relative to the notebook's working directory.
x_train_path = "data/X_train_Hi5.csv"

# Read the CSV into a DataFrame with pandas' default parsing options.
X_train = pd.read_csv(filepath_or_buffer=x_train_path)

## Remplacement des NaNs avec un imputer personnalisé

In [None]:
# Names of the columns that pandas parsed with a numeric dtype, as a plain list.
numerical_columns = (
    X_train
    .select_dtypes(include="number")
    .columns
    .tolist()
)

Cette liste de colonnes numériques n'est pas exhaustive : certaines colonnes contiennent des valeurs numériques mais sont typées `object` à cause de quelques valeurs aberrantes. C'est à corriger.

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# Working copy of the training features, so that changes made through `df`
# do not touch `X_train`.
df = X_train.copy(deep=True)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class GroupMeanImputer(BaseEstimator, TransformerMixin):
    """Impute missing values with the mean of the row's group.

    For every column in ``target_cols``, a missing value is replaced by the
    mean of that column over the rows sharing the same ``group_col`` value
    (means are learned in ``fit``). When no usable group mean exists (group
    not seen during ``fit``, group whose values were all missing, or missing
    group key), the global statistic is used instead.

    Parameters
    ----------
    group_col : label
        Column to group by (e.g. ``'department'``).
    target_cols : sequence of labels
        Columns to impute.
    global_strategy : {"mean", "median"} or other, default "mean"
        Global statistic used when no group mean is available. Any value
        other than ``"mean"`` or ``"median"`` (typically ``None``) disables
        the global statistic and uses ``fallback_value`` directly.
    fallback_value : scalar or None, default None
        Constant used when no global strategy is selected, and also when the
        selected global statistic is itself NaN (column entirely missing
        during ``fit``). If it is ``None`` and nothing else is available, the
        missing values are left as NaN.

    Attributes
    ----------
    group_means_ : dict
        Maps each target column to a Series of per-group means.
    global_stats_ : Series or dict
        Per-column fallback statistic, indexable by column label.
    """

    def __init__(self, group_col, target_cols, global_strategy="mean", fallback_value=None):
        self.group_col = group_col
        self.target_cols = target_cols
        self.global_strategy = global_strategy
        self.fallback_value = fallback_value

    def fit(self, X, y=None):
        """Learn per-group means and the global fallback statistic from ``X``.

        ``y`` is ignored. Returns ``self``.
        """
        target_cols = list(self.target_cols)

        # Build the groupby once and reuse it for every target column.
        grouped = X.groupby(self.group_col)
        self.group_means_ = {col: grouped[col].mean() for col in target_cols}

        if self.global_strategy in ("mean", "median"):
            if self.global_strategy == "mean":
                global_stats = X[target_cols].mean()
            else:
                global_stats = X[target_cols].median()
            # A column with no observed value has a NaN statistic; use the
            # constant instead when one was provided.
            if self.fallback_value is not None:
                global_stats = global_stats.fillna(self.fallback_value)
            self.global_stats_ = global_stats
        else:
            self.global_stats_ = {col: self.fallback_value for col in target_cols}

        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing target values imputed.

        Observed (non-missing) values are never modified, including on rows
        whose group key is missing.
        """
        X = X.copy()

        # Snapshot the keys so that imputing the group column itself (if it is
        # also a target) does not change the grouping of later columns.
        group_keys = X[self.group_col]

        for col in self.target_cols:
            global_stat = self.global_stats_[col]

            # Row-aligned candidate fill values: the learned mean of each
            # row's group, NaN for unseen groups or missing keys.
            fill_values = group_keys.map(self.group_means_[col])

            # fillna(None) raises, and filling with NaN is a no-op, so only
            # apply the global fallback when it is an actual value.
            if global_stat is not None and not pd.isna(global_stat):
                fill_values = fill_values.fillna(global_stat)

            # Plain assignment instead of chained ``inplace=True``, which does
            # not update the frame under pandas Copy-on-Write.
            X[col] = X[col].fillna(fill_values)

        return X

In [None]:
# Grouping key and the columns to impute (every column detected as numeric).
group_col = "piezo_station_department_code"
target_cols = numerical_columns

# Fresh copy of the training features to feed the imputer.
df = X_train.copy()

# global_strategy=None disables the global mean/median: any NaN that the
# per-group mean cannot fill is replaced directly by fallback_value (0).
# Pass "mean" or "median" here to use a global statistic instead.
imputer = GroupMeanImputer(group_col, target_cols, global_strategy=None, fallback_value=0)

# Learn the group statistics on df and impute that same frame.
imputed_df = imputer.fit_transform(df)

Notons qu'on peut combiner cet imputer avec un `ColumnTransformer` afin d'appliquer des stratégies d'imputation différentes selon les colonnes.