In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd

def encode_categorical_columns(df_: pd.DataFrame, onehot_threshold: int = 10, label_threshold: int = 50):
    """
    Automatically encode the categorical features of a DataFrame.

    Every column returned by ``select_dtypes(exclude=['number'])`` is treated
    as categorical. The encoding is chosen from the column's number of
    distinct values as reported by ``Series.nunique()``:

    - OneHotEncoder, if the count is <= onehot_threshold
    - LabelEncoder, if the count is > onehot_threshold and <= label_threshold
    - Frequency encoding, if the count is > label_threshold

    Args:
        df_: Input DataFrame. It is copied first and is not modified.
        onehot_threshold: Largest number of distinct values that is still
            one-hot encoded.
        label_threshold: Largest number of distinct values that is still
            label encoded; anything above is frequency encoded.

    Returns:
        A tuple ``(df_processed, decisions, encoders)``:

        - ``df_processed``: the encoded copy. One-hot encoded columns are
          dropped and their indicator columns (named ``"<col>_<category>"``)
          are appended at the end; label/frequency encoded columns are
          replaced in place.
        - ``decisions``: maps each original column name to a human-readable
          description of the encoding that was applied.
        - ``encoders``: maps each original column name to the fitted
          ``OneHotEncoder`` / ``LabelEncoder``, or to the frequency Series
          (value -> relative frequency) for frequency encoding.
    """

    df_processed = df_.copy()
    decisions = {}
    encoders = {}

    categorical_cols = df_processed.select_dtypes(exclude=['number']).columns

    for col in categorical_cols:
        # Only the distinct-value count drives the choice of encoder. The
        # previously computed (and unused) unique/row ratio was removed
        # because it raised ZeroDivisionError for a DataFrame with no rows.
        unique_count = df_processed[col].nunique()

        # ==== 1. One-Hot Encoding ====
        if unique_count <= onehot_threshold:
            ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            transformed = ohe.fit_transform(df_processed[[col]])
            new_cols = [f"{col}_{cat}" for cat in ohe.categories_[0]]
            # Reuse the original index so the concat below aligns row by row.
            df_encoded = pd.DataFrame(transformed, columns=new_cols, index=df_processed.index)
            df_processed = pd.concat([df_processed.drop(columns=[col]), df_encoded], axis=1)
            decisions[col] = f"One-Hot Encoding ({unique_count} categories)"
            encoders[col] = ohe

        # ==== 2. Label Encoding ====
        elif unique_count <= label_threshold:
            le = LabelEncoder()
            # Values are cast to str before fitting, so the fitted classes are strings.
            df_processed[col] = le.fit_transform(df_processed[col].astype(str))
            decisions[col] = f"Label Encoding ({unique_count} categories)"
            encoders[col] = le

        # ==== 3. Frequency Encoding ====
        else:
            # Replace each value with its relative frequency within the column.
            freq = df_processed[col].value_counts(normalize=True)
            df_processed[col] = df_processed[col].map(freq)
            decisions[col] = f"Frequency Encoding ({unique_count} categories)"
            encoders[col] = freq

    return df_processed, decisions, encoders


In [None]:
# Encode the non-numeric columns of `df`, then print one row per encoded
# column showing which encoding strategy was chosen for it.
df_encoded, encoding_decisions, encoders = encode_categorical_columns(df)
print(
    pd.DataFrame(
        {
            'Column': list(encoding_decisions.keys()),
            'Encoding type': list(encoding_decisions.values()),
        }
    )
)
