In [1]:

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os

In [2]:
# src/preprocessing.py


def preprocess_data(
    input_file="../data/raw/retail_customers_COMPLETE_CATEGORICAL.csv",
    output_dir="data/train_test/"
):
    """
    Full preprocessing pipeline for retail customer dataset.

    Inputs:
        input_file: str, path to raw CSV file
        output_dir: str, path to save processed datasets and transformers
    Outputs:
        X_train, X_test, y_train, y_test: processed datasets
    """

    # -------------------------
    # 1. Load data
    # -------------------------
    df = pd.read_csv(input_file, encoding='utf-8')

    # -------------------------
    # 2. Feature Engineering
    # -------------------------
    df["MonetaryPerDay"] = df["MonetaryTotal"] / (df["Recency"] + 1)
    df["AvgBasketValue"] = df["MonetaryTotal"] / (df["Frequency"] + 1)
    df["TenureRatio"] = df["Recency"] / (df["CustomerTenureDays"] + 1)

    # -------------------------
    # 3. Parsing Dates
    # -------------------------
    df["RegistrationDate"] = pd.to_datetime(
        df["RegistrationDate"], dayfirst=True, errors="coerce"
    )
    df["RegYear"] = df["RegistrationDate"].dt.year
    df["RegMonth"] = df["RegistrationDate"].dt.month
    df["RegDay"] = df["RegistrationDate"].dt.day
    df["RegWeekday"] = df["RegistrationDate"].dt.weekday

    # -------------------------
    # 4. Handle missing values
    # -------------------------
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = df.select_dtypes(include=["object"]).columns

    num_imputer = SimpleImputer(strategy="median")
    cat_imputer = SimpleImputer(strategy="most_frequent")

    df[num_cols] = num_imputer.fit_transform(df[num_cols])
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

    # -------------------------
    # 5. Encode categorical variables
    # -------------------------
    ordinal_map = {
        "AgeCategory": {"18-24":1, "25-34":2, "35-44":3, "45-54":4, "55-64":5, "65+":6, "Inconnu":0},
        "SpendingCategory": {"Low":1, "Medium":2, "High":3, "VIP":4},
        "BasketSizeCategory": {"Petit":1, "Moyen":2, "Grand":3, "Inconnu":0},
        "LoyaltyLevel": {"Nouveau":1, "Jeune":2, "Etabli":3, "Ancien":4, "Inconnu":0},
        "ChurnRiskCategory": {"Faible":1, "Moyen":2, "Élevé":3, "Critique":4}
    }

    for col, mapping in ordinal_map.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)

    one_hot_cols = [
        "RFMSegment", "CustomerType", "FavoriteSeason", "PreferredTimeOfDay", 
        "Region", "WeekendPreference", "ProductDiversity", "Gender", "AccountStatus"
    ]
    df = pd.get_dummies(df, columns=[c for c in one_hot_cols if c in df.columns], drop_first=True)

    # -------------------------
    # 6. Split features and target
    # -------------------------
    target_col = "Churn"
    X = df.drop(columns=[target_col, "NewsletterSubscribed", "LastLoginIP", "RegistrationDate"])
    y = df[target_col]

    # -------------------------
    # 7. Train/Test Split
    # -------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # -------------------------
    # 8. Feature Scaling (prevent data leakage)
    # -------------------------
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # -------------------------
    # 9. Save processed datasets and transformers
    # -------------------------
    os.makedirs(output_dir, exist_ok=True)

    X_train.to_csv(os.path.join(output_dir, "X_train.csv"), index=False)
    X_test.to_csv(os.path.join(output_dir, "X_test.csv"), index=False)
    y_train.to_csv(os.path.join(output_dir, "y_train.csv"), index=False)
    y_test.to_csv(os.path.join(output_dir, "y_test.csv"), index=False)

    joblib.dump(scaler, os.path.join(output_dir, "scaler.joblib"))
    joblib.dump(num_imputer, os.path.join(output_dir, "num_imputer.joblib"))
    joblib.dump(cat_imputer, os.path.join(output_dir, "cat_imputer.joblib"))

    print(f"Preprocessing complete! Train/Test sets saved in {output_dir}")
    return X_train, X_test, y_train, y_test


if __name__ == "__main__":
    preprocess_data()


  df["RegistrationDate"] = pd.to_datetime(
See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = df.select_dtypes(include=["object"]).columns


Preprocessing complete! Train/Test sets saved in data/train_test/
