# Préprocessing Baseline (minimal + pipeline-ready)
**Objectif** : corrections déterministes + export datasets "base".
Tout ce qui dépend des stats (médiane/mode, etc.) sera fait dans le pipeline sklearn.

In [1]:

from pathlib import Path
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 2000)


# Project root.
# The notebook historically assumed the kernel starts two levels below the
# project root. That candidate is still tried first (so the result is unchanged
# whenever the original layout holds); otherwise the current directory and its
# ancestors are searched for the raw training file, which lets the notebook run
# when the kernel is started from another folder inside the project.
CWD = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (CWD.parent.parent, CWD, *CWD.parents)
        if (candidate / "data" / "raw" / "application_train.csv").is_file()
    ),
    CWD.parent.parent,  # nothing found: keep the historical assumption
)

# Input / output locations, all derived from the project root.
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
TRAIN_PATH = DATA_RAW / "application_train.csv"
TEST_PATH  = DATA_RAW / "application_test.csv"

# Show the resolved paths before reading, so a wrong root is easy to diagnose.
print("TRAIN_PATH =", TRAIN_PATH)
print("TEST_PATH  =", TEST_PATH)

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

print(train.shape, test.shape)

TRAIN_PATH = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\raw\application_train.csv
TEST_PATH  = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\raw\application_test.csv
(307511, 122) (48744, 121)


## Fonctions utilitaires « déterministes »

### Détection colonnes binaires + mapping

In [2]:
def find_binary_columns(df: pd.DataFrame):
    """Detect binary columns encoded as "Y"/"N" strings or as 0/1 numbers.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    tuple[list, list]
        ``(binary_yn, binary_01)``, both in the column order of ``df``:

        * ``binary_yn``: object-dtype columns whose non-null values are all
          in ``{"Y", "N"}``.
        * ``binary_01``: columns with exactly two distinct non-null values,
          both in ``{0, 1}``. The target column is NOT excluded here; callers
          must filter it out themselves.

    Notes
    -----
    Columns without any non-null value are never reported: the empty set is a
    subset of every set, so without this guard an all-missing object column
    would be classified as a Y/N column.
    """
    binary_yn = []
    binary_01 = []

    for col in df.columns:
        # Distinct non-null values, computed once and reused for both checks.
        values = set(df[col].dropna().unique())
        if not values:
            continue

        # Y/N binaries
        if df[col].dtype == "object" and values <= {"Y", "N"}:
            binary_yn.append(col)

        # 0/1 binaries: both values must actually be present
        if len(values) == 2 and values <= {0, 1}:
            binary_01.append(col)

    return binary_yn, binary_01


def apply_binary_mapping(train: pd.DataFrame, test: pd.DataFrame, target_col="TARGET"):
    """Convert binary columns of ``train`` and ``test`` to nullable booleans.

    Binary columns are detected on ``train`` only (via
    ``find_binary_columns``) and the same conversion is then applied to both
    frames, so the two stay consistent.

    Parameters
    ----------
    train, test : pd.DataFrame
        Input frames. They are copied; the caller's objects are not modified.
    target_col : str, default "TARGET"
        Column that must never be converted.

    Returns
    -------
    tuple
        ``(train, test, info)`` where ``info`` is a dict with the keys
        ``"binary_yn"`` and ``"binary_01"`` listing the converted columns.

    Notes
    -----
    * Both kinds of columns end up with the pandas nullable ``"boolean"``
      dtype, so missing values are kept as ``<NA>`` instead of producing an
      ``object`` column mixing ``True``/``False``/``NaN``.
    * A detected column that is absent from one of the frames is skipped for
      that frame instead of raising ``KeyError``.
    * In a Y/N column, any value other than "Y"/"N" becomes missing.
    """
    train = train.copy()
    test = test.copy()

    bin_yn, bin_01 = find_binary_columns(train)
    bin_yn = [c for c in bin_yn if c != target_col]
    bin_01 = [c for c in bin_01 if c != target_col]

    yn_to_bool = {"Y": True, "N": False}
    frames = (train, test)

    # Y/N -> nullable boolean
    for c in bin_yn:
        for df in frames:
            if c in df.columns:
                df[c] = df[c].map(yn_to_bool).astype("boolean")

    # 0/1 -> nullable boolean
    for c in bin_01:
        for df in frames:
            if c in df.columns:
                # Going through Int64 keeps the cast safe when the column was
                # read as float because it contains missing values.
                df[c] = df[c].astype("Int64").astype("boolean")

    return train, test, {"binary_yn": bin_yn, "binary_01": bin_01}


# Convert the binary columns of both frames and keep the list of converted
# columns for reference.
train, test, bin_info = apply_binary_mapping(train=train, test=test)
print(
    "Y/N:", len(bin_info["binary_yn"]),
    " | 0/1:", len(bin_info["binary_01"]),
)

Y/N: 2  | 0/1: 32


## Correction des sentinelles

In [3]:
def fix_sentinels(train: pd.DataFrame, test: pd.DataFrame):
    """Replace sentinel codes by missing values in ``train`` and ``test``.

    Two deterministic corrections are applied to copies of the inputs:

    1. ``DAYS_EMPLOYED``: rows equal to 365243 are flagged in a new ``int8``
       column ``FLAG_DAYS_EMPLOYED_SENTINEL`` (1 = sentinel, 0 = otherwise) and
       the value itself is replaced by ``NaN``. This step runs only when
       ``train`` has the column.
    2. Object-dtype columns of ``train``: the codes ``"XNA"`` and ``"XAP"``
       are replaced by ``NaN`` in both frames.

    Parameters
    ----------
    train, test : pd.DataFrame
        Input frames. They are copied; the caller's objects are not modified.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        The corrected ``(train, test)``.

    Notes
    -----
    * The sentinel is removed with ``Series.mask``, which returns a new
      (float) column. Writing ``NaN`` into an integer column through ``.loc``
      is an in-place upcast that pandas has deprecated.
    * A column that is absent from one of the frames is skipped for that frame
      instead of raising ``KeyError``.
    """
    train = train.copy()
    test = test.copy()
    frames = (train, test)

    # 1) DAYS_EMPLOYED sentinel
    if "DAYS_EMPLOYED" in train.columns:
        sentinel = 365243

        for df in frames:
            if "DAYS_EMPLOYED" not in df.columns:
                continue
            # Compute the comparison once, before the column is altered.
            is_sentinel = df["DAYS_EMPLOYED"] == sentinel
            df["FLAG_DAYS_EMPLOYED_SENTINEL"] = is_sentinel.astype("int8")
            df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].mask(is_sentinel)

    # 2) Categorical "XNA" / "XAP" codes (often an encoded missing value)
    missing_codes = {"XNA": np.nan, "XAP": np.nan}
    cat_cols = train.select_dtypes(include=["object"]).columns
    for c in cat_cols:
        for df in frames:
            if c in df.columns:
                df[c] = df[c].replace(missing_codes)

    return train, test


# Apply the sentinel corrections to both frames.
train, test = fix_sentinels(train=train, test=test)

In [4]:
## Contrôle de l'alignement train/test

In [5]:

TARGET_COL = "TARGET"

# Column names present on each side.
train_cols = set(train.columns)
test_cols = set(test.columns)

# Train-only columns, ignoring the target (the test set has no target),
# and test-only columns.
missing_in_test = sorted(train_cols.difference(test_cols, {TARGET_COL}))
extra_in_test = sorted(test_cols.difference(train_cols))

print("Colonnes train absentes test:", len(missing_in_test))
print("Colonnes test absentes train:", len(extra_in_test))

# Both counts are expected to be 0 (target aside); otherwise show a sample
# of the offending column names.
if len(missing_in_test) > 0:
    print("Exemples missing_in_test:", missing_in_test[:10])
if len(extra_in_test) > 0:
    print("Exemples extra_in_test:", extra_in_test[:10])

Colonnes train absentes test: 0
Colonnes test absentes train: 0


## Export

In [6]:

OUT_TRAIN = DATA_PROCESSED / "application_train_base.csv"
OUT_TEST  = DATA_PROCESSED / "application_test_base.csv"

# to_csv does not create missing folders: make sure the output directory
# exists so the export also works on a fresh checkout.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the files.
train.to_csv(OUT_TRAIN, index=False)
test.to_csv(OUT_TEST, index=False)

print("Saved:", OUT_TRAIN)
print("Saved:", OUT_TEST)

Saved: c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\application_train_base.csv
Saved: c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\application_test_base.csv
