# Fusion des tables + split stratifié (train/valid/test) + exports
 Objectif :
 - fusionner application_train/test clean avec bureau_final et previous_final
 - exporter train_final / test_final
 - faire un split stratifié sur TARGET en 3 jeux (train/valid/test)
 - exporter train_split / valid_split / test_split

In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# Notebook display settings: no limit on the number of displayed columns,
# a 2000-character display width, and floats rendered with thousands
# separators and four decimals.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 2000),
    ("display.float_format", "{:,.4f}".format),
):
    pd.set_option(option_name, option_value)

In [2]:
# Locate the project folders relative to the current working directory.
CWD = Path.cwd()

# The project root is taken to be two directory levels above the CWD.
PROJECT_ROOT = CWD.parent.parent

# The three data folders all live under <project root>/data.
DATA_RAW, DATA_CLEAN, DATA_PROCESSED = (
    PROJECT_ROOT / "data" / folder_name
    for folder_name in ("raw", "clean", "processed")
)

# Only the processed folder is created here (with any missing parents).
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

for label, location in (
    ("CWD          =", CWD),
    ("PROJECT_ROOT =", PROJECT_ROOT),
    ("DATA_CLEAN   =", DATA_CLEAN),
    ("DATA_PROCESSED =", DATA_PROCESSED),
):
    print(label, location)

CWD          = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\notebooks\01_data_preparation
PROJECT_ROOT = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2
DATA_CLEAN   = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\clean
DATA_PROCESSED = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed


In [4]:

# Locations of the four input tables, all read from the processed-data folder.
TRAIN_CLEAN_PATH = DATA_PROCESSED / "application_train_base.csv"
TEST_CLEAN_PATH = DATA_PROCESSED / "application_test_base.csv"
BUREAU_FINAL_PATH = DATA_PROCESSED / "bureau_final.csv"
PREV_FINAL_PATH = DATA_PROCESSED / "previous_final.csv"

# Application tables first, then the two tables merged onto them later.
train = pd.read_csv(TRAIN_CLEAN_PATH)
test = pd.read_csv(TEST_CLEAN_PATH)
bureau = pd.read_csv(BUREAU_FINAL_PATH)
prev_app = pd.read_csv(PREV_FINAL_PATH)

# Report (rows, columns) for each loaded table.
for label, table in (
    ("train:", train),
    ("test :", test),
    ("bureau:", bureau),
    ("prev_app:", prev_app),
):
    print(label, table.shape)

train: (307511, 123)
test : (48744, 122)
bureau: (305811, 332)
prev_app: (338857, 1205)


In [5]:
def _merge_credit_history(base: pd.DataFrame) -> pd.DataFrame:
    """Left-join the bureau and previous-application tables onto *base*.

    Both joins use ``SK_ID_CURR`` as the key and keep every row of *base*.
    ``validate="many_to_one"`` makes pandas raise ``MergeError`` if a
    right-hand table contains duplicate ``SK_ID_CURR`` values; without this
    check such duplicates would silently multiply the rows of *base*.
    """
    return (
        base
        .merge(bureau, on="SK_ID_CURR", how="left", validate="many_to_one")
        .merge(prev_app, on="SK_ID_CURR", how="left", validate="many_to_one")
    )


# Apply the exact same enrichment to both application tables.
train_enriched = _merge_credit_history(train)
test_enriched = _merge_credit_history(test)

print("train_enriched:", train_enriched.shape)
print("test_enriched :", test_enriched.shape)

train_enriched: (307511, 1658)
test_enriched : (48744, 1657)


In [6]:

# Write the enriched train/test tables to the processed-data folder,
# without the DataFrame index column.
TRAIN_FINAL_PATH = DATA_PROCESSED / "train_final.csv"
TEST_FINAL_PATH = DATA_PROCESSED / "test_final.csv"

for frame, destination in (
    (train_enriched, TRAIN_FINAL_PATH),
    (test_enriched, TEST_FINAL_PATH),
):
    frame.to_csv(destination, index=False)

print("Export OK :")
for destination in (TRAIN_FINAL_PATH, TEST_FINAL_PATH):
    print(" -", destination)

Export OK :
 - c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\train_final.csv
 - c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\test_final.csv


### Séparation en 3 jeux stratifiés

In [7]:

def split_train_valid_test_stratified(
    df: pd.DataFrame,
    target_col: str = "TARGET",
    test_size: float = 0.15,
    valid_size: float = 0.15,
    random_state: int = 42
):
    """Split *df* into train / validation / test subsets, stratified on a column.

    The split is done in two passes with ``train_test_split``: the test set
    is carved out first, then the remainder is divided into train and
    validation. Both passes stratify on ``target_col`` and use the same
    ``random_state``.

    Args:
        df: DataFrame to split; must contain ``target_col``.
        target_col: Name of the column used for stratification.
        test_size: Fraction of *df* assigned to the test set; must satisfy
            ``0 < test_size < 1``.
        valid_size: Fraction of *df* (not of the remainder) assigned to the
            validation set; must satisfy ``valid_size > 0`` and
            ``test_size + valid_size < 1``.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        A ``(df_train, df_valid, df_test)`` tuple of DataFrames.

    Raises:
        ValueError: If ``target_col`` is missing from *df*, or if the
            requested fractions leave no room for a training set.
    """
    if target_col not in df.columns:
        raise ValueError(f"Colonne cible '{target_col}' absente du DataFrame.")

    # Reject invalid fractions up front: otherwise the failure surfaces
    # later with a message about the derived ratio, or as a division by zero.
    if not 0.0 < test_size < 1.0:
        raise ValueError(
            f"test_size doit être strictement compris entre 0 et 1 (reçu : {test_size})."
        )
    if valid_size <= 0.0 or test_size + valid_size >= 1.0:
        raise ValueError(
            "valid_size doit être > 0 et test_size + valid_size < 1 "
            f"(reçu : test_size={test_size}, valid_size={valid_size})."
        )

    # First pass: isolate the test set.
    df_train_temp, df_test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[target_col],
        random_state=random_state
    )

    # valid_size is a fraction of the full DataFrame, so rescale it to a
    # fraction of what remains once the test set has been removed.
    valid_ratio_adjusted = valid_size / (1.0 - test_size)

    # Second pass: divide the remainder into train and validation.
    df_train, df_valid = train_test_split(
        df_train_temp,
        test_size=valid_ratio_adjusted,
        stratify=df_train_temp[target_col],
        random_state=random_state
    )

    return df_train, df_valid, df_test


# Split the enriched training table into three stratified subsets.
split_params = dict(
    target_col="TARGET",
    test_size=0.15,
    valid_size=0.15,
    random_state=42,
)
df_train, df_valid, df_test = split_train_valid_test_stratified(
    train_enriched, **split_params
)

# Label/DataFrame pairs reused by both reports below.
labelled_splits = (
    ("TRAIN :", df_train),
    ("VALID :", df_valid),
    ("TEST  :", df_test),
)

# Size of each subset.
print("Splits :")
for label, subset in labelled_splits:
    print(label, subset.shape)

# Mean of TARGET per subset, to check that stratification kept it aligned.
print("\nTaux de défaut (TARGET=1) :")
for label, subset in labelled_splits:
    print(label, subset["TARGET"].mean())



Splits :
TRAIN : (215257, 1658)
VALID : (46127, 1658)
TEST  : (46127, 1658)

Taux de défaut (TARGET=1) :
TRAIN : 0.08072675917624049
VALID : 0.08073362672621241
TEST  : 0.08073362672621241


In [8]:
# Write the three subsets to the processed-data folder, without the
# DataFrame index column.
TRAIN_SPLIT_PATH = DATA_PROCESSED / "train_split.csv"
VALID_SPLIT_PATH = DATA_PROCESSED / "valid_split.csv"
TEST_SPLIT_PATH = DATA_PROCESSED / "test_split.csv"

for subset, destination in (
    (df_train, TRAIN_SPLIT_PATH),
    (df_valid, VALID_SPLIT_PATH),
    (df_test, TEST_SPLIT_PATH),
):
    subset.to_csv(destination, index=False)

print("\nExports splits OK :")
for destination in (TRAIN_SPLIT_PATH, VALID_SPLIT_PATH, TEST_SPLIT_PATH):
    print(" -", destination)


Exports splits OK :
 - c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\train_split.csv
 - c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\valid_split.csv
 - c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\test_split.csv
