# bureau

La table bureau décrit l’ensemble des crédits externes détenus par les clients auprès d’autres organismes. Après nettoyage, nous avons conservé les informations essentielles sur les montants, les dettes, les dates, les types de crédits et les statuts. L’agrégation s’effectue au niveau SK_ID_CURR, en calculant des statistiques globales (moyenne, min, max, somme, écart‑type) sur les montants et les durées, ainsi que des comptages de catégories (nombre de crédits actifs, fermés, en défaut). Des ratios comportementaux ont été ajoutés pour mesurer la proportion de crédits actifs, fermés ou problématiques. Un flag (BB_NO_RECORDS) indique les crédits sans aucun historique dans bureau_balance, et son ratio par client est également calculé ; les clients absents de la table bureau n’apparaissent pas dans cette vue. Cette vue consolidée résume le comportement global du client vis‑à‑vis de ses crédits externes.


In [5]:
from pathlib import Path
import numpy as np
import pandas as pd

# Display settings: show every column, avoid wrapping wide frames, and print
# floats with thousands separators and four decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 2000)
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")


def find_project_root(start, marker="data"):
    """Return the closest directory at or above *start* that contains *marker*.

    Args:
        start: Directory the upward search starts from.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The first directory among ``start`` and its ancestors that holds a
        ``marker`` sub-directory. When no such directory exists,
        ``start.parent.parent`` is returned, which is the location this
        notebook used to hard-code.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    # Nothing found: keep the historical "two levels up" layout assumption.
    return start.parent.parent


# 1) Current location (directory the notebook runs from)
CWD = Path.cwd()

# 2) Walk up until a "data" directory is found, so the notebook can be moved
#    to a different depth inside the project without editing this cell.
PROJECT_ROOT = find_project_root(CWD)

print("CWD         =", CWD)
print("PROJECT_ROOT=", PROJECT_ROOT)

# 3) Data directories
DATA_RAW       = PROJECT_ROOT / "data" / "raw"
DATA_CLEAN     = PROJECT_ROOT / "data" / "clean"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

# Output directories are created up front; the raw directory is only read.
DATA_CLEAN.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print("DATA_RAW      =", DATA_RAW)
print("DATA_PROCESSED=", DATA_PROCESSED)

CWD         = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\notebooks\01_data_preparation
PROJECT_ROOT= c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2
DATA_RAW      = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\raw
DATA_PROCESSED= c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed


## Chargement des tables

In [7]:
# Input files: the raw bureau extract, and bureau_balance_agg.csv from the
# processed folder (merged on SK_ID_BUREAU further down).
BB_FINAL_PATH = DATA_PROCESSED / "bureau_balance_agg.csv"
BUREAU_PATH = DATA_RAW / "bureau.csv"

# Echo the resolved paths before reading, so a wrong location is visible
# even if the read below fails.
print("BUREAU_PATH  =", BUREAU_PATH)
print("BB_FINAL_PATH=", BB_FINAL_PATH)

bureau = pd.read_csv(BUREAU_PATH)
bureau_bal = pd.read_csv(BB_FINAL_PATH)

for title, frame in (("bureau:", bureau), ("bureau_balance_final:", bureau_bal)):
    print(title, frame.shape)

# All cleaning is done on a copy; the frame as loaded stays untouched.
bureau_clean = bureau.copy()

BUREAU_PATH  = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\raw\bureau.csv
BB_FINAL_PATH= c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\bureau_balance_agg.csv
bureau: (1716428, 17)
bureau_balance_final: (817395, 26)


## Typage des IDs

In [8]:
# Cast the identifier columns to pandas' nullable integer dtype. Values that
# cannot be parsed as numbers become <NA> instead of raising.
for id_col in ("SK_ID_CURR", "SK_ID_BUREAU"):
    if id_col not in bureau_clean.columns:
        continue
    parsed_ids = pd.to_numeric(bureau_clean[id_col], errors="coerce")
    bureau_clean[id_col] = parsed_ids.astype("Int64")


## Numériques

In [9]:
# Numeric columns expected in bureau; only those actually present are kept so
# the cell still runs on a reduced extract.
num_cols = [
    name
    for name in (
        "DAYS_CREDIT",
        "CREDIT_DAY_OVERDUE",
        "DAYS_CREDIT_ENDDATE",
        "DAYS_ENDDATE_FACT",
        "AMT_CREDIT_MAX_OVERDUE",
        "CNT_CREDIT_PROLONG",
        "AMT_CREDIT_SUM",
        "AMT_CREDIT_SUM_DEBT",
        "AMT_CREDIT_SUM_LIMIT",
        "AMT_CREDIT_SUM_OVERDUE",
        "DAYS_CREDIT_UPDATE",
        "AMT_ANNUITY",
    )
    if name in bureau_clean.columns
]

# Force a numeric dtype; unparseable entries are turned into NaN.
for col in num_cols:
    raw_values = bureau_clean[col]
    bureau_clean[col] = pd.to_numeric(raw_values, errors="coerce")

## Catégorielles

In [10]:

# Categorical columns, restricted to the ones present in the frame.
cat_cols = [
    name
    for name in ("CREDIT_ACTIVE", "CREDIT_CURRENCY", "CREDIT_TYPE")
    if name in bureau_clean.columns
]

# Store them with the pandas "string" dtype and drop surrounding whitespace.
for col in cat_cols:
    as_text = bureau_clean[col].astype("string")
    bureau_clean[col] = as_text.str.strip()

## Flags NA

In [11]:
# One 0/1 indicator per cleaned numeric or categorical column, set to 1 where
# the value is missing after the coercions above.
for source_col in [*num_cols, *cat_cols]:
    is_missing = bureau_clean[source_col].isna()
    bureau_clean[f"BUREAU_{source_col}_NA"] = is_missing.astype("int8")

## Features métiers

In [12]:
# Derived per-credit features. Each one needs two source columns; when either
# is missing the feature is still created, filled with NaN.
present = set(bureau_clean.columns)

# Share of the credit amount that is still owed. A zero credit amount is
# replaced by NaN so the division yields NaN rather than +/-inf.
if {"AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT"} <= present:
    nonzero_amount = bureau_clean["AMT_CREDIT_SUM"].replace(0, np.nan)
    bureau_clean["BUREAU_DEBT_RATIO"] = bureau_clean["AMT_CREDIT_SUM_DEBT"].div(nonzero_amount)
else:
    bureau_clean["BUREAU_DEBT_RATIO"] = np.nan

# Share of the credit amount that is overdue (same zero handling).
if {"AMT_CREDIT_SUM", "AMT_CREDIT_SUM_OVERDUE"} <= present:
    nonzero_amount = bureau_clean["AMT_CREDIT_SUM"].replace(0, np.nan)
    bureau_clean["BUREAU_OVERDUE_RATIO"] = bureau_clean["AMT_CREDIT_SUM_OVERDUE"].div(nonzero_amount)
else:
    bureau_clean["BUREAU_OVERDUE_RATIO"] = np.nan

# DAYS_CREDIT_ENDDATE minus DAYS_CREDIT.
bureau_clean["BUREAU_CREDIT_DURATION"] = (
    bureau_clean["DAYS_CREDIT_ENDDATE"].sub(bureau_clean["DAYS_CREDIT"])
    if {"DAYS_CREDIT_ENDDATE", "DAYS_CREDIT"} <= present
    else np.nan
)

# DAYS_ENDDATE_FACT minus DAYS_CREDIT_ENDDATE.
bureau_clean["BUREAU_REAL_DELAY"] = (
    bureau_clean["DAYS_ENDDATE_FACT"].sub(bureau_clean["DAYS_CREDIT_ENDDATE"])
    if {"DAYS_ENDDATE_FACT", "DAYS_CREDIT_ENDDATE"} <= present
    else np.nan
)

## Flags comportementaux

In [13]:
# 0/1 indicators for the credit status, one per status value of interest.
status_flags = {
    "BUREAU_IS_ACTIVE": "Active",
    "BUREAU_IS_CLOSED": "Closed",
    "BUREAU_IS_BAD": "Bad debt",
}

if "CREDIT_ACTIVE" in bureau_clean.columns:
    credit_status = bureau_clean["CREDIT_ACTIVE"]
    for flag_name, status_label in status_flags.items():
        # CREDIT_ACTIVE was cast to the nullable "string" dtype, so the
        # comparison yields <NA> for missing statuses. Those are counted as
        # "not this status"; without fillna the int8 cast would raise.
        matches = credit_status.eq(status_label).fillna(False)
        bureau_clean[flag_name] = matches.astype("int8")
else:
    # No status column: every flag is 0, stored as int8 like the branch above.
    for flag_name in status_flags:
        bureau_clean[flag_name] = np.int8(0)

print("bureau_clean:", bureau_clean.shape)

bureau_clean: (1716428, 39)


## Merge bureau_balance (sur SK_ID_BUREAU)

In [None]:


# Attach the bureau_balance aggregate to each credit. The left join keeps
# every bureau row, including credits without a match in bureau_bal.
bureau_merged = pd.merge(bureau_clean, bureau_bal, how="left", on="SK_ID_BUREAU")

# Columns contributed by bureau_bal (everything but the join key).
bb_cols = bureau_bal.columns.drop("SK_ID_BUREAU").tolist()

# 1 when all bureau_bal columns are missing on the row, else 0.
all_bb_missing = bureau_merged[bb_cols].isna().all(axis=1)
bureau_merged["BB_NO_RECORDS"] = all_bb_missing.astype("int8")

print("bureau_merged:", bureau_merged.shape)



bureau_merged: (1716428, 65)


## Agrégations numériques par SK_ID_CURR

In [15]:
# Identifier / target columns must not be aggregated as features.
exclude = {"SK_ID_CURR", "SK_ID_BUREAU", "TARGET"}

# Every numeric column of the merged frame, minus the excluded ones.
num_cols_all = [
    name
    for name in bureau_merged.select_dtypes(include=["number"]).columns.tolist()
    if name not in exclude
]

# One row per SK_ID_CURR, five statistics per numeric column.
agg_funcs = ["mean", "min", "max", "sum", "std"]
per_client = bureau_merged.groupby("SK_ID_CURR")
bureau_num_agg = per_client[num_cols_all].agg(agg_funcs)

# Flatten the (column, statistic) MultiIndex into upper-case names.
# NOTE(review): source columns that already start with "BUREAU_" end up as
# "BUREAU_BUREAU_..." here — confirm downstream code expects that.
flat_names = []
for feature, stat in bureau_num_agg.columns.to_flat_index():
    flat_names.append(f"BUREAU_{feature}_{stat}".upper())
bureau_num_agg.columns = flat_names

## Dummies catégorielles

In [16]:
# Categorical columns available after the merge.
cat_cols = [
    name
    for name in ("CREDIT_ACTIVE", "CREDIT_CURRENCY", "CREDIT_TYPE")
    if name in bureau_merged.columns
]

# One-hot encode each category (plus a NaN level), stored as int8.
dummy_prefixes = [f"BUREAU_{name}" for name in cat_cols]
bureau_cat_dummies = pd.get_dummies(bureau_merged[cat_cols], prefix=dummy_prefixes, dummy_na=True)
bureau_cat_dummies = bureau_cat_dummies.astype("int8")

# Bring the client id back in, then sum the indicators per client, which
# gives the number of credits in each category.
bureau_cat_dummies["SK_ID_CURR"] = bureau_merged["SK_ID_CURR"]
dummies_by_client = bureau_cat_dummies.groupby("SK_ID_CURR")
bureau_cat_agg = dummies_by_client.sum()


## Ratio + counts

In [17]:

by_client = bureau_merged.groupby("SK_ID_CURR")

# The mean of a 0/1 flag within a client is the share of that client's
# credits carrying the flag.
ratio_names = {
    "BUREAU_IS_ACTIVE": "BUREAU_RATIO_ACTIVE",
    "BUREAU_IS_CLOSED": "BUREAU_RATIO_CLOSED",
    "BUREAU_IS_BAD": "BUREAU_RATIO_BAD",
    "BB_NO_RECORDS": "BUREAU_RATIO_BB_NO_RECORDS",
}
bureau_ratios = by_client[list(ratio_names)].mean().rename(columns=ratio_names)

# Number of non-missing SK_ID_BUREAU values per client.
bureau_counts = by_client["SK_ID_BUREAU"].count().to_frame("BUREAU_COUNT_CREDITS")

## Dataset final + exports

In [18]:
# Assemble the client-level table: start from the numeric aggregates and
# left-join the other per-client frames on their shared SK_ID_CURR index.
bureau_final = bureau_num_agg
for extra_block in (bureau_cat_agg, bureau_ratios, bureau_counts):
    bureau_final = bureau_final.join(extra_block)

# Turn SK_ID_CURR back into a regular column before exporting.
bureau_final = bureau_final.reset_index()

print("bureau_final:", bureau_final.shape)

# Write the result to the processed folder, without the positional index.
OUT_PATH = DATA_PROCESSED / "bureau_final.csv"
bureau_final.to_csv(OUT_PATH, index=False)

print("Export:", OUT_PATH)

bureau_final: (305811, 332)
Export: c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\bureau_final.csv
