# POS CASH Balance

Cette table décrit l’historique des crédits POS (achats en plusieurs fois). Dans ce notebook, nous agrégeons les informations au niveau SK_ID_PREV (une ligne par crédit précédent), en calculant le nombre de cycles, les retards moyens et maximaux, le nombre de mois actifs et la fréquence des statuts. Ces indicateurs permettent d’évaluer la régularité des paiements POS. Un flag POS_NO_RECORDS signale les lignes pour lesquelles aucun indicateur POS n’est disponible. L’ensemble fournit une vue synthétique du comportement du client sur les crédits à court terme.


In [1]:

from pathlib import Path
import pandas as pd
import numpy as np
import warnings

# Notebook-wide pandas display settings: show every column, use a very wide
# console width, and render floats with thousands separators and 4 decimals.
DISPLAY_OPTIONS = {
    "display.max_columns": None,
    "display.width": 2000,
    "display.float_format": lambda x: f"{x:,.4f}",
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# Silence RuntimeWarning messages for the whole session.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

In [2]:

# Project layout, resolved from the directory the notebook is running in.
# NOTE(review): assumes the working directory sits two levels below the project root — confirm.
CWD = Path.cwd()
PROJECT_ROOT = CWD.parent.parent

DATA_DIR = PROJECT_ROOT / "data"
DATA_RAW = DATA_DIR / "raw"
DATA_CLEAN = DATA_DIR / "clean"
DATA_PROCESSED = DATA_DIR / "processed"

# Make sure both output folders exist before anything is written to them.
for output_dir in (DATA_CLEAN, DATA_PROCESSED):
    output_dir.mkdir(parents=True, exist_ok=True)

print("DATA_RAW       =", DATA_RAW)
print("DATA_PROCESSED =", DATA_PROCESSED)

# Raw POS cash balance extract; stop early with a clear message if it is missing.
POS_PATH = DATA_RAW / "POS_CASH_balance.csv"
assert POS_PATH.exists(), f"Fichier introuvable : {POS_PATH}"

pos = pd.read_csv(POS_PATH)
print("POS_CASH_balance:", pos.shape)
pos.head()

DATA_RAW       = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\raw
DATA_PROCESSED = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed
POS_CASH_balance: (10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [3]:

# Print the raw frame's columns, dtypes and memory usage.
pos.info()

# The cleaning steps below write into pos_clean; a deep copy keeps `pos` as loaded.
pos_clean = pos.copy(deep=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   SK_ID_CURR             int64  
 2   MONTHS_BALANCE         int64  
 3   CNT_INSTALMENT         float64
 4   CNT_INSTALMENT_FUTURE  float64
 5   NAME_CONTRACT_STATUS   object 
 6   SK_DPD                 int64  
 7   SK_DPD_DEF             int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 610.4+ MB


## Typage propre

In [4]:
# Key and month-offset columns: coerce unparseable values to missing, then
# store as nullable integers so missing values do not force a float dtype.
for key_col in ("SK_ID_PREV", "SK_ID_CURR", "MONTHS_BALANCE"):
    pos_clean[key_col] = pd.to_numeric(pos_clean[key_col], errors="coerce").astype("Int64")

# Measure columns: same coercion, keeping whatever numeric dtype pandas infers.
for measure_col in ("CNT_INSTALMENT", "CNT_INSTALMENT_FUTURE", "SK_DPD", "SK_DPD_DEF"):
    pos_clean[measure_col] = pd.to_numeric(pos_clean[measure_col], errors="coerce")

## Nettoyage des variables catégorielles

In [5]:
# Normalise the contract status: pandas string dtype with surrounding
# whitespace removed, then the "XNA" placeholder turned into a missing value.
status_stripped = pos_clean["NAME_CONTRACT_STATUS"].astype("string").str.strip()
pos_clean["NAME_CONTRACT_STATUS"] = status_stripped.replace({"XNA": pd.NA})

## Flags NA

In [6]:

# Flag column -> source column. Each flag is 1 on rows where the source value
# is missing and 0 otherwise.
NA_FLAG_SOURCES = {
    "POS_CNT_INSTALMENT_NA": "CNT_INSTALMENT",
    "POS_CNT_INSTALMENT_FUTURE_NA": "CNT_INSTALMENT_FUTURE",
    "POS_STATUS_NA": "NAME_CONTRACT_STATUS",
    "POS_SK_DPD_NA": "SK_DPD",
    "POS_SK_DPD_DEF_NA": "SK_DPD_DEF",
}
for flag_col, source_col in NA_FLAG_SOURCES.items():
    pos_clean[flag_col] = pos_clean[source_col].isna().astype(int)

pos_clean.shape


(10001358, 13)

## Agrégations numériques

In [7]:

# Output column -> (source column, aggregation), applied per SK_ID_PREV.
# NOTE(review): SK_ID_CURR is not carried into this table, so the client key
# has to come from another source when these features are merged — confirm.
POS_NUM_AGGREGATIONS = {
    # Span, average and number of monthly snapshots
    "POS_MONTHS_MIN": ("MONTHS_BALANCE", "min"),
    "POS_MONTHS_MAX": ("MONTHS_BALANCE", "max"),
    "POS_MONTHS_MEAN": ("MONTHS_BALANCE", "mean"),
    "POS_MONTHS_COUNT": ("MONTHS_BALANCE", "count"),
    # Instalment counts (total and remaining)
    "POS_INSTALMENT_MEAN": ("CNT_INSTALMENT", "mean"),
    "POS_INSTALMENT_MAX": ("CNT_INSTALMENT", "max"),
    "POS_INSTALMENT_FUTURE_MEAN": ("CNT_INSTALMENT_FUTURE", "mean"),
    "POS_INSTALMENT_FUTURE_MAX": ("CNT_INSTALMENT_FUTURE", "max"),
    # Days past due
    "POS_SK_DPD_MEAN": ("SK_DPD", "mean"),
    "POS_SK_DPD_MAX": ("SK_DPD", "max"),
    "POS_SK_DPD_DEF_MEAN": ("SK_DPD_DEF", "mean"),
    "POS_SK_DPD_DEF_MAX": ("SK_DPD_DEF", "max"),
    # Extra totals (often useful)
    "POS_SK_DPD_SUM": ("SK_DPD", "sum"),
    "POS_SK_DPD_DEF_SUM": ("SK_DPD_DEF", "sum"),
}

pos_num = pos_clean.groupby("SK_ID_PREV").agg(**POS_NUM_AGGREGATIONS)

## Comptage des statuts

In [8]:
# One indicator column per contract status, prefixed with "POS_STATUS". Rows
# with a missing status get no indicator because dummy_na is disabled.
# The grouping key is attached as a last column.
pos_status_dummies = pd.get_dummies(
    pos_clean["NAME_CONTRACT_STATUS"], prefix="POS_STATUS", dummy_na=False
).assign(SK_ID_PREV=pos_clean["SK_ID_PREV"])

# Summing the indicators per SK_ID_PREV gives the number of rows in each status.
pos_status = pos_status_dummies.groupby("SK_ID_PREV").sum()

## Ratios

In [9]:
# Share of each status among the monthly rows recorded for a SK_ID_PREV.
# A zero count is replaced by NaN so the division gives NaN rather than inf.
denom = pos_num["POS_MONTHS_COUNT"].replace(0, np.nan)

# Derive the ratios from the count columns only, and rebuild pos_status from
# those counts. This keeps the cell idempotent: re-running it no longer
# produces "*_RATIO_RATIO" columns computed from the ratios of a previous run.
status_count_cols = [col for col in pos_status.columns if not col.endswith("_RATIO")]
status_counts = pos_status[status_count_cols]
status_ratios = status_counts.div(denom, axis=0).add_suffix("_RATIO")

# Left join on the SK_ID_PREV index: count columns first, then their ratios.
pos_status = status_counts.join(status_ratios)

In [10]:

# Put the numeric aggregates and the status counts/ratios side by side
# (left join on the SK_ID_PREV index).
pos_features = pos_num.join(pos_status)

# 1 when every feature of the row is missing, 0 otherwise.
# NOTE(review): POS_MONTHS_COUNT comes from a count aggregation, which does not
# produce missing values, so this flag is expected to be 0 on every row at this
# level — confirm whether it is meant to be computed after merging onto the
# client table.
row_all_missing = pos_features.isna().all(axis=1)

# Attach the flag, then turn the SK_ID_PREV index back into a regular column.
pos_prev_agg = pos_features.assign(POS_NO_RECORDS=row_all_missing.astype(int)).reset_index()

pos_prev_agg.shape

(936325, 32)

## Export

In [11]:
# Write the per-SK_ID_PREV feature table to the processed-data folder,
# without the DataFrame index, and report where it went.
POS_PREV_FINAL_PATH = DATA_PROCESSED.joinpath("pos_cash_by_prev.csv")
pos_prev_agg.to_csv(path_or_buf=POS_PREV_FINAL_PATH, index=False)
print("Export niveau SK_ID_PREV terminé :", POS_PREV_FINAL_PATH)

Export niveau SK_ID_PREV terminé : c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\pos_cash_by_prev.csv
