# Installments payments

Cette table décrit les paiements d’échéances des crédits précédents. Nous calculons, au niveau SK_ID_PREV (une ligne par crédit précédent), les retards moyens et maximums, les paiements anticipés, les ratios payé/dû et le nombre total de paiements. Un flag INST_NO_RECORDS indique l’absence d’historique d’échéances. Ces agrégations permettent d’évaluer la ponctualité et la fiabilité du client dans le remboursement de ses crédits.


In [3]:

from pathlib import Path
import pandas as pd
import numpy as np
import warnings

# Notebook-wide display settings: show every column, use a wide layout,
# and render floats with a thousands separator and four decimals.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 2000),
    ("display.float_format", "{:,.4f}".format),
):
    pd.set_option(option_name, option_value)

# Silence RuntimeWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

In [4]:
# Project layout, resolved from the directory the notebook is run in:
# the project root is taken to be two levels above the working directory.
CWD = Path.cwd()
PROJECT_ROOT = CWD.parent.parent

# Data folders: raw inputs, cleaned tables, processed (aggregated) outputs.
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_CLEAN = PROJECT_ROOT.joinpath("data", "clean")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

# Output folders are created on demand; the raw folder is expected to exist.
for output_dir in (DATA_CLEAN, DATA_PROCESSED):
    output_dir.mkdir(parents=True, exist_ok=True)

print("DATA_RAW       =", DATA_RAW)
print("DATA_PROCESSED =", DATA_PROCESSED)

DATA_RAW       = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\raw
DATA_PROCESSED = c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed


In [5]:
# Load the raw installments table from the raw-data folder.
INST_PATH = DATA_RAW / "installments_payments.csv"

# Fail early with an explicit exception; an `assert` would be skipped
# when Python runs with optimizations enabled (-O).
if not INST_PATH.exists():
    raise FileNotFoundError(f"Fichier introuvable : {INST_PATH}")

inst = pd.read_csv(INST_PATH)
print("installments_payments:", inst.shape)
inst.info()

# Working copy: later cells add and overwrite columns on `inst_clean`.
inst_clean = inst.copy()

# Preview placed last on purpose: a bare expression is only rendered by the
# notebook when it is the final statement of the cell.
inst.head()

installments_payments: (13605401, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_PREV              int64  
 1   SK_ID_CURR              int64  
 2   NUM_INSTALMENT_VERSION  float64
 3   NUM_INSTALMENT_NUMBER   int64  
 4   DAYS_INSTALMENT         float64
 5   DAYS_ENTRY_PAYMENT      float64
 6   AMT_INSTALMENT          float64
 7   AMT_PAYMENT             float64
dtypes: float64(5), int64(3)
memory usage: 830.4 MB


## Typage

In [6]:
# Identifier columns: coerce to numeric, then store as nullable Int64 so a
# value that fails to parse becomes <NA> instead of raising.
for id_col in ("SK_ID_PREV", "SK_ID_CURR"):
    inst_clean[id_col] = pd.to_numeric(inst_clean[id_col], errors="coerce").astype("Int64")

# Measure columns: coerce to numeric; unparseable values become NaN.
num_to_numeric = [
    "NUM_INSTALMENT_VERSION",
    "NUM_INSTALMENT_NUMBER",
    "DAYS_INSTALMENT",
    "DAYS_ENTRY_PAYMENT",
    "AMT_INSTALMENT",
    "AMT_PAYMENT",
]
for col in num_to_numeric:
    coerced = pd.to_numeric(inst_clean[col], errors="coerce")
    inst_clean[col] = coerced

## Flags NA

In [7]:
# Missing-value indicators (1 = value missing, 0 = present).
# They are derived from the raw frame `inst`, not from `inst_clean`: this
# cell fills two of these columns below, so reading the flags from
# `inst_clean` would turn them into all zeros whenever the cell is re-run.
# `inst_clean` was created as a copy of `inst`, so both share the same index.
na_flag_sources = [
    "AMT_INSTALMENT",
    "AMT_PAYMENT",
    "DAYS_INSTALMENT",
    "DAYS_ENTRY_PAYMENT",
]
for source_col in na_flag_sources:
    # Same coercion as the typing cell, so anything coerced to NaN is flagged.
    raw_values = pd.to_numeric(inst[source_col], errors="coerce")
    inst_clean[f"INST_{source_col}_NA"] = raw_values.isna().astype(int)

# A missing payment amount is treated as "nothing paid".
inst_clean["AMT_PAYMENT"] = inst_clean["AMT_PAYMENT"].fillna(0)

# Baseline option: replace a missing payment day by 0.
# NOTE(review): the filled 0 is used afterwards like a real payment day
# (difference with DAYS_INSTALMENT, min/max per credit), so it affects those
# features; INST_DAYS_ENTRY_PAYMENT_NA keeps track of the affected rows.
inst_clean["DAYS_ENTRY_PAYMENT"] = inst_clean["DAYS_ENTRY_PAYMENT"].fillna(0)

## Features

In [8]:


# Row-level features: one value per installment line.
paid_day = inst_clean["DAYS_ENTRY_PAYMENT"]
due_day = inst_clean["DAYS_INSTALMENT"]
paid_amount = inst_clean["AMT_PAYMENT"]
due_amount = inst_clean["AMT_INSTALMENT"]

# Delay: payment day minus due day (positive = paid after the due day).
inst_clean["INST_DAYSLATE"] = paid_day.sub(due_day)

# Amount paid minus amount due.
inst_clean["INST_PAYMENTDIFF"] = paid_amount.sub(due_amount)

# Paid / due ratio; a due amount of 0 is turned into NaN first so the
# division yields NaN instead of an infinite value.
inst_clean["INST_PAYMENTRATIO"] = paid_amount.div(due_amount.replace(0, np.nan))

# Behavioural 0/1 flags derived from the two differences above.
days_late = inst_clean["INST_DAYSLATE"]
payment_gap = inst_clean["INST_PAYMENTDIFF"]
behaviour_flags = {
    "INST_PAID_LATE": days_late.gt(0),
    "INST_PAID_EARLY": days_late.lt(0),
    "INST_UNDERPAID": payment_gap.lt(0),
    "INST_EXACTPAY": payment_gap.eq(0),
    "INST_OVERPAID": payment_gap.gt(0),
    "INST_ZERO_PAYMENT": paid_amount.eq(0),
}
for flag_name, flag_mask in behaviour_flags.items():
    inst_clean[flag_name] = flag_mask.astype(int)

inst_clean.shape

(13605401, 21)

In [9]:
## Agrégations numériques

In [10]:
# Numeric columns summarised per previous credit (SK_ID_PREV).
num_cols = [
    "AMT_INSTALMENT",
    "AMT_PAYMENT",
    "INST_PAYMENTDIFF",
    "INST_PAYMENTRATIO",
    "DAYS_INSTALMENT",
    "DAYS_ENTRY_PAYMENT",
    "INST_DAYSLATE",
]

# Statistics computed for every column above.
agg_funcs = ["mean", "min", "max", "sum", "std"]

grouped_by_credit = inst_clean.groupby("SK_ID_PREV")
agg_prev_num = grouped_by_credit[num_cols].agg(agg_funcs)

# Flatten the (column, statistic) pairs into INST_<COLUMN>_<STAT> names.
# Columns that already start with INST_ end up with a doubled prefix
# (e.g. INST_INST_DAYSLATE_MEAN); kept as is so output names do not change.
agg_prev_num.columns = [
    "_".join(("INST", source_col, stat_name)).upper()
    for source_col, stat_name in agg_prev_num.columns
]

## Agrégations count

In [11]:
# Per-credit counts: number of installment lines, and how many of them carry
# each behavioural flag (summing a 0/1 flag counts the flagged lines).
count_spec = {
    "INST_COUNT_PAYMENTS": ("SK_ID_PREV", "count"),
    "INST_COUNT_LATE": ("INST_PAID_LATE", "sum"),
    "INST_COUNT_EARLY": ("INST_PAID_EARLY", "sum"),
    "INST_COUNT_UNDERPAID": ("INST_UNDERPAID", "sum"),
    "INST_COUNT_OVERPAID": ("INST_OVERPAID", "sum"),
    "INST_COUNT_ZERO": ("INST_ZERO_PAYMENT", "sum"),
}
agg_prev_counts = inst_clean.groupby("SK_ID_PREV").agg(**count_spec)

## Ratio

In [12]:
# Share of each credit's installment lines that carry a given flag.
# A zero line count is replaced by NaN so the division cannot blow up.
denom = agg_prev_counts["INST_COUNT_PAYMENTS"].replace(0, np.nan)

# Output ratio name -> count column used as numerator.
ratio_sources = {
    "INST_RATIO_LATE": "INST_COUNT_LATE",
    "INST_RATIO_EARLY": "INST_COUNT_EARLY",
    "INST_RATIO_UNDERPAID": "INST_COUNT_UNDERPAID",
    "INST_RATIO_OVERPAID": "INST_COUNT_OVERPAID",
}

# Undefined ratios (NaN) are reported as 0.
agg_prev_ratios = pd.DataFrame(
    {
        ratio_name: agg_prev_counts[count_name].div(denom)
        for ratio_name, count_name in ratio_sources.items()
    },
    index=agg_prev_counts.index,
).fillna(0)

In [13]:
# Per-credit payment timeline: largest and smallest payment day, and the
# span between them.
# Missing payment days were replaced by 0 earlier in the notebook, so such
# rows take part in the max/min below with the value 0.
time_spec = {
    "INST_LASTPAYMENT_DAYS": ("DAYS_ENTRY_PAYMENT", "max"),
    "INST_FIRSTPAYMENT_DAYS": ("DAYS_ENTRY_PAYMENT", "min"),
}
agg_prev_time = inst_clean.groupby("SK_ID_PREV").agg(**time_spec)

agg_prev_time = agg_prev_time.assign(
    INST_PAYMENTSPAN=lambda t: t["INST_LASTPAYMENT_DAYS"] - t["INST_FIRSTPAYMENT_DAYS"]
)



## Fusion

In [14]:
# Assemble the per-credit feature table: start from the numeric statistics
# and left-join the other aggregate tables on the shared SK_ID_PREV index.
inst_prev_agg = agg_prev_num
for feature_block in (agg_prev_counts, agg_prev_ratios, agg_prev_time):
    inst_prev_agg = inst_prev_agg.join(feature_block)

## Flag

In [15]:
# Flag rows for which every aggregated feature is missing.
# The key column and the flag itself are excluded so that re-running this
# cell gives the same result as the first run.
# NOTE(review): every row here comes from a group of installment lines, and
# the count/ratio columns are never NaN for such a group, so this flag is 0
# for all rows of this table. It only becomes informative if it is set after
# joining onto a table that also lists credits without installments.
feature_cols = [
    col_name
    for col_name in inst_prev_agg.columns
    if col_name not in ("SK_ID_PREV", "INST_NO_RECORDS")
]
inst_prev_agg["INST_NO_RECORDS"] = inst_prev_agg[feature_cols].isna().all(axis=1).astype(int)

# Move SK_ID_PREV from the index to a regular column, once only: a second
# reset_index() on re-run would add a spurious "index" column to the export.
if "SK_ID_PREV" not in inst_prev_agg.columns:
    inst_prev_agg = inst_prev_agg.reset_index()

inst_prev_agg.shape

(997752, 50)

## Export

In [16]:
# Write the per-credit feature table to the processed-data folder as CSV,
# without the DataFrame index.
INST_FINAL_PATH = DATA_PROCESSED.joinpath("installments_payments_final.csv")
inst_prev_agg.to_csv(path_or_buf=INST_FINAL_PATH, index=False)
print("Export terminé :", INST_FINAL_PATH)

Export terminé : c:\Users\yoann\Documents\open classrooms\projet 6\pret a depenser v2\data\processed\installments_payments_final.csv
