# Build Light Dataset (LendingClub)

Objectif : créer un dataset allégé (colonnes utiles + échantillon stable) pour exécuter le projet facilement (Docker inclus).


In [3]:
import pandas as pd
import os

os.makedirs("../data/interim", exist_ok=True)

usecols = [
    "loan_status", "loan_amnt", "term", "int_rate", "annual_inc",
    "grade", "addr_state", "dti"
]

df = pd.read_csv("../data/raw/lendingclub.csv", usecols=usecols, low_memory=False)
df.shape


(2260701, 8)

In [4]:
valid_status = ["Fully Paid", "Charged Off", "Default"]
df = df[df["loan_status"].isin(valid_status)].copy()

# Échantillon stable (ex: 250000 lignes). Ajuste si besoin.
n = min(250_000, len(df))
df_light = df.sample(n=n, random_state=42)

df_light["loan_status"].value_counts(normalize=True), df_light.shape


(loan_status
 Fully Paid     0.800412
 Charged Off    0.199560
 Default        0.000028
 Name: proportion, dtype: float64,
 (250000, 8))

In [5]:
out_path = "data/interim/lendingclub_light.csv"
df_light.to_csv(out_path, index=False)
out_path


'data/interim/lendingclub_light.csv'