# End-to-end walkthrough: credit scoring pipeline

Эта тетрадь пошагово повторяет действующий пайплайн скоринга, опираясь только на готовые функции проекта: от загрузки данных до explainability-артефактов.

## План
1. Настройка окружения и импорт модулей.
2. Разбор `configs/default.yaml` и ключевых гиперпараметров.
3. Ingest + master-table и первичный EDA.
4. Временной сплит train/valid/OOT.
5. Feature engineering (`build_features`) и визуализации.
6. Подготовка матриц X/y и отбор признаков.
7. Обучение Champion/Challenger моделей.
8. Метрики качества + ROC/PR/калибровка/лифт.
9. Explainability: коэффициенты, reason codes, SHAP/feature importance.

In [1]:
import os
import sys
import json
from pathlib import Path

import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Locate the repository root: the current working directory itself, or the
# nearest ancestor that contains a ``configs`` folder.
_start_dir = Path.cwd()
PROJECT_ROOT = next(
    (folder for folder in (_start_dir, *_start_dir.parents) if (folder / "configs").exists()),
    None,
)
if PROJECT_ROOT is None:
    raise RuntimeError("Не удалось найти корень проекта с папкой configs/")

# Run from the root and make it importable so the project imports below resolve.
os.chdir(PROJECT_ROOT)
_root_str = str(PROJECT_ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)

from src.utils import load_config, seed_everything
from src.data_loading import load_master_dataset, build_credit_history_features
from src.modeling import (
    split_by_time,
    train_logistic_woe,
    train_catboost,
    train_lgbm,
)
from scripts.train import (
    _augment_with_features,
    _select_feature_columns,
    _align_frames,
    _run_feature_diagnostics,
    _drop_sensitive,
)
from src.metrics import (
    compute_metrics,
    report_to_dict,
    reliability_curve,
    lift_curve,
)
from src.explainability import champion_coefficients
from sklearn.metrics import roc_curve, precision_recall_curve

# Notebook-wide display defaults: wide frames stay readable and all plots share one look.
# NOTE(review): plt.style.use() runs after sns.set_theme(), so where the two
# overlap the matplotlib style sheet presumably wins — confirm this is intended.
pd.set_option("display.max_columns", 120)
sns.set_theme(style="whitegrid")
plt.style.use("seaborn-v0_8")
print(f"Проект: {PROJECT_ROOT}")

Проект: /home/zerotwo/ml-coding-hack


## Шаг 2. Конфигурация пайплайна
Разбираем `configs/default.yaml`: источники данных, таргет, временные границы, параметры feature engineering и моделей.

In [2]:
# Load the default pipeline config and fix the random seed (42 if the config has none).
config_path = PROJECT_ROOT / "configs" / "default.yaml"
config = load_config(config_path)
seed_everything(config.get("seed", 42))
print(f"Используем конфиг: {config_path}")

# Top-level config sections to dump; a missing section is shown as an empty mapping.
_section_names = (
    "paths",
    "data_sources",
    "target",
    "split",
    "feature_engineering",
    "modeling",
    "feature_selection",
    "calibration",
)
sections = {key: config.get(key, {}) for key in _section_names}
for section_name, section_body in sections.items():
    print("====", section_name.upper(), "====")
    print(yaml.safe_dump(section_body, sort_keys=False, allow_unicode=True))

# Output directories are configured relative to the project root; create them up front.
_paths_cfg = config["paths"]
artifacts_dir = (PROJECT_ROOT / _paths_cfg["artifacts_dir"]).resolve()
models_dir = (PROJECT_ROOT / _paths_cfg["models_dir"]).resolve()
for _out_dir in (artifacts_dir, models_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

SyntaxError: unterminated string literal (detected at line 17) (195548128.py, line 17)

## Шаг 3. Ingest и master-table
Построим master-table из всех источников, добавим агрегаты по кредитной истории и посмотрим базовый EDA: размеры, head(), таргет, пропуски и распределения.

In [3]:
# Column names come from the config; the id/application columns fall back to defaults.
target_col = config["target"]["column"]
date_col = config["split"]["date_column"]
id_col = config.get("merging", {}).get("id_col", "customer_ref")
app_col = config["split"].get("application_id_col", "application_id")

# Build the master table. The loader also returns a credit-history frame,
# which may be None or empty; in that case no history aggregates are merged.
master_df, credit_history = load_master_dataset(config)
print(f"Master-table после мерджа справочников: {master_df.shape}")
if credit_history is not None and not credit_history.empty:
    credit_features = build_credit_history_features(credit_history, master_df, config)
    master_df = master_df.merge(credit_features, on=app_col, how="left")
    print(f"Добавили агрегаты кредитной истории: {master_df.shape}")

# Unparseable dates become NaT and are dropped together with rows lacking a target;
# the frame is then ordered chronologically for the time-based split.
master_df[date_col] = pd.to_datetime(master_df[date_col], errors="coerce")
master_df = master_df.dropna(subset=[target_col, date_col]).sort_values(date_col).reset_index(drop=True)
print(f"После фильтра по {target_col}/{date_col}: {master_df.shape}")

display(master_df.head())

# Target distribution: absolute counts and share of all rows.
print("Распределение таргета (count/share):")
target_stats = master_df[target_col].value_counts(dropna=False).to_frame("count")
target_stats["share"] = target_stats["count"] / len(master_df)
display(target_stats)

# Missing-value counts for the key columns that are actually present.
key_cols = [id_col, app_col, target_col, date_col, "loan_amount", "annual_income", "credit_utilization", "debt_to_income_ratio"]
missing_info = {
    col: int(master_df[col].isna().sum())
    for col in key_cols
    if col in master_df.columns
}
display(pd.DataFrame.from_dict(missing_info, orient="index", columns=["missing_rows"]))

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.countplot(x=target_col, data=master_df, ax=axes[0])
axes[0].set_title("Таргет 0/1")
axes[0].set_xlabel("default")
# Use an explicit MonthEnd offset instead of the "M" string alias: the alias is
# deprecated since pandas 2.2 (renamed to "ME"), while the offset object yields
# the same month-end bins on both old and new pandas versions.
monthly = master_df.set_index(date_col).resample(pd.offsets.MonthEnd()).size()
axes[1].plot(monthly.index, monthly.values)
axes[1].set_title("Количество заявок по месяцам")
axes[1].set_ylabel("# заявок")
plt.tight_layout()
plt.show()

# Histograms of a few core numeric features on a capped random sample (at most 5000 rows).
feature_candidates = ["loan_amount", "annual_income", "credit_utilization", "debt_to_income_ratio"]
available = [col for col in feature_candidates if col in master_df.columns]
if available:
    sample = master_df[available].sample(min(len(master_df), 5000), random_state=42)
    fig, axes = plt.subplots(1, len(available), figsize=(4 * len(available), 3))
    if len(available) == 1:
        # plt.subplots returns a bare Axes for a 1x1 grid; wrap it so zip() below works.
        axes = [axes]
    for ax, col in zip(axes, available):
        sns.histplot(sample[col], bins=30, ax=ax, kde=False)
        ax.set_title(col)
    plt.tight_layout()
    plt.show()

NameError: name 'config' is not defined

## Шаг 4. Временной сплит
Используем `split_by_time` из `src.modeling` и границы из конфига, чтобы получить train/valid/OOT. Проверим размеры, диапазоны дат и долю дефолтов для каждого среза.

In [None]:
# Split boundaries are taken straight from the config.
train_end = config["split"]["train_end"]
valid_end = config["split"]["valid_end"]
raw_splits = split_by_time(master_df, date_col, train_end, valid_end)

# One summary row per non-empty split: size, covered date range and mean of the target.
summary = [
    {
        "split": split_name,
        "rows": len(split_frame),
        "date_start": split_frame[date_col].min(),
        "date_end": split_frame[date_col].max(),
        "default_rate": split_frame[target_col].mean(),
    }
    for split_name, split_frame in raw_splits.items()
    if not split_frame.empty
]
display(pd.DataFrame(summary))

## Шаг 5. Feature engineering
Воспользуемся `_augment_with_features`, который внутри вызывает `build_features` и добавляет ratio/rolling/trend/behavioral признаки. Покажем, сколько новых фич появилось и как они распределены.

In [None]:
feature_cfg = config.get("feature_engineering", {})
client_id_col = config["split"].get("client_id_col")
MAX_ROWS_PER_SPLIT = None  # optionally cap the number of rows per split for quick debugging

# Work on copies so the raw splits stay untouched; apply the optional row cap.
processed_splits = {
    name: (frame.iloc[:MAX_ROWS_PER_SPLIT] if MAX_ROWS_PER_SPLIT else frame).copy()
    for name, frame in raw_splits.items()
}

# Augment every split and record how many columns the feature step added.
augmented_splits = {}
feature_stats = []
for name, frame in processed_splits.items():
    augmented = _augment_with_features(frame, feature_cfg, date_col, client_id_col)
    augmented_splits[name] = augmented
    engineered_cols = [col for col in augmented.columns if col not in frame.columns]
    feature_stats.append(
        {
            "split": name,
            "rows": len(augmented),
            "baseline_cols": len(frame.columns),
            "engineered_cols": len(engineered_cols),
            "total_cols": len(augmented.columns),
        }
    )
display(pd.DataFrame(feature_stats))

# Columns present after augmentation but absent from the raw train split.
train_engineered = [col for col in augmented_splits["train"].columns if col not in raw_splits["train"].columns]
print(f"Новых признаков в train: {len(train_engineered)}")
display(augmented_splits["train"][train_engineered[:10]].head())

stat_cols = train_engineered[:8]
if stat_cols:
    display(augmented_splits["train"][stat_cols].describe().T)

# Prefer the features ranked in the stored feature_strength.json artifact (if it
# exists); otherwise fall back to the first engineered columns.
feature_strength_path = artifacts_dir / "feature_strength.json"
feature_strength = {}
if feature_strength_path.exists():
    with open(feature_strength_path, "r", encoding="utf-8") as fh:
        feature_strength = json.load(fh)
    top_auc_cols = [item["feature"] for item in feature_strength.get("top_auc", []) if item["feature"] in augmented_splits["train"].columns][:6]
else:
    top_auc_cols = train_engineered[:6]

if top_auc_cols:
    corr_sample = augmented_splits["train"][top_auc_cols].sample(min(len(augmented_splits["train"]), 5000), random_state=42)
    # numeric_only=True: since pandas 2.0 DataFrame.corr() no longer drops
    # non-numeric columns silently and raises on them instead.
    corr = corr_sample.corr(numeric_only=True)
    # Nothing to draw when none of the selected columns is numeric.
    if not corr.empty:
        plt.figure(figsize=(6, 5))
        sns.heatmap(corr, annot=True, cmap="crest", fmt=".2f")
        plt.title("Корреляции топовых engineered фич")
        plt.show()

## Шаг 6. Матрицы X/y и отбор фич
Повторяем продакшн-логику: отфильтровываем признаки с потенциальной утечкой таргета, выравниваем срезы, запускаем `_run_feature_diagnostics`, убираем подозрительные и чувствительные столбцы.

In [None]:
# Identifier-like columns must never reach the model: the two id columns plus
# anything listed under merging.id_like_cols, de-duplicated with order preserved.
id_like = config.get("merging", {}).get("id_like_cols", [])
forbidden_ids = list(dict.fromkeys(col for col in [id_col, app_col, *id_like] if col))

safe_columns = _select_feature_columns(augmented_splits, target_col, date_col, forbidden_ids)
aligned_splits = _align_frames(augmented_splits, safe_columns)

# Feature diagnostics are run on train vs. valid.
diagnostics = _run_feature_diagnostics(
    aligned_splits["train"],
    aligned_splits["valid"],
    target_col,
    date_col,
    artifacts_dir,
    config.get("feature_selection", {}),
)

# Remove the columns the diagnostics asked to drop from every split.
drop_candidates = diagnostics.get("drop_columns", [])
if drop_candidates:
    for split_name, split_frame in aligned_splits.items():
        aligned_splits[split_name] = split_frame.drop(columns=drop_candidates, errors="ignore")

# Remove the sensitive columns listed in the fairness section, if any.
sensitive_cols = config.get("fairness", {}).get("sensitive_cols", [])
if sensitive_cols:
    for split_name, split_frame in aligned_splits.items():
        aligned_splits[split_name] = _drop_sensitive(split_frame, sensitive_cols)

# Independent copies of the final modelling frames.
train_df = aligned_splits["train"].copy()
valid_df = aligned_splits["valid"].copy()
oot_df = aligned_splits["oot"].copy()

model_features = [col for col in train_df.columns if col != target_col]
print(f"Финальный train shape: {train_df.shape}, число фич без таргета: {len(model_features)}")

if feature_strength:
    top_auc = pd.DataFrame(feature_strength.get("top_auc", [])[:10])
    display(top_auc)

# Print a truncated (first 1000 characters) JSON view of the diagnostic buckets.
print("Диагностика single-feature:")
_diag_keys = ["suspicious", "low_variance", "weak_auc"]
_diag_view = {key: diagnostics.get(key) for key in _diag_keys}
print(json.dumps(_diag_view, indent=2)[:1000])

## Шаг 7. Champion / Challenger
Обучаем логистическую регрессию с WOE (champion) и градиентный бустинг на деревьях (CatBoost, LightGBM) теми же функциями, что и в продакшне. Сравниваем валидационные метрики.

In [None]:
model_cfg = config.get("modeling", {})
cv_folds = model_cfg.get("cv_folds", 1)
# Pass the client id as a grouping column only when it is configured and still in train_df.
group_col = client_id_col if client_id_col and client_id_col in train_df.columns else None

# Champion: logistic regression on WOE features.
champion_model = train_logistic_woe(train_df, valid_df, target_col, model_cfg.get("logistic", {}))
# Challengers share one call signature; only the trainer and its config section differ.
cat_model = train_catboost(
    train_df,
    valid_df,
    target_col,
    model_cfg.get("catboost", {}),
    date_col,
    group_col,
    cv_folds,
)
lgbm_model = train_lgbm(
    train_df,
    valid_df,
    target_col,
    model_cfg.get("lightgbm", {}),
    date_col,
    group_col,
    cv_folds,
)

# A trainer may hand back None, so keep only the models that were actually built.
trained_models = [m for m in [champion_model, cat_model, lgbm_model] if m is not None]
metrics_table = pd.DataFrame(
    [
        {"model": m.name, **m.metrics}
        for m in trained_models
    ]
)
display(metrics_table)

# Best challenger = highest ROC-AUC among the non-champion models.
challengers = [m for m in trained_models if m.name != "logistic_woe"]
best_challenger = max(challengers, key=lambda m: m.metrics.get("roc_auc", 0.0)) if challengers else None
# Explicit None check (not truthiness), matching how later cells test best_challenger.
if best_challenger is not None:
    # Read the metric the same tolerant way the selection above does, so a
    # model without "roc_auc" prints nan instead of raising KeyError.
    best_auc = best_challenger.metrics.get("roc_auc", float("nan"))
    print(f"Лучший challenger: {best_challenger.name} (ROC-AUC={best_auc:.4f})")
else:
    print("Челленджеры недоступны (catboost/lightgbm не установлены)")

## Шаг 8. Метрики и визуализации
Считаем ROC/PR/калибровку/лифт по champion и лучшему challenger (если он есть). Параллельно загружаем готовые `artifacts/metrics.json`, чтобы сравнить с тренировочным пайплайном.

In [None]:
# Ground-truth labels for the two evaluation slices.
valid_y = valid_df[target_col].values
oot_y = oot_df[target_col].values

# Score every trained model on valid and OOT; collect one wide metrics row per model.
predictions = {}
eval_rows = []
for model in trained_models:
    scores = {"valid": model.predict_proba(valid_df), "oot": model.predict_proba(oot_df)}
    predictions[model.name] = scores
    row = {"model": model.name}
    for split_name, y_true in (("valid", valid_y), ("oot", oot_y)):
        report = report_to_dict(compute_metrics(y_true, scores[split_name]))
        row.update({f"{split_name}_{metric}": value for metric, value in report.items()})
    eval_rows.append(row)

eval_df = pd.DataFrame(eval_rows)
display(eval_df)

# Show the metrics stored by the training pipeline, when the artifact exists.
metrics_path = artifacts_dir / "metrics.json"
if metrics_path.exists():
    with open(metrics_path, "r", encoding="utf-8") as metrics_file:
        stored_metrics = json.load(metrics_file)
    print("Метрики, сохранённые train-пайплайном:")
    display(stored_metrics)

# Curves are drawn for the champion and, if there is one, the best challenger.
plot_models = [champion_model.name] + ([best_challenger.name] if best_challenger is not None else [])

# ROC curves on the validation slice.
fig, ax = plt.subplots(figsize=(6, 5))
for name in plot_models:
    fpr, tpr, _ = roc_curve(valid_y, predictions[name]["valid"])
    auc_value = eval_df.loc[eval_df["model"].eq(name), "valid_roc_auc"].iloc[0]
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc_value:.3f})")
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_title("ROC-кривые (valid)")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.legend()
plt.show()

# Precision-recall curves on the validation slice.
fig, ax = plt.subplots(figsize=(6, 5))
for name in plot_models:
    precision, recall, _ = precision_recall_curve(valid_y, predictions[name]["valid"])
    ax.plot(recall, precision, label=name)
ax.set_title("PR-кривые (valid)")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.legend()
plt.show()

# Champion reliability curve: valid vs. OOT against the diagonal.
champ_name = champion_model.name
champ_scores = predictions[champ_name]
rel_valid = reliability_curve(valid_y, champ_scores["valid"], n_bins=10)
rel_oot = reliability_curve(oot_y, champ_scores["oot"], n_bins=10)
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(rel_valid["pred"], rel_valid["true"], marker="o", label="valid")
ax.plot(rel_oot["pred"], rel_oot["true"], marker="s", label="oot")
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_title("Reliability curve: champion")
ax.set_xlabel("Predicted PD")
ax.set_ylabel("Observed PD")
ax.legend()
plt.show()

# Champion lift/gain on valid, one row per bin numbered from 1.
lift_valid = lift_curve(valid_y, champ_scores["valid"], n_bins=10)
decile_ids = range(1, len(lift_valid["lift"]) + 1)
lift_df = pd.DataFrame({"decile": decile_ids, "lift": lift_valid["lift"], "gain": lift_valid["gain"]})
display(lift_df.head(10))
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(lift_df["decile"], lift_df["lift"], marker="o")
ax.set_title("Lift@децили (champion, valid)")
ax.set_xlabel("Дециль")
ax.set_ylabel("Lift")
plt.show()

## Шаг 9. Explainability
Используем готовые инструменты: коэффициенты champion (и reason codes из артефакта), а также SHAP/feature importance challenger’а из `artifacts/challenger_shap.json`.

In [None]:
def _feature_barplot(frame, value_col, palette, title):
    """Draw a horizontal bar chart of ``value_col`` per ``feature``, one colour per bar.

    ``hue`` is set to the same column as ``y`` because passing ``palette``
    without ``hue`` is deprecated since seaborn 0.13. ``dodge=False`` keeps the
    bars full-width, and any auto-generated (redundant) legend is removed.
    """
    plt.figure(figsize=(6, 5))
    ax = sns.barplot(
        data=frame,
        x=value_col,
        y="feature",
        hue="feature",
        palette=palette,
        dodge=False,
    )
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.title(title)
    plt.show()


# Champion coefficients ranked by absolute value; the top 15 are shown and plotted.
champion_coefs = pd.DataFrame(champion_coefficients(champion_model))
champion_coefs["abs_coef"] = champion_coefs["coefficient"].abs()
champion_top = champion_coefs.sort_values("abs_coef", ascending=False).head(15)
display(champion_top)

_feature_barplot(champion_top, "coefficient", "viridis", "Champion: топ коэффициенты (WOE space)")

# Reason codes are read from the stored artifact, when it exists.
champion_explain_path = artifacts_dir / "champion_explainability.json"
if champion_explain_path.exists():
    with open(champion_explain_path, "r", encoding="utf-8") as fh:
        champion_explain = json.load(fh)
    print("Reason codes sample:")
    display(champion_explain.get("reason_codes", [])[:5])

# Challenger importances are read from the stored SHAP artifact, when it exists.
shap_path = artifacts_dir / "challenger_shap.json"
if shap_path.exists():
    with open(shap_path, "r", encoding="utf-8") as fh:
        shap_payload = json.load(fh)
    shap_df = pd.DataFrame(shap_payload.get("feature_importance", [])).head(15)
    if not shap_df.empty:
        display(shap_df)
        # Plot the first numeric column other than "feature"; skip the chart
        # (the table above is still shown) when the payload has no such column
        # or no "feature" column, instead of failing with IndexError/KeyError.
        numeric_cols = [col for col in shap_df.select_dtypes("number").columns if col != "feature"]
        if numeric_cols and "feature" in shap_df.columns:
            value_col = numeric_cols[0]
            _feature_barplot(shap_df, value_col, "mako", "Challenger: SHAP/feature importance")
else:
    print("SHAP артефакт не найден — возможно, challenger не обучался в предыдущем run")