In [None]:
%%configure

{
    "conf": {
        "spark.driver.maxResultSize": "8g",
        "spark.driver.memory": "54g",
        "spark.driver.cores": 8,
        "spark.executor.instances": 0,
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.sql.execution.arrow.pyspark.selfDestruct.enabled": "true"
    }
}

# Modelo Baseline de Risco – Telecom (v3 — Spark Read + Pandas Modeling)

Este notebook implementa um **modelo baseline de risco de inadimplencia (FPD)** para clientes Claro.

### Diferencas em relacao ao v1 e v2
- **v1**: PySpark para limpeza + Pandas para modelagem (original)
- **v2**: Python puro com `deltalake` (delta-rs) — falha no Fabric por falta de Spark context para MLflow
- **v3 (este)**: Usa `spark.read` para carregar dados, converte para Pandas, e segue com modelagem 100% Pandas
- Codigo limpo e organizado do v2, compatibilidade Fabric do v1

### Otimizacao de Memoria (toPandas)
O dataset completo (3.9M x 402 cols) gera ~8.8 GiB serializado, excedendo `spark.driver.maxResultSize`.
Estrategia de 5 camadas aplicada:
1. **Arrow + selfDestruct** — serializacao 2-3x mais eficiente
2. **Drop colunas audit/leakage no Spark** — menos dados para serializar
3. **FLAG_INSTALACAO == 1 filtrado no Spark** — reduz ~30-40% das linhas antes do toPandas
4. **Cast Double→Float, Long→Int no Spark** — metade do tamanho numerico
5. **Conversao chunked por SAFRA** + gc.collect() entre chunks (~1 GiB por chunk)

### Principais etapas
- Leitura do Gold Feature Store via `spark.read.format("delta")`
- Amostragem de **25%** por safra e FPD, de forma estratificada
- Separacao em **Treino (2024-10 a 2024-12)**, **Validacao (2025-01)**, **OOS** e **OOT (2025-02 e 2025-03)**
- Modelos: Logistic Regression (L1) + LightGBM (GBDT)
- Ajuste de hiperparametros no conjunto de validacao
- Metricas: AUC, KS, Gini
- Analise incremental de KS por fonte de dados
- Feature selection: remocao de alta correlacao (limpeza) + SHAP TreeExplainer (ranking por mean(|SHAP|), features que acumulam 90% da importancia)
- Swap analysis OOT1 vs OOT2
- Export via MLflow Registry

In [None]:
# =============================================================================
# 1. IMPORTS E CONFIGURACAO
# =============================================================================
import pandas as pd
import numpy as np
import re
import gc
import shap
import pickle
import mlflow
import mlflow.sklearn
import matplotlib
from matplotlib.patches import Patch
%matplotlib inline
import matplotlib.pyplot as plt
import warnings

from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, IntegerType, DoubleType, LongType

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from lightgbm import LGBMClassifier
from category_encoders import CountEncoder

warnings.filterwarnings('ignore')

# Config centralizado do pipeline
import sys; sys.path.insert(0, '/lakehouse/default/Files/projeto-final')
from config.pipeline_config import (
    PATH_FEATURE_STORE, EXPERIMENT_NAME, SAFRAS,
    LEAKAGE_BLACKLIST, TARGET_COLUMNS
)

print('Imports OK')

In [None]:
# =============================================================================
# 2. MLFLOW SETUP
# =============================================================================
# Activate the experiment whose name comes from the shared pipeline config.
mlflow.set_experiment(EXPERIMENT_NAME)
# Autologging is disabled: params, metrics and models are logged explicitly
# in the evaluation cells (manual control to avoid conflicts).
mlflow.autolog(disable=True)

# Echo the active experiment and tracking URI so the run can be located later.
print(f'MLflow experiment: {EXPERIMENT_NAME}')
print(f'Tracking URI: {mlflow.get_tracking_uri()}')

In [None]:
# =============================================================================
# 3. OPTIMIZED READ OF THE GOLD FEATURE STORE (Spark -> Pandas)
# =============================================================================
# Problem: a direct toPandas() on 3.9M x 402 cols produces ~8.8 GiB serialized,
# exceeding spark.driver.maxResultSize (8 GiB).
#
# Optimization strategy (5 layers):
#   1. Arrow + selfDestruct - serialization 2-3x more efficient
#   2. Drop audit/leakage columns in Spark - less data to serialize
#   3. Filter FLAG_INSTALACAO == 1 in Spark - removes ~30-40% of the rows
#   4. Cast Double->Float, Long->Int via .select() - half the numeric size
#   5. Chunked conversion per SAFRA + gc.collect() between chunks

# ---- 3.1 Enable Arrow for toPandas ----
# Same two settings as the %%configure cell, re-applied at session level.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.selfDestruct.enabled", "true")

print(f'Lendo feature store de: {PATH_FEATURE_STORE}\n')

df_spark = spark.read.format("delta").load(PATH_FEATURE_STORE)
# Full-table count; reused below to derive the post-filter row count.
n_original = df_spark.count()
print(f'Original: {n_original:,} rows x {len(df_spark.columns)} cols')

# ---- 3.2 Drop audit + leakage columns in Spark ----
cols_audit = ['_execution_id', '_data_inclusao', '_data_alteracao_silver', 'DT_PROCESSAMENTO']
# Only columns that actually exist are dropped, so a missing name is not an error.
cols_drop = [c for c in cols_audit + LEAKAGE_BLACKLIST if c in df_spark.columns]
if cols_drop:
    df_spark = df_spark.drop(*cols_drop)
    print(f'Drop {len(cols_drop)} colunas (audit+leakage): {cols_drop}')

# ---- 3.3 Filter FLAG_INSTALACAO == 1 in Spark (BEFORE toPandas) ----
# Filtering here (rather than in Pandas) significantly reduces the volume.
# NOTE(review): n_pos is derived as n_original - n_reprovados. Rows whose
# FLAG_INSTALACAO is null are not counted as "reprovados" yet are also removed
# by the == 1 filter, so n_pos would be overstated - confirm the flag is never null.
n_reprovados = 0
if 'FLAG_INSTALACAO' in df_spark.columns:
    n_reprovados = df_spark.filter(F.col('FLAG_INSTALACAO') == 0).count()
    df_spark = df_spark.filter(F.col('FLAG_INSTALACAO') == 1).drop('FLAG_INSTALACAO')
    n_pos = n_original - n_reprovados  # avoids a redundant .count()
    print(f'FLAG_INSTALACAO: {n_original:,} -> {n_pos:,} pos-instalacao ({n_reprovados:,} reprovados removidos)')
else:
    n_pos = n_original

# ---- 3.4 Cast types via .select() (flat plan, avoids a deep Spark plan) ----
# Sequential withColumn over 300+ columns builds a deep tree in Catalyst.
# .select() with a list of expressions yields a flat plan (1 node).
# NOTE(review): both casts are narrowing. Confirm that no Long column (IDs,
# counters) exceeds the 32-bit integer range and that no Double column needs
# more than float32 precision; every Double/Long column is cast, keys included.
cast_exprs = []
n_double, n_long = 0, 0
for field in df_spark.schema.fields:
    if isinstance(field.dataType, DoubleType):
        cast_exprs.append(F.col(field.name).cast(FloatType()).alias(field.name))
        n_double += 1
    elif isinstance(field.dataType, LongType):
        cast_exprs.append(F.col(field.name).cast(IntegerType()).alias(field.name))
        n_long += 1
    else:
        # Every other type (strings, dates, ...) is passed through unchanged.
        cast_exprs.append(F.col(field.name))
df_spark = df_spark.select(*cast_exprs)

print(f'Cast tipos (.select flat): {n_double} Double->Float, {n_long} Long->Int')

# ---- 3.5 Chunked conversion per SAFRA ----
# The distinct SAFRA values are collected to the driver and used as chunk keys.
safras_disponiveis = sorted([row.SAFRA for row in df_spark.select('SAFRA').distinct().collect()])
print(f'\nSAFRAs: {safras_disponiveis} | Colunas: {len(df_spark.columns)}')
print('Convertendo por SAFRA...')

chunks = []
for safra in safras_disponiveis:
    # One toPandas() per SAFRA keeps each serialized result well below the driver limit.
    chunk = df_spark.filter(F.col('SAFRA') == safra).toPandas()
    mem_mb = chunk.memory_usage(deep=True).sum() / 1e6
    print(f'  SAFRA {safra}: {len(chunk):,} rows | {mem_mb:.0f} MB')
    chunks.append(chunk)
    gc.collect()

# ignore_index=True gives the combined frame a fresh, unique RangeIndex.
df = pd.concat(chunks, ignore_index=True)
del chunks
gc.collect()

print(f'\nDataset carregado (apenas pos-instalacao):')
print(f'  Shape: {df.shape}')
print(f'  Memory: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB')
print(f'  Reprovados (nao carregados): {n_reprovados:,}')

## 4. Limpeza de Dados (Pandas puro)

In [None]:
# =============================================================================
# 4.1 FUNCOES DE LIMPEZA
# =============================================================================

def clean_empty_keys(df):
    """Return ``df`` without the rows whose NUM_CPF or SAFRA key is missing."""
    key_columns = ['NUM_CPF', 'SAFRA']
    rows_with_keys = df.dropna(subset=key_columns)
    return rows_with_keys


def convert_cep3_uf_regiao(df):
    """Replace ``CEP_3_digitos`` with ``UF`` and ``REGIAO`` columns.

    The first two digits of the CEP prefix are looked up in a static table;
    prefixes that are missing from the table (or missing values) map to
    ``'OUTROS'`` in both output columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``CEP_3_digitos`` and with ``UF`` / ``REGIAO`` appended.
        If ``CEP_3_digitos`` is absent, ``df`` itself is returned unchanged.
    """
    # NOTE(review): prefixes 10-19 and 25-28 are not in this table, so those
    # rows fall into 'OUTROS' - confirm the table against the official CEP ranges.
    cep_map = {
        '01': ('SP', 'SUDESTE'), '02': ('SP', 'SUDESTE'), '03': ('SP', 'SUDESTE'),
        '04': ('SP', 'SUDESTE'), '05': ('SP', 'SUDESTE'), '06': ('SP', 'SUDESTE'),
        '07': ('SP', 'SUDESTE'), '08': ('SP', 'SUDESTE'), '09': ('SP', 'SUDESTE'),
        '20': ('RJ', 'SUDESTE'), '21': ('RJ', 'SUDESTE'), '22': ('RJ', 'SUDESTE'),
        '23': ('RJ', 'SUDESTE'), '24': ('RJ', 'SUDESTE'),
        '29': ('ES', 'SUDESTE'),
        '30': ('MG', 'SUDESTE'), '31': ('MG', 'SUDESTE'), '32': ('MG', 'SUDESTE'),
        '33': ('MG', 'SUDESTE'), '34': ('MG', 'SUDESTE'), '35': ('MG', 'SUDESTE'),
        '36': ('MG', 'SUDESTE'), '37': ('MG', 'SUDESTE'), '38': ('MG', 'SUDESTE'),
        '39': ('MG', 'SUDESTE'),
        '40': ('BA', 'NORDESTE'), '41': ('BA', 'NORDESTE'), '42': ('BA', 'NORDESTE'),
        '43': ('BA', 'NORDESTE'), '44': ('BA', 'NORDESTE'), '45': ('BA', 'NORDESTE'),
        '46': ('BA', 'NORDESTE'), '47': ('BA', 'NORDESTE'), '48': ('BA', 'NORDESTE'),
        '49': ('SE', 'NORDESTE'),
        '50': ('PE', 'NORDESTE'), '51': ('PE', 'NORDESTE'), '52': ('PE', 'NORDESTE'),
        '53': ('PE', 'NORDESTE'), '54': ('PE', 'NORDESTE'), '55': ('PE', 'NORDESTE'),
        '56': ('AL', 'NORDESTE'), '57': ('AL', 'NORDESTE'),
        '58': ('PB', 'NORDESTE'), '59': ('RN', 'NORDESTE'),
        '60': ('CE', 'NORDESTE'), '61': ('CE', 'NORDESTE'), '62': ('CE', 'NORDESTE'),
        '63': ('PI', 'NORDESTE'), '64': ('PI', 'NORDESTE'),
        '65': ('MA', 'NORDESTE'),
        '66': ('PA', 'NORTE'), '67': ('PA', 'NORTE'),
        '68': ('AC', 'NORTE'), '69': ('AM', 'NORTE'),
        '77': ('TO', 'NORTE'),
        '70': ('DF', 'CENTRO-OESTE'), '71': ('DF', 'CENTRO-OESTE'),
        '72': ('GO', 'CENTRO-OESTE'), '73': ('GO', 'CENTRO-OESTE'),
        '74': ('GO', 'CENTRO-OESTE'), '75': ('GO', 'CENTRO-OESTE'),
        '76': ('GO', 'CENTRO-OESTE'),
        '78': ('MT', 'CENTRO-OESTE'), '79': ('MS', 'CENTRO-OESTE'),
        '80': ('PR', 'SUL'), '81': ('PR', 'SUL'), '82': ('PR', 'SUL'),
        '83': ('PR', 'SUL'), '84': ('PR', 'SUL'), '85': ('PR', 'SUL'),
        '86': ('PR', 'SUL'), '87': ('PR', 'SUL'),
        '88': ('SC', 'SUL'), '89': ('SC', 'SUL'),
        '90': ('RS', 'SUL'), '91': ('RS', 'SUL'), '92': ('RS', 'SUL'),
        '93': ('RS', 'SUL'), '94': ('RS', 'SUL'), '95': ('RS', 'SUL'),
        '96': ('RS', 'SUL'), '97': ('RS', 'SUL'), '98': ('RS', 'SUL'),
        '99': ('RS', 'SUL'),
    }
    if 'CEP_3_digitos' not in df.columns:
        return df

    cep = df['CEP_3_digitos']
    if pd.api.types.is_numeric_dtype(cep):
        # A numerically stored 3-digit prefix has lost its leading zeros
        # (12 means '012'); restore them before taking the first two digits.
        cep_text = cep.round().astype('Int64').astype(str).str.zfill(3)
    else:
        cep_text = cep.astype(str)
    prefix = cep_text.str[:2]

    # Two flat lookup tables allow a vectorised map instead of a per-row lambda.
    uf_by_prefix = {code: pair[0] for code, pair in cep_map.items()}
    regiao_by_prefix = {code: pair[1] for code, pair in cep_map.items()}

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    out = df.drop(columns=['CEP_3_digitos'])
    out['UF'] = prefix.map(uf_by_prefix).fillna('OUTROS')
    out['REGIAO'] = prefix.map(regiao_by_prefix).fillna('OUTROS')
    return out


def adjust_and_drop_date_cols(df):
    """Derive day-difference features from date columns, then drop all dates.

    The reference date of each row is the first day of its SAFRA (YYYYMM).
    When present, ``var_12`` (format dd/mm/YYYY) yields ``DIAS_VAR_12`` and
    ``PAG_DT_PRIMEIRA_FATURA`` yields ``PAG_DIAS_DESDE_PRIMEIRA_FATURA``; both
    are ``reference - date`` in days, with unparseable dates giving a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SAFRA``. It is NOT modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        ``df`` without any datetime column (including the two source date
        columns) and with the derived day-difference columns appended.

    Raises
    ------
    ValueError
        If a SAFRA value cannot be interpreted as YYYYMM.
    """
    # SAFRA may arrive as int, float (202410.0) or string; normalising through an
    # integer avoids the '202410.0' text that the '%Y%m' format cannot parse.
    safra_int = pd.to_numeric(df['SAFRA']).astype('Int64')
    safra_ref = pd.to_datetime(safra_int.astype(str), format='%Y%m')

    # Columns that are already datetimes are dropped, as are the two source columns.
    cols_to_drop = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
    derived = {}

    if 'var_12' in df.columns:
        var_12 = pd.to_datetime(df['var_12'], format='%d/%m/%Y', errors='coerce')
        derived['DIAS_VAR_12'] = (safra_ref - var_12).dt.days
        cols_to_drop.append('var_12')

    if 'PAG_DT_PRIMEIRA_FATURA' in df.columns:
        first_invoice = pd.to_datetime(df['PAG_DT_PRIMEIRA_FATURA'], errors='coerce')
        derived['PAG_DIAS_DESDE_PRIMEIRA_FATURA'] = (safra_ref - first_invoice).dt.days
        cols_to_drop.append('PAG_DT_PRIMEIRA_FATURA')

    # A pre-existing helper column with this name is not a feature either.
    cols_to_drop.append('DATA_REF_SAFRA')

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    out = df.drop(columns=[c for c in dict.fromkeys(cols_to_drop) if c in df.columns])
    for col_name, values in derived.items():
        out[col_name] = values
    return out


def remove_high_missing(df, threshold=0.75):
    """Drop every column whose share of missing values is at least ``threshold``.

    Prints how many columns were removed and returns the reduced frame.
    """
    missing_share = df.isna().mean()
    cols_to_drop = [col for col, share in missing_share.items() if share >= threshold]
    print(f'  High missing (>= {threshold:.0%}): {len(cols_to_drop)} colunas removidas')
    return df.drop(columns=cols_to_drop)


def remove_low_cardinality(df):
    """Drop columns that hold at most one distinct non-missing value.

    Missing values are not counted as a value, so an all-missing column and a
    column with one value plus missing entries are both removed.
    """
    distinct_counts = df.nunique()
    low_card = distinct_counts[distinct_counts <= 1].index.tolist()
    print(f'  Low cardinality (== 1): {len(low_card)} colunas removidas')
    return df.drop(columns=low_card)


def remove_high_correlation(df, threshold=0.8, safras_train=None,
                            sample_frac=0.25, random_state=42):
    """Greedily drop numeric columns whose absolute correlation reaches ``threshold``.

    Correlations are estimated on a stratified sample (per SAFRA and FPD) of
    the rows selected by ``safras_train``. At each step the column with the
    largest remaining pairwise correlation is dropped, until no remaining pair
    reaches ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SAFRA`` and ``FPD``.
    threshold : float
        A column is dropped while some pair has ``|corr| >= threshold``.
    safras_train : list or None
        SAFRA values used to estimate correlations; ``None`` uses every row.
    sample_frac : float
        Fraction of each (SAFRA, FPD) group used for the estimate.
    random_state : int
        Seed applied to the sampling of every group.

    Returns
    -------
    pandas.DataFrame
        ``df`` (all rows) without the dropped columns.
    """
    if safras_train is not None:
        df_corr_base = df[df['SAFRA'].isin(safras_train)]
    else:
        df_corr_base = df

    # Sample each group explicitly: same draws as groupby.apply(sample), without
    # relying on apply() operating on the grouping columns (deprecated in pandas 2.2).
    sampled_groups = [
        grp.sample(frac=sample_frac, random_state=random_state)
        for _, grp in df_corr_base.groupby(['SAFRA', 'FPD'])
    ]
    df_sample = pd.concat(sampled_groups) if sampled_groups else df_corr_base

    # NOTE(review): every numeric column except FPD takes part, SAFRA included -
    # confirm that key columns are meant to be drop candidates.
    num_cols = df_sample.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns
    num_cols = [c for c in num_cols if c != 'FPD']

    corr_matrix = df_sample[num_cols].corr().abs()
    # Keep the strict upper triangle so that each pair is considered once.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = []
    while not upper.empty:
        col_max = upper.max()
        max_corr = col_max.max()
        # Written as "not >=" so that NaN (no valid pair left, e.g. constant
        # columns or a single remaining column) also ends the loop.
        if not (max_corr >= threshold):
            break
        col_to_drop = col_max.sort_values(ascending=False).index[0]
        to_drop.append(col_to_drop)
        upper = upper.drop(index=col_to_drop, columns=col_to_drop)

    print(f'  High correlation (> {threshold}): {len(to_drop)} colunas removidas')
    return df.drop(columns=to_drop)


def remove_misused_columns(df):
    """Drop the fixed list of known leakage / duplicate columns that are present.

    Prints the removed names when at least one of them exists in ``df``.
    """
    existing = []
    for candidate in ('PROD', 'flag_mig2', 'FAT_VLR_FPD', 'FAT_FLAG_MIG2_AQUISICAO'):
        if candidate in df.columns:
            existing.append(candidate)
    if len(existing) > 0:
        print(f'  Misused columns removed: {existing}')
    return df.drop(columns=existing, errors='ignore')

In [None]:
# =============================================================================
# 4.2 APPLY THE CLEANING STEPS
# =============================================================================
# First four configured safras; they restrict the correlation filter below.
# NOTE(review): this slice includes the validation safra (202501), while
# remove_high_correlation documents "training safras only" - confirm intent.
safras_train_val = SAFRAS[:4]  # 202410, 202411, 202412, 202501

print('Aplicando limpezas...')
print(f'Shape original: {df.shape}')

# The order matters: UF/REGIAO and the day-difference features are created
# before the missing / cardinality / correlation filters evaluate the columns.
# NOTE(review): remove_high_missing and remove_low_cardinality are computed on
# every row of df, i.e. including the out-of-time safras - confirm this is acceptable.
df = clean_empty_keys(df)
df = convert_cep3_uf_regiao(df)
df = adjust_and_drop_date_cols(df)
df = remove_high_missing(df)
df = remove_low_cardinality(df)
df = remove_high_correlation(df, threshold=0.8, safras_train=safras_train_val)
df = remove_misused_columns(df)

print(f'Shape apos limpezas: {df.shape}')

## 5. Split Temporal e Amostragem Estratificada

In [None]:
# =============================================================================
# 5. TEMPORAL SPLIT + STRATIFIED SAMPLING
# =============================================================================
# Note: FLAG_INSTALACAO was already filtered in Spark (cell 3) - df holds
# post-installation rows only.

# Remove exact duplicate rows BEFORE splitting. De-duplicating each split
# separately (as done previously) lets two identical rows land in different
# splits, leaking sample rows into the OOS set.
df = df.drop_duplicates()

# Split the safras by position: the first four feed train/val/OOS, the rest are OOT.
safras_ord = sorted(df['SAFRA'].unique())
safras_train_oos = safras_ord[:4]  # 202410-202501
safras_oot = safras_ord[4:]        # 202502-202503

df_4_safras = df[df['SAFRA'].isin(safras_train_oos)]
df_oot_full = df[df['SAFRA'].isin(safras_oot)]

# Stratified 25% sample per (SAFRA, FPD). Each group is sampled with the same
# seed, which reproduces groupby.apply(sample) without relying on apply()
# operating on the grouping columns (deprecated in pandas 2.2).
df_sample = pd.concat([
    grp.sample(frac=0.25, random_state=42)
    for _, grp in df_4_safras.groupby(['SAFRA', 'FPD'])
])
# Everything from the four safras that was not sampled becomes the OOS set;
# this relies on df having a unique index, which still holds after the drops above.
df_oos = df_4_safras.drop(df_sample.index)

# Reset indices
df_sample = df_sample.reset_index(drop=True)
df_oos = df_oos.reset_index(drop=True)
df_oot = df_oot_full.reset_index(drop=True)

# Release the original df (no longer needed after the split)
del df, df_4_safras, df_oot_full
gc.collect()

print(f'Shapes finais:')
print(f'  Sample (train+val): {df_sample.shape}')
print(f'  OOS:                {df_oos.shape}')
print(f'  OOT:                {df_oot.shape}')

In [None]:
# =============================================================================
# 5.1 CHECK VOLUMES PER SAFRA AND TARGET
# =============================================================================
# For each split, print the row count of every (SAFRA, FPD) combination,
# sorted by SAFRA and then FPD.
for name, data in [('Sample', df_sample), ('OOS', df_oos), ('OOT', df_oot)]:
    print(f'\n--- {name} ---')
    print(data[['SAFRA', 'FPD']].value_counts().sort_index().to_string())

## 6. Separacao Treino / Validacao e Preparacao X/Y

In [None]:
# =============================================================================
# 6. TRAIN / VALIDATION SPLIT AND X / Y PREPARATION
# =============================================================================
# NOTE(review): these safras are hard-coded here while other cells derive them
# from SAFRAS / the sorted data - confirm the two definitions stay in sync.
safras_train = [202410, 202411, 202412]
safras_val = [202501]

df_train = df_sample[df_sample['SAFRA'].isin(safras_train)]
df_val = df_sample[df_sample['SAFRA'].isin(safras_val)]

# Only the target is removed: X still carries NUM_CPF and SAFRA, which the
# feature lists built in the next cell exclude.
X_train = df_train.drop(columns=['FPD'])
y_train = df_train['FPD']

X_val = df_val.drop(columns=['FPD'])
y_val = df_val['FPD']

# Train + validation combined for the final fit; both indices are reset
# together so X and y stay aligned.
X_train_final = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_train_final = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

# OOS and OOT
X_oos_agg = df_oos.drop(columns=['FPD'])
y_oos_agg = df_oos['FPD']

X_oot_agg = df_oot.drop(columns=['FPD'])
y_oot_agg = df_oot['FPD']

print(f'X_train: {X_train.shape}')
print(f'X_val:   {X_val.shape}')
print(f'X_train_final: {X_train_final.shape}')
print(f'X_oos: {X_oos_agg.shape}')
print(f'X_oot: {X_oot_agg.shape}')

## 7. Separacao de Variaveis por Tipo

In [None]:
# =============================================================================
# 7. NUMERIC AND CATEGORICAL VARIABLES
# =============================================================================
# Numeric features: every int32/int64/float32/float64 column except SAFRA.
# Columns of any other dtype (e.g. bool) end up in neither list.
num_features = [
    n for n in X_train.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns
    if n != 'SAFRA'
]
# Categorical features: every object/category column except the NUM_CPF key.
cat_features = [
    c for c in X_train.select_dtypes(include=['object', 'category']).columns
    if c != 'NUM_CPF'
]

print(f'Numericas: {len(num_features)}')
print(f'Categoricas: {len(cat_features)}')

## 8. Montagem dos Pipelines sklearn

In [None]:
# =============================================================================
# 8. PIPELINES
# =============================================================================

# --- Logistic Regression ---
# Numeric: median imputation followed by standard scaling.
numeric_pipe_rl = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical: most-frequent imputation, then normalized count encoding.
# NOTE(review): unlike the LightGBM encoder below, this one keeps the encoder's
# default handle_unknown / handle_missing settings - confirm this is intended.
categorical_pipe_rl = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', CountEncoder(normalize=True)),
])
preprocess_rl = ColumnTransformer([
    ('num', numeric_pipe_rl, num_features),
    ('cat', categorical_pipe_rl, cat_features),
])
# L1-penalised logistic regression with balanced class weights; C is tuned later.
pipeline_RL = Pipeline([
    ('prep', preprocess_rl),
    ('model', LogisticRegression(
        solver='liblinear', penalty='l1',
        max_iter=2000, tol=1e-3,
        class_weight='balanced', random_state=42,
    )),
])

# --- LightGBM ---
# Numeric: median imputation only (no scaling step).
numeric_pipe_lgbm = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])
categorical_pipe_lgbm = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)),
])
# Columns outside the two feature lists (NUM_CPF, SAFRA, ...) are dropped explicitly.
preprocess_lgbm = ColumnTransformer([
    ('num', numeric_pipe_lgbm, num_features),
    ('cat', categorical_pipe_lgbm, cat_features),
], remainder='drop')
# n_estimators and max_depth are left at their defaults here and tuned later.
pipeline_LGBM = Pipeline([
    ('prep', preprocess_lgbm),
    ('model', LGBMClassifier(
        objective='binary', boosting_type='gbdt',
        learning_rate=0.05, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, verbosity=-1,
    )),
])

print('Pipelines criados: RL (prep) + LGBM (prep)')

## 9. Ajuste de Hiperparametros

In [None]:
# =============================================================================
# 9.1 GRID SEARCH - LOGISTIC REGRESSION
# =============================================================================
# Only the inverse regularisation strength C is tuned.
param_grid_RL = {'model__C': [0.01, 0.05, 0.1, 0.5]}

# Shared 4-fold stratified CV; the LightGBM search below reuses this object.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid_RL = GridSearchCV(
    pipeline_RL, param_grid=param_grid_RL,
    scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3,
)
# The search is fitted on the validation safra only (cross-validated within it);
# the training safras take no part in hyperparameter selection.
grid_RL.fit(X_val, y_val)

print(f'Melhores HP RL: {grid_RL.best_params_}')
print(f'Melhor AUC RL:  {grid_RL.best_score_:.5f}')

In [None]:
# Final RL fit with the best params (train + val).
# The best C found above is applied to pipeline_RL, which is then refitted
# on the combined train + validation sample.
pipeline_RL.set_params(**grid_RL.best_params_)
pipeline_RL.fit(X_train_final, y_train_final)
print('RL treinado com train+val')

In [None]:
# =============================================================================
# 9.2 GRID SEARCH - LIGHTGBM
# =============================================================================
# 2 x 2 grid over the number of trees and the maximum depth.
param_grid_LGBM = {
    'model__n_estimators': [250, 500],
    'model__max_depth': [4, 7],
}

# Reuses the `cv` splitter defined in the RL grid-search cell.
# error_score='raise' makes a failing fit abort the search.
# NOTE(review): both GridSearchCV and the LGBM estimator use n_jobs=-1, which
# may oversubscribe the CPU - confirm against the cluster sizing.
grid_LGBM = GridSearchCV(
    pipeline_LGBM, param_grid=param_grid_LGBM,
    scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3, error_score='raise',
)
# As for RL, the search is fitted on the validation safra only.
grid_LGBM.fit(X_val, y_val)

print(f'Melhores HP LGBM: {grid_LGBM.best_params_}')
print(f'Melhor AUC LGBM:  {grid_LGBM.best_score_:.5f}')

In [None]:
# Final LGBM fit with the best params (train + val).
# The best n_estimators / max_depth found above are applied to pipeline_LGBM,
# which is then refitted on the combined train + validation sample.
pipeline_LGBM.set_params(**grid_LGBM.best_params_)
pipeline_LGBM.fit(X_train_final, y_train_final)
print('LGBM treinado com train+val')

## 10. Avaliacao por Safra (AUC e KS)

In [None]:
# =============================================================================
# 10. FUNCOES DE AVALIACAO
# =============================================================================

def ks_stat(y_true, y_score):
    """Kolmogorov-Smirnov statistic between the score distributions of the two classes.

    Parameters
    ----------
    y_true : array-like of 0/1
        Binary target (1 = bad). Series, arrays and lists are accepted.
    y_score : array-like of float
        Model score for each observation, same length as ``y_true``.

    Returns
    -------
    float
        Maximum absolute gap between the cumulative share of bads and of goods,
        evaluated at each distinct score. ``nan`` when either class is absent.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_score`` differ in length.
    """
    y = np.asarray(y_true, dtype=float).ravel()
    p = np.asarray(y_score, dtype=float).ravel()
    if y.shape[0] != p.shape[0]:
        raise ValueError('y_true and y_score must have the same length')

    n_bad = y.sum()
    n_good = y.shape[0] - n_bad
    if n_bad == 0 or n_good == 0:
        return float('nan')

    # A stable sort keeps the result deterministic.
    order = np.argsort(p, kind='mergesort')
    y_sorted = y[order]
    p_sorted = p[order]

    cum_bad = np.cumsum(y_sorted) / n_bad
    cum_good = np.cumsum(1.0 - y_sorted) / n_good

    # Compare the two CDFs only at the last row of each run of equal scores:
    # inside a tie the row order is arbitrary and would inflate the statistic.
    last_of_score = np.r_[p_sorted[1:] != p_sorted[:-1], True]
    return float(np.max(np.abs(cum_bad[last_of_score] - cum_good[last_of_score])))


def evaluation_auc_ks(X, y, pipe, name='', verbose=True):
    """Score ``X`` with ``pipe`` and return ``(AUC, KS)``, each rounded to 5 decimals.

    When ``verbose`` is true, the two metrics are also printed, labelled with ``name``.
    """
    positive_scores = pipe.predict_proba(X)[:, 1]
    auc = round(roc_auc_score(y, positive_scores), 5)
    ks = round(ks_stat(y, positive_scores), 5)
    if not verbose:
        return auc, ks
    print(f'AVALIACAO {name}: AUC={auc}, KS={ks}')
    print('-' * 40)
    return auc, ks


def filter_xy_by_safra(X, y, list_safras):
    """Keep the rows of ``X`` / ``y`` whose SAFRA is in ``list_safras``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a ``SAFRA`` column.
    y : pandas.Series
        Target sharing the index of ``X``.
    list_safras : list
        SAFRA values to keep.

    Returns
    -------
    tuple (pandas.DataFrame, pandas.Series)
        The filtered ``X`` and the matching ``y`` rows.
    """
    mask = X['SAFRA'].isin(list_safras)
    # The same boolean mask selects from both objects: X is filtered once, and
    # y cannot gain extra rows when the shared index holds repeated labels
    # (a label-based lookup would return every match per label).
    return X.loc[mask], y.loc[mask]


def _sanitize_mlflow_key(key):
    """Sanitiza nome de metrica para MLflow Fabric.

    Fabric MLflow rejeita backslash, barras e outros caracteres especiais.
    Mantem apenas alfanumericos, underscore e hifen.
    Colapsa underscores consecutivos em um unico.
    """
    safe = re.sub(r'[^a-zA-Z0-9_\-]', '_', key)
    safe = re.sub(r'_+', '_', safe)
    return safe.strip('_')


# Map of evaluation splits: display label -> list of SAFRA values it covers.
# The label prefix (TREINO / OOS / OOT) selects the dataset in
# generate_map_step_data; the list is the SAFRA filter applied to that dataset.
# "(CONS)" entries cover every safra of their dataset.
dict_safras = {
    'TREINO - 202410': [202410],
    'TREINO - 202411': [202411],
    'TREINO - 202412': [202412],
    'TREINO / VAL - 202501': [202501],
    'TREINO (CONS)': [202410, 202411, 202412, 202501],
    'OOS - 202410': [202410],
    'OOS - 202411': [202411],
    'OOS - 202412': [202412],
    'OOS - 202501': [202501],
    'OOS (CONS)': [202410, 202411, 202412, 202501],
    'OOT - 202502': [202502],
    'OOT - 202503': [202503],
    'OOT GERAL (CONS)': [202502, 202503],
}


def generate_map_step_data(X_train, y_train, X_oos, y_oos, X_oot, y_oot):
    """Map each evaluation split label to the ``{'X': ..., 'Y': ...}`` pair it reads from.

    TREINO labels point at the training data, OOS labels at the out-of-sample
    data and OOT labels at the out-of-time data.
    """
    labels_by_dataset = (
        (
            ('TREINO - 202410', 'TREINO - 202411', 'TREINO - 202412',
             'TREINO / VAL - 202501', 'TREINO (CONS)'),
            X_train, y_train,
        ),
        (
            ('OOS - 202410', 'OOS - 202411', 'OOS - 202412',
             'OOS - 202501', 'OOS (CONS)'),
            X_oos, y_oos,
        ),
        (
            ('OOT - 202502', 'OOT - 202503', 'OOT GERAL (CONS)'),
            X_oot, y_oot,
        ),
    )
    step_map = {}
    for labels, features, target in labels_by_dataset:
        for label in labels:
            step_map[label] = {'X': features, 'Y': target}
    return step_map


def log_safra_metrics_mlflow(model_name, dict_safras, X_train, y_train,
                              X_oos, y_oos, X_oot, y_oot, pipeline):
    """Evaluate ``pipeline`` on every split of ``dict_safras`` and log AUC / KS to MLflow.

    Parameters
    ----------
    model_name : str
        Prefix of the logged metric names (``<model_name>_AUC_<split>``).
    dict_safras : dict
        Split label -> list of SAFRA values.
    X_train, y_train, X_oos, y_oos, X_oot, y_oot
        Datasets that the split labels are resolved against.
    pipeline
        Fitted estimator exposing ``predict_proba``.

    Returns
    -------
    dict
        Split label -> ``{'AUC': ..., 'KS': ...}``.

    Must be called inside an active MLflow run.
    """
    # The label -> dataset map does not depend on the loop variable: build it once.
    map_data = generate_map_step_data(X_train, y_train, X_oos, y_oos, X_oot, y_oot)

    results = {}
    for key, safras in dict_safras.items():
        X_f, y_f = filter_xy_by_safra(map_data[key]['X'], map_data[key]['Y'], safras)
        auc, ks = evaluation_auc_ks(X_f, y_f, pipeline, key)
        # Split labels contain spaces, slashes and parentheses; sanitise them
        # before using them as metric names.
        safe_key = _sanitize_mlflow_key(key)
        mlflow.log_metric(f'{model_name}_AUC_{safe_key}', auc)
        mlflow.log_metric(f'{model_name}_KS_{safe_key}', ks)
        results[key] = {'AUC': auc, 'KS': ks}
    return results

print('Funcoes de avaliacao carregadas')

In [None]:
# =============================================================================
# 10.1 EVALUATION - LOGISTIC REGRESSION
# =============================================================================
with mlflow.start_run(run_name='LogisticRegression_Baseline') as run_rl:
    params_rl = pipeline_RL.named_steps['model'].get_params()
    mlflow.log_param('model_type', 'LogisticRegression')
    mlflow.log_param('penalty', params_rl.get('penalty', 'l1'))
    mlflow.log_param('solver', params_rl.get('solver'))
    mlflow.log_param('C', params_rl.get('C'))
    mlflow.log_param('max_iter', params_rl.get('max_iter'))
    # Count only the columns the ColumnTransformer consumes: X_train_final also
    # carries NUM_CPF and SAFRA, which are excluded from both feature lists.
    mlflow.log_param('n_features', len(num_features) + len(cat_features))
    mlflow.log_param('n_samples_train', len(X_train_final))

    print('Avaliacao RL por base:')
    results_rl = log_safra_metrics_mlflow(
        'RL', dict_safras, X_train_final, y_train_final,
        X_oos_agg, y_oos_agg, X_oot_agg, y_oot_agg, pipeline_RL,
    )

    mlflow.sklearn.log_model(pipeline_RL, 'model_logistic_regression')

    # Export the coefficients, ordered by absolute magnitude, as a run artifact.
    coefs = pipeline_RL.named_steps['model'].coef_[0]
    feat_names = pipeline_RL.named_steps['prep'].get_feature_names_out()
    df_coefs = pd.DataFrame({'feature': feat_names, 'coef': coefs}).sort_values('coef', key=abs, ascending=False)
    df_coefs.to_csv('/tmp/lr_coefficients.csv', index=False)
    mlflow.log_artifact('/tmp/lr_coefficients.csv', 'feature_analysis')

    print(f'\nMLflow Run ID (RL): {run_rl.info.run_id}')

In [None]:
# =============================================================================
# 10.2 EVALUATION - LIGHTGBM
# =============================================================================
with mlflow.start_run(run_name='LightGBM_Baseline') as run_lgbm:
    params_lgbm = pipeline_LGBM.named_steps['model'].get_params()
    mlflow.log_param('model_type', 'LightGBM')
    mlflow.log_param('n_estimators', params_lgbm.get('n_estimators'))
    mlflow.log_param('max_depth', params_lgbm.get('max_depth'))
    mlflow.log_param('learning_rate', params_lgbm.get('learning_rate'))
    mlflow.log_param('num_leaves', params_lgbm.get('num_leaves'))
    mlflow.log_param('boosting_type', params_lgbm.get('boosting_type', 'gbdt'))
    # Count only the columns the ColumnTransformer consumes: X_train_final also
    # carries NUM_CPF and SAFRA, which are excluded from both feature lists.
    mlflow.log_param('n_features', len(num_features) + len(cat_features))
    mlflow.log_param('n_samples_train', len(X_train_final))

    print('Avaliacao LGBM por base:')
    results_lgbm = log_safra_metrics_mlflow(
        'LGBM', dict_safras, X_train_final, y_train_final,
        X_oos_agg, y_oos_agg, X_oot_agg, y_oot_agg, pipeline_LGBM,
    )

    mlflow.sklearn.log_model(pipeline_LGBM, 'model_lightgbm')

    # Export the feature importances, highest first, as a run artifact.
    lgbm_model = pipeline_LGBM.named_steps['model']
    feat_names_lgbm = pipeline_LGBM.named_steps['prep'].get_feature_names_out()
    df_importance = pd.DataFrame({
        'feature': feat_names_lgbm,
        'importance': lgbm_model.feature_importances_,
    }).sort_values('importance', ascending=False)
    df_importance.to_csv('/tmp/lgbm_feature_importance.csv', index=False)
    mlflow.log_artifact('/tmp/lgbm_feature_importance.csv', 'feature_analysis')

    # Horizontal bar chart of the 30 most important features, most important on top.
    fig, ax = plt.subplots(figsize=(10, 8))
    top_30 = df_importance.head(30)
    ax.barh(range(len(top_30)), top_30['importance'].values)
    ax.set_yticks(range(len(top_30)))
    ax.set_yticklabels(top_30['feature'].values, fontsize=8)
    ax.set_xlabel('Feature Importance')
    ax.set_title('Top 30 Features - LightGBM')
    ax.invert_yaxis()
    plt.tight_layout()
    fig.savefig('/tmp/lgbm_feature_importance.png', dpi=150)
    mlflow.log_artifact('/tmp/lgbm_feature_importance.png', 'plots')
    plt.show()
    # Release the figure so it does not stay alive in the kernel.
    plt.close(fig)

    print(f'\nMLflow Run ID (LGBM): {run_lgbm.info.run_id}')

## 11. Analise Incremental de KS por Fonte

In [None]:
# =============================================================================
# 11. FUNCAO AUXILIAR: UPDATE PIPELINE + VAR CLASSIFICATION
# =============================================================================

def _var_num(col):
    """Extrai numero de colunas var_XX para comparacao numerica."""
    m = re.search(r'var_(\d+)', col)
    return int(m.group(1)) if m else -1


def update_pipeline(X, name_model):
    """Build an unfitted pipeline whose preprocessing matches the columns of ``X``.

    ``name_model == 'Reg Log'`` yields the L1 logistic-regression pipeline; any
    other value yields the LightGBM pipeline. Tuned hyperparameters are read
    from the module-level ``grid_RL`` / ``grid_LGBM`` search results.
    """
    numeric_cols = [
        col for col in X.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns
        if col != 'SAFRA'
    ]
    categorical_cols = [
        col for col in X.select_dtypes(include=['object', 'category']).columns
        if col != 'NUM_CPF'
    ]

    is_logistic = name_model == 'Reg Log'

    # Numeric branch: median imputation, plus scaling for the linear model only.
    numeric_steps = [('imputer', SimpleImputer(strategy='median'))]
    if is_logistic:
        numeric_steps.append(('scaler', StandardScaler()))

    # Categorical branch: the LightGBM encoder sets explicit unknown/missing handling.
    if is_logistic:
        encoder = CountEncoder(normalize=True)
    else:
        encoder = CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)
    categorical_steps = [('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', encoder)]

    transformers = [
        ('num', Pipeline(numeric_steps), numeric_cols),
        ('cat', Pipeline(categorical_steps), categorical_cols),
    ]

    if is_logistic:
        preprocessor = ColumnTransformer(transformers)
        estimator = LogisticRegression(
            solver='liblinear', penalty='l1', max_iter=2000,
            C=grid_RL.best_params_['model__C'],
            tol=1e-3, class_weight='balanced', random_state=42,
        )
    else:
        preprocessor = ColumnTransformer(transformers, remainder='drop')
        estimator = LGBMClassifier(
            objective='binary', boosting_type='gbdt', learning_rate=0.05,
            max_depth=grid_LGBM.best_params_['model__max_depth'],
            n_estimators=grid_LGBM.best_params_['model__n_estimators'],
            colsample_bytree=0.8, random_state=42, n_jobs=-1, verbosity=-1,
        )

    return Pipeline([('prep', preprocessor), ('model', estimator)])


def current_step(step_num):
    """Return the label of the cumulative feature set used at step ``step_num`` (0-6).

    Raises ``KeyError`` for any other step number.
    """
    sources = ['SC 1', 'SC 2', 'CAD', 'TELCO', 'REC', 'PAG', 'FAT']
    # Step i accumulates every source up to and including position i.
    labels = {
        position: ' + '.join(sources[:position + 1])
        for position in range(len(sources))
    }
    return labels[step_num]

print('Funcoes auxiliares carregadas')

In [None]:
# =============================================================================
# 11.1 INCREMENTAL KS PER DATA SOURCE
# =============================================================================
# Columns named explicitly may have been removed by the cleaning step, so every
# source list is restricted to the columns that still exist (same guard as the
# SHAP cell); otherwise the column selection below raises KeyError.
available_cols = set(X_train_final.columns)

feat_score_1 = [c for c in ['TARGET_SCORE_01'] if c in available_cols]
feat_score_2 = [c for c in ['TARGET_SCORE_02'] if c in available_cols]
feats_cadastro = [x for x in X_train_final.columns if 'var_' in x and _var_num(x) <= 25] + [
    c for c in ['STATUSRF', 'UF', 'REGIAO', 'DIAS_VAR_12'] if c in available_cols
]
feats_telco = [x for x in X_train_final.columns if 'var_' in x and _var_num(x) >= 26]
feats_recargas = [x for x in X_train_final.columns if x.startswith('REC_')]
feats_pagamentos = [x for x in X_train_final.columns if x.startswith('PAG_')]
feats_faturamento = [x for x in X_train_final.columns if x.startswith('FAT_')]

list_sources = [feat_score_1, feat_score_2, feats_cadastro, feats_telco, feats_recargas, feats_pagamentos, feats_faturamento]
# NUM_CPF and SAFRA are carried along (SAFRA is needed for the per-safra
# filter) but are excluded from the model inputs by update_pipeline.
list_features = ['NUM_CPF', 'SAFRA']
list_dict_results = []
list_models = ['Reg Log', 'LGBM']

for idx, source in enumerate(list_sources):
    # Accumulate the sources; a column matching two sources is added only once
    # so the selection never contains duplicate column names.
    list_features.extend(c for c in source if c not in list_features)

    X_tr_filt = X_train_final[list_features]
    X_oos_filt = X_oos_agg[list_features]
    X_oot_filt = X_oot_agg[list_features]

    # The label -> dataset map depends only on the current feature set: build it
    # once per step instead of once per model and split.
    map_data = generate_map_step_data(
        X_tr_filt, y_train_final,
        X_oos_filt, y_oos_agg,
        X_oot_filt, y_oot_agg,
    )

    for model in list_models:
        pipe = update_pipeline(X_tr_filt, name_model=model)
        pipe.fit(X_tr_filt, y_train_final)

        for key in dict_safras:
            X_f, y_f = filter_xy_by_safra(map_data[key]['X'], map_data[key]['Y'], dict_safras[key])
            auc, ks = evaluation_auc_ks(X_f, y_f, pipe, key, verbose=False)
            list_dict_results.append({
                'MODELO': model, 'CONJ FEATURES': current_step(idx),
                'BASE': key, 'AUC': auc, 'KS': ks,
            })

df_results_ks_inc = pd.DataFrame(list_dict_results)

print('\n--- TREINO (CONS) ---')
display(df_results_ks_inc[df_results_ks_inc['BASE'] == 'TREINO (CONS)'])

print('\n--- OOT GERAL (CONS) ---')
display(df_results_ks_inc[df_results_ks_inc['BASE'] == 'OOT GERAL (CONS)'])

## 12. Feature Selection (SHAP TreeExplainer)

Abordagem baseada em SHAP — mede contribuicao real de cada feature
para a predicao de FPD, capturando interacoes que IV univariado nao detecta.

Pipeline:
1. Treinar LGBM com todas as features, exceto as de cadastro (removidas por baixo ganho historico)
2. Calcular SHAP values via TreeExplainer
3. Ranking global por mean(|SHAP|)
4. Selecionar features que acumulam 90% da importancia
5. Gerar visualizacoes por book para apresentacao

In [None]:
# =============================================================================
# 12.1 TREINAR LGBM PARA SHAP (TODAS FEATURES)
# =============================================================================
# Remove cadastro (baixo ganho historico) — manter score, telco, books
import re


def _var_num(col):
    """Return the integer that follows ``var_`` in ``col``, or -1 when there is none."""
    m = re.search(r'var_(\d+)', col)
    return int(m.group(1)) if m else -1


# Cadastro group = columns named var_<n> with n <= 25, plus a few named columns.
# The lower bound rejects the -1 "no number" sentinel returned by _var_num, so a
# column that merely contains 'var_' without digits is not classified as cadastro
# (and therefore is not silently dropped from the SHAP model).
# NOTE: this rebinds the notebook-level name `feats_cadastro`, which the
# incremental-KS cell also reads.
feats_cadastro = [
    x for x in X_train_final.columns
    if 'var_' in x and 0 <= _var_num(x) <= 25
] + ['STATUSRF', 'UF', 'REGIAO', 'DIAS_VAR_12']
# Keep only the names that actually exist in the training frame.
feats_cadastro = [c for c in feats_cadastro if c in X_train_final.columns]

# Everything except the cadastro group and the key columns feeds the SHAP model.
cols_excluded = set(feats_cadastro) | {'NUM_CPF', 'SAFRA'}
feats_to_use = [c for c in X_train_final.columns if c not in cols_excluded]

X_shap = X_train_final[feats_to_use].copy()

# Split the columns by dtype; anything outside these dtypes is dropped by the
# ColumnTransformer below (remainder='drop').
num_shap = list(X_shap.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns)
cat_shap = list(X_shap.select_dtypes(include=['object', 'category']).columns)

from sklearn.impute import SimpleImputer
from category_encoders import CountEncoder

# Numeric branch: median imputation only.
num_pipe_shap = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
# Categorical branch: mode imputation followed by normalized count encoding.
cat_pipe_shap = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)),
])
prep_shap = ColumnTransformer(
    transformers=[
        ('num', num_pipe_shap, num_shap),
        ('cat', cat_pipe_shap, cat_shap),
    ],
    remainder='drop',
)

lgbm_shap_params = dict(
    objective='binary',
    boosting_type='gbdt',
    learning_rate=0.05,
    n_estimators=300,
    max_depth=7,
    colsample_bytree=0.8,
    subsample=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)
lgbm_shap = LGBMClassifier(**lgbm_shap_params)

pipe_shap = Pipeline(steps=[('prep', prep_shap), ('model', lgbm_shap)])
pipe_shap.fit(X_shap, y_train_final)

print(f'LGBM treinado para SHAP: {len(feats_to_use)} features ({len(num_shap)} num + {len(cat_shap)} cat)')
print(f'Cadastro removido: {len(feats_cadastro)} features')

In [None]:
# =============================================================================
# 12.2 CALCULAR SHAP VALUES (TreeExplainer)
# =============================================================================
# Apply the fitted preprocessing so the explainer sees exactly the matrix the
# LGBM model was trained on.
X_shap_transformed = pipe_shap.named_steps['prep'].transform(X_shap)

try:
    transformed_names = pipe_shap.named_steps['prep'].get_feature_names_out()
except Exception:
    # Fallback when a step cannot report output names: the ColumnTransformer
    # emits the 'num' block first and the 'cat' block second.
    transformed_names = num_shap + cat_shap

explainer = shap.TreeExplainer(pipe_shap.named_steps['model'])
shap_values = explainer.shap_values(X_shap_transformed)

# The return format depends on the SHAP version:
#   * list [class_0, class_1]                      -> take the FPD=1 entry
#   * ndarray (n_samples, n_features, n_outputs)   -> take the last output (FPD=1)
#   * ndarray (n_samples, n_features)              -> already the positive class
if isinstance(shap_values, list):
    shap_vals = shap_values[1]  # FPD=1
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_vals = shap_values[:, :, -1]  # FPD=1
else:
    shap_vals = shap_values

print(f'SHAP values: {shap_vals.shape}')

# Global ranking by mean(|SHAP|), most important feature first.
mean_abs_shap = np.abs(shap_vals).mean(axis=0)
df_shap_ranking = pd.DataFrame({
    'feature_transformed': list(transformed_names),
    'mean_abs_shap': mean_abs_shap,
}).sort_values('mean_abs_shap', ascending=False).reset_index(drop=True)

# Map back to the original column name by stripping the '<transformer>__'
# prefix added by the ColumnTransformer (no-op for the fallback names).
df_shap_ranking['feature'] = df_shap_ranking['feature_transformed'].apply(
    lambda x: x.split('__', 1)[-1] if '__' in x else x
)

def get_book_label(feat):
    """Return the book label for a feature name, decided by its prefix.

    Names starting with ``REC_``, ``PAG_`` or ``FAT_`` map to their book;
    every other name falls into the base (telco + score) group.
    """
    prefix_to_book = (
        ('REC_', 'Recarga (REC_)'),
        ('PAG_', 'Pagamento (PAG_)'),
        ('FAT_', 'Faturamento (FAT_)'),
    )
    matches = (book for prefix, book in prefix_to_book if feat.startswith(prefix))
    return next(matches, 'Base (Telco+Score)')

# Enrich the ranking with book, share of total importance, running share and rank.
df_shap_ranking['book'] = df_shap_ranking['feature'].map(get_book_label)
total_shap = df_shap_ranking['mean_abs_shap'].sum()
df_shap_ranking['pct_importance'] = df_shap_ranking['mean_abs_shap'].div(total_shap)
df_shap_ranking['cumulative_pct'] = df_shap_ranking['pct_importance'].cumsum()
df_shap_ranking['rank'] = np.arange(1, len(df_shap_ranking) + 1)

print(f'\nTop 20 Features (SHAP):')
ranking_cols = ['rank', 'feature', 'book', 'mean_abs_shap', 'pct_importance', 'cumulative_pct']
print(df_shap_ranking[ranking_cols].head(20).to_string(index=False))

# Share of total importance and feature count for each book.
print(f'\nContribuicao por Book:')
for book in ['Base (Telco+Score)', 'Recarga (REC_)', 'Pagamento (PAG_)', 'Faturamento (FAT_)']:
    book_rows = df_shap_ranking[df_shap_ranking['book'] == book]
    pct = book_rows['mean_abs_shap'].sum() / total_shap * 100
    n = len(book_rows)
    print(f'  {book}: {pct:.1f}% ({n} features)')

In [None]:
# =============================================================================
# 12.3 VISUALIZACOES SHAP PARA APRESENTACAO
# =============================================================================
# One colour per book; the insertion order is also the display order.
BOOK_COLORS = dict([
    ('Base (Telco+Score)', '#607D8B'),
    ('Recarga (REC_)', '#2196F3'),
    ('Pagamento (PAG_)', '#FF9800'),
    ('Faturamento (FAT_)', '#9C27B0'),
])


def get_feature_color(feat):
    """Return the book colour for a raw or transformer-prefixed feature name.

    A name matches a book when it starts with the book prefix (``REC_``) or
    contains it after a ``__`` separator (``num__REC_...``). Unmatched names
    get the base colour.
    """
    for prefix, book in (
        ('REC_', 'Recarga (REC_)'),
        ('PAG_', 'Pagamento (PAG_)'),
        ('FAT_', 'Faturamento (FAT_)'),
    ):
        if feat.startswith(prefix) or ('__' + prefix) in feat:
            return BOOK_COLORS[book]
    return BOOK_COLORS['Base (Telco+Score)']


book_names = list(BOOK_COLORS)

# --- FIGURE 1: SHAP summary beeswarm ---
fig, ax = plt.subplots(figsize=(12, 14))
shap.summary_plot(
    shap_vals,
    X_shap_transformed,
    feature_names=list(transformed_names),
    max_display=40,
    show=False,
)
plt.title('SHAP Summary Plot - Top 40 Features (FPD)', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('/tmp/shap_summary_beeswarm.png', dpi=150, bbox_inches='tight')
plt.show()

# --- FIGURE 2: top 30 features, bars coloured by book ---
top30 = df_shap_ranking.head(30)
colors = list(map(get_feature_color, top30['feature']))
bar_positions = range(len(top30) - 1, -1, -1)  # rank 1 is drawn at the top

fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(bar_positions, top30['mean_abs_shap'].values,
        color=colors, alpha=0.85, edgecolor='white', linewidth=0.5)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top30['feature'].values, fontsize=9)
ax.set_xlabel('mean(|SHAP value|)', fontsize=11)
ax.set_title('Top 30 Features por Importancia SHAP - Colorido por Book', fontsize=14, fontweight='bold')

# Percentage label just to the right of every bar.
label_offset = top30['mean_abs_shap'].max() * 0.01
for y_pos, bar_value, share in zip(bar_positions, top30['mean_abs_shap'], top30['pct_importance']):
    ax.text(bar_value + label_offset, y_pos, f"{share:.1%}", va='center', fontsize=8, color='#333')

legend_elements = [Patch(facecolor=book_color, label=book_label)
                   for book_label, book_color in BOOK_COLORS.items()]
ax.legend(handles=legend_elements, loc='lower right', fontsize=10)
ax.grid(True, alpha=0.2, axis='x')
plt.tight_layout()
plt.savefig('/tmp/shap_top30_by_book.png', dpi=150, bbox_inches='tight')
plt.show()
print('Graficos 1-2 salvos')

In [None]:
# =============================================================================
# 12.4 VISUALIZACOES POR BOOK (Top 15 cada)
# =============================================================================
# --- FIGURE 3: top 15 features per book (2x2 grid, one panel per book) ---
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Top 15 Features por Book - Importancia SHAP', fontsize=16, fontweight='bold')

for idx, book_name in enumerate(book_names):
    # Row-major placement of the four books on the 2x2 grid.
    ax = axes[idx // 2][idx % 2]
    book_data = df_shap_ranking[df_shap_ranking['book'] == book_name].head(15)
    if book_data.empty:
        # Leave a titled, empty panel when the book has no features.
        ax.set_title(f'{book_name} - sem features')
        continue
    color = BOOK_COLORS[book_name]
    # Reversed positions so the most important feature is drawn at the top.
    ax.barh(range(len(book_data)-1, -1, -1), book_data['mean_abs_shap'].values,
            color=color, alpha=0.85, edgecolor='white', linewidth=0.5)
    ax.set_yticks(range(len(book_data)-1, -1, -1))
    ax.set_yticklabels(book_data['feature'].values, fontsize=8)
    ax.set_xlabel('mean(|SHAP value|)', fontsize=9)
    # Book-level totals are computed over ALL features of the book, not just the top 15.
    total_book = df_shap_ranking[df_shap_ranking['book'] == book_name]['mean_abs_shap'].sum()
    pct_global = total_book / total_shap * 100
    n_total = len(df_shap_ranking[df_shap_ranking['book'] == book_name])
    ax.set_title(f'{book_name}\n{n_total} features | {pct_global:.1f}% importancia global',
                fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.2, axis='x')
    # Numeric label to the right of each bar.
    for i, (_, row) in enumerate(book_data.iterrows()):
        ax.text(row['mean_abs_shap'] + max(book_data['mean_abs_shap'])*0.02,
                len(book_data)-1-i, f"{row['mean_abs_shap']:.4f}", va='center', fontsize=7, color='#555')

# Leave room at the top for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig('/tmp/shap_top15_per_book.png', dpi=150, bbox_inches='tight')
plt.show()

# --- FIGURE 4: aggregated contribution per book (pie + stacked bar) ---
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Contribuicao Agregada por Book', fontsize=14, fontweight='bold')
ax1 = axes[0]
# Parallel lists, one entry per book, in book_names order.
book_pcts = []
book_labels = []
book_colors_list = []
for bn in book_names:
    tb = df_shap_ranking[df_shap_ranking['book'] == bn]['mean_abs_shap'].sum()
    pct = tb / total_shap * 100
    book_pcts.append(pct)
    n = len(df_shap_ranking[df_shap_ranking['book'] == bn])
    book_labels.append(f'{bn}\n({n} vars, {pct:.1f}%)')
    book_colors_list.append(BOOK_COLORS[bn])
# Slice labels are suppressed on the pie itself and shown in the legend instead.
ax1.pie(book_pcts, labels=None, colors=book_colors_list,
        autopct='%1.1f%%', startangle=90, pctdistance=0.7,
        textprops={'fontsize': 11, 'fontweight': 'bold'})
ax1.legend(book_labels, loc='center left', bbox_to_anchor=(-0.3, 0.5), fontsize=9)
ax1.set_title('% Importancia SHAP por Book', fontsize=12)
ax2 = axes[1]
# Running height of the stack; updated in place after each book is drawn.
bottom = np.zeros(1)
for bn in book_names:
    tb = df_shap_ranking[df_shap_ranking['book'] == bn]['mean_abs_shap'].sum()
    ax2.bar(0, tb, bottom=bottom, color=BOOK_COLORS[bn], label=bn, edgecolor='white', linewidth=0.5, width=0.5)
    mid = bottom[0] + tb / 2
    pct = tb / total_shap * 100
    # Only segments above 5% get an inline percentage label.
    if pct > 5:
        ax2.text(0, mid, f'{pct:.1f}%', ha='center', va='center', fontsize=11, fontweight='bold', color='white')
    bottom += tb
ax2.set_ylabel('Sum mean(|SHAP|)', fontsize=11)
ax2.set_title('Importancia SHAP Empilhada', fontsize=12)
ax2.set_xticks([0]); ax2.set_xticklabels(['Todas Features'])
ax2.legend(loc='upper right', fontsize=9); ax2.grid(True, alpha=0.2, axis='y')
plt.tight_layout()
plt.savefig('/tmp/shap_book_contribution.png', dpi=150, bbox_inches='tight')
plt.show()
print('Graficos 3-4 salvos')

In [None]:
# =============================================================================
# 12.5 SELECAO FINAL — PARETO 90% CUMULATIVO
# =============================================================================
CUMULATIVE_THRESHOLD = 0.90

# --- Selection -------------------------------------------------------------
# df_shap_ranking is ordered by descending importance, so cumulative_pct is
# non-decreasing and the selection is a prefix of the table: every feature at
# or below the threshold plus the first one that crosses it (so the selected
# set actually reaches the threshold). The count is computed once and reused
# by the plot, keeping the figure and the selected list in agreement.
n_below = int((df_shap_ranking['cumulative_pct'] <= CUMULATIVE_THRESHOLD).sum())
n_90 = min(n_below + 1, len(df_shap_ranking))
# Positional mask: does not depend on the index labels being 0..n-1.
selected_mask = pd.Series(np.arange(len(df_shap_ranking)) < n_90, index=df_shap_ranking.index)
df_selected = df_shap_ranking[selected_mask]
final_set_features = df_selected['feature'].unique().tolist()

# --- Pareto chart ----------------------------------------------------------
fig, ax = plt.subplots(figsize=(14, 6))
x_range = range(1, len(df_shap_ranking) + 1)
colors_cum = [get_feature_color(f) for f in df_shap_ranking['feature']]
ax.bar(x_range, df_shap_ranking['pct_importance'].values, color=colors_cum, alpha=0.7, width=1.0)
# Secondary axis carries the cumulative curve and the threshold guides.
ax2 = ax.twinx()
ax2.plot(x_range, df_shap_ranking['cumulative_pct'].values, color='red', linewidth=2)
ax2.axhline(y=CUMULATIVE_THRESHOLD, color='red', linestyle='--', alpha=0.5)
ax2.axvline(x=n_90, color='green', linestyle='--', alpha=0.7)
ax2.annotate(f'{n_90} features\n= {CUMULATIVE_THRESHOLD:.0%} importancia',
            xy=(n_90, CUMULATIVE_THRESHOLD), xytext=(n_90 + 20, CUMULATIVE_THRESHOLD - 0.10),
            arrowprops=dict(arrowstyle='->', color='green'),
            fontsize=11, fontweight='bold', color='green')
ax.set_xlabel('Feature (rank SHAP)', fontsize=11)
ax.set_ylabel('Importancia Individual (%)', fontsize=11)
ax2.set_ylabel('Cumulativo (%)', fontsize=11)
ax.set_title(f'Curva de Pareto - {n_90} features capturam {CUMULATIVE_THRESHOLD:.0%} da importancia SHAP',
            fontsize=14, fontweight='bold')
from matplotlib.lines import Line2D
legend_elems = [Patch(facecolor=c, label=l) for l, c in BOOK_COLORS.items()]
legend_elems.append(Line2D([0],[0], color='red', linewidth=2, label='Cumulativo'))
ax.legend(handles=legend_elems, loc='center right', fontsize=9)
plt.tight_layout()
plt.savefig('/tmp/shap_pareto_cumulative.png', dpi=150, bbox_inches='tight')
plt.show()

# --- Report ----------------------------------------------------------------
print(f'Features selecionadas (SHAP >= {CUMULATIVE_THRESHOLD:.0%}): {len(final_set_features)}')
print(f'Importancia capturada: {df_selected["pct_importance"].sum():.1%}')
print(f'\nBreakdown por book:')
for bn in book_names:
    n_sel = len(df_selected[df_selected['book'] == bn])
    n_all = len(df_shap_ranking[df_shap_ranking['book'] == bn])
    print(f'  {bn}: {n_sel} / {n_all} selecionadas')

In [None]:
# =============================================================================
# 12.6 EXPORT SHAP ARTIFACTS
# =============================================================================
import os

# Local staging directory for everything logged to MLflow below.
shap_dir = '/tmp/shap_artifacts'
os.makedirs(shap_dir, exist_ok=True)

ranking_csv_path = f'{shap_dir}/shap_feature_ranking.csv'
selected_pkl_path = f'{shap_dir}/selected_features_shap.pkl'

# Full ranking as CSV.
df_shap_ranking.to_csv(ranking_csv_path, index=False)

# Selected feature list as pickle.
with open(selected_pkl_path, 'wb') as f:
    pickle.dump(final_set_features, f)

# Figures written by the plotting cells; any that are missing are skipped.
shap_plot_paths = (
    '/tmp/shap_summary_beeswarm.png',
    '/tmp/shap_top30_by_book.png',
    '/tmp/shap_top15_per_book.png',
    '/tmp/shap_book_contribution.png',
    '/tmp/shap_pareto_cumulative.png',
)

# One MLflow run holds the tags, parameters, plots and selection files.
with mlflow.start_run(run_name='SHAP_Feature_Selection'):
    for tag_key, tag_value in (('task', 'feature_selection'), ('method', 'SHAP_TreeExplainer')):
        mlflow.set_tag(tag_key, tag_value)

    for param_key, param_value in (
        ('n_features_total', len(feats_to_use)),
        ('n_features_selected', len(final_set_features)),
        ('cumulative_threshold', CUMULATIVE_THRESHOLD),
    ):
        mlflow.log_param(param_key, param_value)

    for fig_path in shap_plot_paths:
        if os.path.exists(fig_path):
            mlflow.log_artifact(fig_path, 'shap_plots')

    for selection_file in (ranking_csv_path, selected_pkl_path):
        mlflow.log_artifact(selection_file, 'feature_selection')

print(f'Artifacts salvos em {shap_dir}')
print(f'Features finais para modelo: {len(final_set_features)}')
print(f'\nfinal_set_features pronto para Cell 13 (Modelo Final)')

## 13. Modelo Final com Features Selecionadas

In [None]:
# =============================================================================
# 13. TREINO E AVALIACAO FINAL + MLFLOW
# =============================================================================
# The key columns stay in the frames because the per-safra filter receives the
# same X frames.
# NOTE(review): confirm update_pipeline keeps NUM_CPF / SAFRA out of the model inputs.
dimensions = ['NUM_CPF', 'SAFRA']
final_features_with_dims = final_set_features + [d for d in dimensions if d not in final_set_features]

X_tr_final_fs = X_train_final[final_features_with_dims]
X_oos_fs = X_oos_agg[final_features_with_dims]
X_oot_fs = X_oot_agg[final_features_with_dims]

models = ['Reg Log', 'LGBM']
list_results_fs = []
best_model_name = None
best_model_pipeline = None
# Start below any attainable KS so the first model evaluated on the OOT base is
# always recorded, even if its KS is exactly 0.
best_ks_oot = float('-inf')

for model_name in models:
    run_name = f"Final_{model_name.replace(' ', '')}_FeatureSelection"

    # One MLflow run per model family.
    with mlflow.start_run(run_name=run_name) as run:
        pipe = update_pipeline(X_tr_final_fs, name_model=model_name)
        pipe.fit(X_tr_final_fs, y_train_final)

        mlflow.log_param('model_type', model_name)
        mlflow.log_param('n_features', len(final_set_features))
        # final_set_features comes from the SHAP selection of section 12, so log
        # that method (same label as the tag on the SHAP_Feature_Selection run).
        mlflow.log_param('feature_selection', 'SHAP_TreeExplainer')
        # Only scalar hyper-parameters are logged; nested objects are skipped.
        model_params = pipe.named_steps['model'].get_params()
        for k, v in model_params.items():
            if isinstance(v, (int, float, str, bool)):
                mlflow.log_param(f'model__{k}', v)

        print(f"\n{'='*60}")
        print(f'MODELO FINAL: {model_name} ({len(final_set_features)} features)')
        print(f"{'='*60}")

        for key in dict_safras:
            # NOTE(review): called with identical arguments on every iteration;
            # if generate_map_step_data is side-effect free it can be hoisted
            # above both loops.
            map_data = generate_map_step_data(
                X_tr_final_fs, y_train_final,
                X_oos_fs, y_oos_agg,
                X_oot_fs, y_oot_agg,
            )
            X_f, y_f = filter_xy_by_safra(map_data[key]['X'], map_data[key]['Y'], dict_safras[key])
            auc, ks = evaluation_auc_ks(X_f, y_f, pipe, key, verbose=True)

            safe_key = _sanitize_mlflow_key(key)
            mlflow.log_metric(f'AUC_{safe_key}', auc)
            mlflow.log_metric(f'KS_{safe_key}', ks)
            list_results_fs.append({'MODEL': model_name, 'BASE': key, 'AUC': auc, 'KS': ks})

            # Champion = highest KS on the consolidated OOT base.
            if key == 'OOT GERAL (CONS)' and ks > best_ks_oot:
                best_ks_oot = ks
                best_model_name = model_name
                best_model_pipeline = pipe

        mlflow.sklearn.log_model(pipe, f"model_final_{model_name.replace(' ', '_').lower()}")
        print(f'\nMLflow Run ID ({model_name}): {run.info.run_id}')

df_results_feat_selection = pd.DataFrame(list_results_fs)

# Fail here, with a clear message, instead of letting later cells crash on a
# None pipeline.
if best_model_pipeline is None:
    raise RuntimeError(
        "Nenhum modelo foi avaliado na base 'OOT GERAL (CONS)'; "
        "verifique as chaves de dict_safras."
    )

print(f"\n{'='*60}")
print(f'MELHOR MODELO (KS OOT): {best_model_name} -- KS={best_ks_oot:.5f}')
print(f'Benchmark KS: 33.1% (0.331)')
print(f"{'='*60}")

## 14. Swap Analysis

**Nota**: A swap analysis compara a estabilidade de ranking entre OOT1 e OOT2.
Como sao periodos diferentes com clientes distintos, a comparacao e feita por
**posicao ordinal** (top N% por score), nao por cliente individual.
O objetivo e verificar se o modelo produz distribuicoes de ranking similares
entre safras — swap% alto indica instabilidade temporal do modelo.

In [None]:
# =============================================================================
# 14. SWAP ANALYSIS (OOT1 vs OOT2)
# =============================================================================

def swap_analysis(df_ref, df_new, score_col='score', target_col='FPD', top_pct=0.1):
    """Compare the top-scored rows of two frames by their index labels.

    The top ``n = int(len(df_ref) * top_pct)`` rows by ``score_col`` are taken
    from each frame; swap-in / swap-out are the index labels present in one top
    set but not in the other, expressed as a percentage of ``n``.

    Args:
        df_ref: reference frame; its length defines ``n``.
        df_new: frame compared against the reference.
        score_col: column used to rank rows (highest first).
        target_col: optional target column used for the top-set default rate.
        top_pct: fraction of ``df_ref`` rows that forms the top set.

    Returns:
        dict with keys ``swap_in_%``, ``swap_out_%``, ``n_top``,
        ``default_rate_ref`` and ``default_rate_new``. The two default rates
        are ``None`` when ``target_col`` is missing or when ``n`` is 0.
    """
    n = int(len(df_ref) * top_pct)
    if n == 0:
        # Same keys as the regular result so callers can index it unconditionally.
        return {
            'swap_in_%': 0,
            'swap_out_%': 0,
            'n_top': 0,
            'default_rate_ref': None,
            'default_rate_new': None,
        }
    ref_top = df_ref.nlargest(n, score_col)
    new_top = df_new.nlargest(n, score_col)
    ref_idx = set(ref_top.index)
    new_idx = set(new_top.index)
    swap_in = len(new_idx - ref_idx)
    swap_out = len(ref_idx - new_idx)
    return {
        'swap_in_%': round(swap_in / n * 100, 2),
        'swap_out_%': round(swap_out / n * 100, 2),
        'n_top': n,
        'default_rate_ref': round(ref_top[target_col].mean(), 4) if target_col in ref_top.columns else None,
        'default_rate_new': round(new_top[target_col].mean(), 4) if target_col in new_top.columns else None,
    }


print(f'Modelo utilizado: {best_model_name}')
print('=' * 60)

# Score the OOT frame with the champion pipeline and keep safra, score and target together.
X_oot_feat = X_oot_fs.copy()
scores_oot = best_model_pipeline.predict_proba(X_oot_feat)[:, 1]
df_swap = X_oot_feat[['SAFRA']].assign(score=scores_oot, FPD=y_oot_agg.values)

safras_oot_list = sorted(df_swap['SAFRA'].unique())
print(f'Safras OOT: {safras_oot_list}')

if len(safras_oot_list) >= 2:
    # Only the two earliest OOT safras are compared. The per-safra frames do
    # not depend on the cut-off, so they are built once.
    safra_ref, safra_new = safras_oot_list[0], safras_oot_list[1]
    df_ref = df_swap.loc[df_swap['SAFRA'] == safra_ref].reset_index(drop=True)
    df_new = df_swap.loc[df_swap['SAFRA'] == safra_new].reset_index(drop=True)

    for top_pct in [0.05, 0.10, 0.20, 0.30]:
        swap = swap_analysis(df_ref, df_new, top_pct=top_pct)
        print(f"\nTop {top_pct:.0%} (n={swap['n_top']}):")
        print(f"  Swap-in:  {swap['swap_in_%']:.1f}%")
        print(f"  Swap-out: {swap['swap_out_%']:.1f}%")
        print(f"  Default Rate OOT1 ({safra_ref}): {swap['default_rate_ref']}")
        print(f"  Default Rate OOT2 ({safra_new}): {swap['default_rate_new']}")

## 15. Visualizacoes Finais

In [None]:
# =============================================================================
# 15. VISUALIZACOES (KS Curve, Score Dist, Confusion Matrix, KS por Safra)
# =============================================================================
# 2x2 dashboard for the champion model on the OOT set:
#   [0,0] KS curve   [0,1] score distribution
#   [1,0] confusion matrix   [1,1] KS per OOT/OOS base
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# --- Panel 1: KS curve ---
ax1 = axes[0, 0]
y_oot_vis = y_oot_agg.values
scores_vis = best_model_pipeline.predict_proba(X_oot_fs)[:, 1]
# Sort by score so the cumulative sums below are empirical CDFs of each class.
df_ks = pd.DataFrame({'y': y_oot_vis, 'score': scores_vis}).sort_values('score')
df_ks['cum_good'] = (1 - df_ks['y']).cumsum() / (1 - df_ks['y']).sum()
df_ks['cum_bad'] = df_ks['y'].cumsum() / df_ks['y'].sum()
df_ks['ks_diff'] = np.abs(df_ks['cum_bad'] - df_ks['cum_good'])
ks_max_idx = df_ks['ks_diff'].idxmax()
ks_max_val = df_ks.loc[ks_max_idx, 'ks_diff']

x_axis = np.linspace(0, 1, len(df_ks))
ax1.plot(x_axis, df_ks['cum_good'].values, label='Bons (FPD=0)', color='blue')
ax1.plot(x_axis, df_ks['cum_bad'].values, label='Maus (FPD=1)', color='red')
# idxmax returns an index label; get_loc converts it to the sorted position.
ax1.axvline(x=x_axis[df_ks.index.get_loc(ks_max_idx)], color='green', linestyle='--', alpha=0.7)
ax1.set_title(f'KS Curve - OOT ({best_model_name}) | KS = {ks_max_val:.4f}')
ax1.set_xlabel('Populacao (%)')
ax1.set_ylabel('CDF')
ax1.legend()
ax1.grid(True, alpha=0.3)

# --- Panel 2: score distribution per class ---
ax2 = axes[0, 1]
ax2.hist(scores_vis[y_oot_vis == 0], bins=50, alpha=0.6, label='Bons (FPD=0)', color='blue', density=True)
ax2.hist(scores_vis[y_oot_vis == 1], bins=50, alpha=0.6, label='Maus (FPD=1)', color='red', density=True)
ax2.set_title(f'Distribuicao de Scores - OOT ({best_model_name})')
ax2.set_xlabel('Score (Prob FPD)')
ax2.set_ylabel('Densidade')
ax2.legend()
ax2.grid(True, alpha=0.3)

# --- Panel 3: confusion matrix at a 0.5 cut-off ---
ax3 = axes[1, 0]
y_pred = (scores_vis >= 0.5).astype(int)
# labels=[0, 1] pins the matrix to 2x2 regardless of which classes occur, which
# is what the fixed tick labels and the annotation loop below assume.
cm = confusion_matrix(y_oot_vis, y_pred, labels=[0, 1])
ax3.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax3.set_title(f'Confusion Matrix - OOT ({best_model_name})')
ax3.set_ylabel('Real')
ax3.set_xlabel('Predito')
ax3.set_xticks([0, 1])
ax3.set_yticks([0, 1])
ax3.set_xticklabels(['Bom (0)', 'Mau (1)'])
ax3.set_yticklabels(['Bom (0)', 'Mau (1)'])
for i in range(2):
    for j in range(2):
        # White text on dark cells, black on light ones.
        ax3.text(j, i, f'{cm[i, j]:,}', ha='center', va='center',
                color='white' if cm[i, j] > cm.max() / 2 else 'black', fontsize=12)

# --- Panel 4: KS per OOT/OOS base for the champion model ---
ax4 = axes[1, 1]
df_oot_res = df_results_feat_selection[
    (df_results_feat_selection['MODEL'] == best_model_name) &
    (df_results_feat_selection['BASE'].str.contains('OOT|OOS'))
]
if not df_oot_res.empty:
    bars = ax4.bar(range(len(df_oot_res)), df_oot_res['KS'].values, color='steelblue')
    ax4.set_xticks(range(len(df_oot_res)))
    ax4.set_xticklabels(df_oot_res['BASE'].values, rotation=45, ha='right', fontsize=8)
    ax4.axhline(y=0.331, color='red', linestyle='--', label='Benchmark KS = 33.1%')
    ax4.set_title(f'KS por Base - {best_model_name}')
    ax4.set_ylabel('KS')
    ax4.legend()
    ax4.grid(True, alpha=0.3)
    for bar, val in zip(bars, df_oot_res['KS'].values):
        ax4.text(bar.get_x() + bar.get_width() / 2, val + 0.005, f'{val:.3f}', ha='center', fontsize=8)

plt.tight_layout()
fig.savefig('/tmp/final_model_visualizations.png', dpi=150, bbox_inches='tight')

# The figure is logged in its own MLflow run.
with mlflow.start_run(run_name='Final_Visualizations'):
    mlflow.log_artifact('/tmp/final_model_visualizations.png', 'plots')

plt.show()

print(f'\nModelo final: {best_model_name}')
print(f'KS OOT: {best_ks_oot:.5f}')
print(f'Benchmark: 0.331')

## 16. Export do Modelo

In [None]:
# =============================================================================
# 16. EXPORT DO MODELO PARA MLFLOW REGISTRY
# =============================================================================
sys.path.insert(0, '/lakehouse/default/Files/projeto-final/5-treinamento-modelos')
from export_model import export_model, promote_to_production

feature_names_export = [f for f in final_set_features if f not in ['NUM_CPF', 'SAFRA']]

# Frames without the key columns: used as the export test set and for refits.
X_oot_exp = X_oot_fs.drop(columns=['NUM_CPF', 'SAFRA'], errors='ignore')
X_oos_exp = X_oos_fs.drop(columns=['NUM_CPF', 'SAFRA'], errors='ignore')
X_tr_exp = X_tr_final_fs.drop(columns=['NUM_CPF', 'SAFRA'], errors='ignore')

print(f'Modelo a exportar: {best_model_name}')
print(f'Features: {len(feature_names_export)}')
print(f'KS OOT (mesmo pipeline da selecao): {best_ks_oot:.5f}')
print('=' * 60)


def _pipeline_for_export(model_label):
    """Return the champion pipeline when it is `model_label`, otherwise refit one.

    NOTE(review): a refit pipeline is trained on X_tr_exp (no NUM_CPF / SAFRA)
    but is scored below on X_oot_fs / X_oos_fs, which still carry those
    columns — confirm update_pipeline ignores them.
    """
    if best_model_name == model_label:
        return best_model_pipeline
    refit_pipe = update_pipeline(X_tr_exp, name_model=model_label)
    refit_pipe.fit(X_tr_exp, y_train_final)
    return refit_pipe


def _ks_auc(pipe, X, y):
    """Score `X` once and return (KS, AUC) computed from the same probabilities."""
    proba = pipe.predict_proba(X)[:, 1]
    return ks_stat(y, proba), roc_auc_score(y, proba)


# --- LGBM (main model) ---
pipe_lgbm_export = _pipeline_for_export('LGBM')

ks_oot_lgbm, auc_oot_lgbm = _ks_auc(pipe_lgbm_export, X_oot_fs, y_oot_agg)
ks_oos_lgbm, auc_oos_lgbm = _ks_auc(pipe_lgbm_export, X_oos_fs, y_oos_agg)

# Safra 202501 metrics are optional: reported only when that safra has rows in OOS.
ks_oos_202501 = None
auc_oos_202501 = None
X_oos_501, y_oos_501 = filter_xy_by_safra(X_oos_fs, y_oos_agg, [202501])
if len(y_oos_501) > 0:
    ks_oos_202501, auc_oos_202501 = _ks_auc(pipe_lgbm_export, X_oos_501, y_oos_501)

metrics_lgbm = {
    'ks_oot': ks_oot_lgbm, 'auc_oot': auc_oot_lgbm,
    'ks_oos': ks_oos_lgbm, 'auc_oos': auc_oos_lgbm,
    'gini_oot': (2 * auc_oot_lgbm - 1) * 100,
    'gini_oos': (2 * auc_oos_lgbm - 1) * 100,
}
if ks_oos_202501 is not None:
    metrics_lgbm['ks_oos_202501'] = ks_oos_202501
    metrics_lgbm['auc_oos_202501'] = auc_oos_202501
    metrics_lgbm['gini_oos_202501'] = (2 * auc_oos_202501 - 1) * 100

result_lgbm = export_model(
    pipeline=pipe_lgbm_export, model_name='lgbm_baseline',
    X_test=X_oot_exp, y_test=y_oot_agg,
    feature_names=feature_names_export,
    metrics_dict=metrics_lgbm,
)
print(f"\nLGBM exportado: {result_lgbm['registered_name']}")
print(f"  MLflow Run ID: {result_lgbm['mlflow_run_id']}")
print(f"  PKL: {result_lgbm['pkl_path']}")
print(f"  KS OOT: {ks_oot_lgbm:.5f}")
print(f"  KS OOS: {ks_oos_lgbm:.5f}")
if ks_oos_202501 is not None:
    print(f"  KS OOS 202501: {ks_oos_202501:.5f}")

# --- LR (benchmark) ---
pipe_lr_export = _pipeline_for_export('Reg Log')

ks_oot_lr, auc_oot_lr = _ks_auc(pipe_lr_export, X_oot_fs, y_oot_agg)

result_lr = export_model(
    pipeline=pipe_lr_export, model_name='logistic_regression_l1',
    X_test=X_oot_exp, y_test=y_oot_agg,
    feature_names=feature_names_export,
    metrics_dict={
        'ks_oot': ks_oot_lr, 'auc_oot': auc_oot_lr,
        'gini_oot': (2 * auc_oot_lr - 1) * 100,
    },
)
print(f"\nLR exportado: {result_lr['registered_name']}")
print(f"  MLflow Run ID: {result_lr['mlflow_run_id']}")

print(f"\n{'='*60}")
print('Export concluido! Modelos registrados no MLflow em Staging.')
print('Proximo passo: executar scoring_batch e validacao_deploy')