In [None]:
# Instalacao de pacote para uso de Target/CountEncoder
!pip install category-encoders==2.6.3

In [None]:
%%configure

{
    "conf": {
        "spark.driver.maxResultSize": "8g",
        "spark.driver.memory": "54g",
        "spark.driver.cores": 8,
        "spark.executor.instances": 0,
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.sql.execution.arrow.pyspark.selfDestruct.enabled": "true"
    }
}

# Modelo Baseline de Risco - Telecom (v5 - LGBM + SHAP + Swap Analysis Corrigido)

Este notebook implementa o **modelo otimizado de risco de inadimplencia (FPD)** para clientes Claro.

### Evolucao
- **v1**: PySpark para limpeza + Pandas para modelagem (original)
- **v2**: Python puro com `deltalake` (delta-rs) - falha no Fabric
- **v3**: Spark read + Pandas modeling, RL + LGBM + KS Incremental + SHAP
- **v4**: Versao otimizada - apenas LGBM, SHAP feature selection, sem KS Incremental
- **v5 (este)**: Swap analysis corrigido (modelo vs FPD real) + visualizacoes avancadas

### Correcao Swap Analysis (v5 vs v4)
- **v4 (bug)**: Comparava populacoes DIFERENTES (OOT1 vs OOT2) por indice de linha → swap-in = swap-out sempre
- **v5 (correto)**: Compara ranking do MODELO vs FPD REAL na MESMA populacao
  - **Swap-in**: Clientes no top X% do modelo que NAO sao maus reais (falsos alarmes)
  - **Swap-out**: Maus reais que o modelo NAO captura no top X% (escapados)
  - **Overlap**: Acertos — maus reais capturados pelo modelo

### Principais etapas
1. Leitura do Gold Feature Store via `spark.read.format("delta")`
2. Limpeza de dados (missing, correlacao, leakage)
3. Split temporal: Treino (202410-202412), Val (202501), OOS (75% restantes das safras 202410-202501, fora da amostra), OOT (202502-203503)
4. Amostragem estratificada 25% por (SAFRA, FPD)
5. GridSearch LGBM + Treino final
6. Avaliacao baseline (AUC, KS por safra)
7. Feature selection SHAP TreeExplainer (90% cumulative)
8. Modelo final LGBM com features selecionadas
9. **Swap analysis correto (modelo vs FPD real)**
10. **Analise por decis + Capture Rate + PSI**
11. **16 visualizacoes avancadas (salvas em lakehouse + MLflow)**
12. Export via MLflow Registry

In [None]:
# =============================================================================
# 1. IMPORTS E CONFIGURACAO
# =============================================================================
import pandas as pd
import numpy as np
import re
import gc
import os
import shap
import pickle
import mlflow
import mlflow.sklearn
import matplotlib
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, IntegerType, DoubleType, LongType

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_recall_curve
from lightgbm import LGBMClassifier
from category_encoders import CountEncoder

warnings.filterwarnings('ignore')

# FIX: sklearn >= 1.6 renamed force_all_finite -> ensure_all_finite.
# The LightGBM sklearn wrapper still uses the old name, causing TypeError on
# predict_proba. The wrapper below strips both spellings before delegating, so
# the underlying check runs with its own default for that option.
import lightgbm.sklearn as _lgbm_sklearn


def _patched_lgbm_check(*args, **kwargs):
    """Delegate to the original LightGBM array check without the finite-check keyword."""
    kwargs.pop('force_all_finite', None)
    kwargs.pop('ensure_all_finite', None)
    return _orig_check(*args, **kwargs)


# Install the patch at most once: re-running this cell must not wrap the wrapper
# again, and a LightGBM build that does not expose `_LGBMCheckArray` is left
# untouched instead of failing with AttributeError.
_current_lgbm_check = getattr(_lgbm_sklearn, '_LGBMCheckArray', None)
if _current_lgbm_check is not None and not getattr(_current_lgbm_check, '_finite_kwarg_patched', False):
    _orig_check = _current_lgbm_check
    _patched_lgbm_check._finite_kwarg_patched = True
    _lgbm_sklearn._LGBMCheckArray = _patched_lgbm_check

# Config centralizado do pipeline
import sys; sys.path.insert(0, '/lakehouse/default/Files/projeto-final')
from config.pipeline_config import (
    PATH_FEATURE_STORE, EXPERIMENT_NAME, SAFRAS,
    LEAKAGE_BLACKLIST, TARGET_COLUMNS
)

# ---- V5 CONSTANTS ----
# Directory where the v5 figures are written; created on import if missing.
OUTPUT_DIR_V5 = "/lakehouse/default/Files/projeto-final/docs/analytics/v5"
os.makedirs(OUTPUT_DIR_V5, exist_ok=True)
DPI = 150  # resolution passed to plt.savefig
SWAP_CUTOFFS = [0.05, 0.10, 0.20, 0.30]  # population fractions used as top-X% cutoffs in the swap analysis

# Shared colour palette for the notebook's plots.
COLORS = {
    "blue": "#2196F3", "orange": "#FF9800", "green": "#4CAF50",
    "red": "#F44336", "purple": "#9C27B0", "gray": "#607D8B"
}
sns.set_style("whitegrid")
sns.set_palette([COLORS["blue"], COLORS["orange"], COLORS["green"], COLORS["red"]])

print('Imports OK — v5 (LGBM + SHAP + Swap Corrigido + check_array patched)')

In [None]:
# =============================================================================
# 2. MLFLOW SETUP
# =============================================================================
# Select the experiment named in the pipeline config; every run opened below
# is recorded under it.
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.autolog(disable=True)  # Manual logging control to avoid conflicts

print(f'MLflow experiment: {EXPERIMENT_NAME}')
print(f'Tracking URI: {mlflow.get_tracking_uri()}')

In [None]:
# =============================================================================
# 3. OPTIMIZED READ OF THE GOLD FEATURE STORE (Spark -> Pandas)
# =============================================================================
# Enable Arrow-based Spark -> pandas conversion for this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.selfDestruct.enabled", "true")

print(f'Lendo feature store de: {PATH_FEATURE_STORE}\n')

df_spark = spark.read.format("delta").load(PATH_FEATURE_STORE)
n_original = df_spark.count()
print(f'Original: {n_original:,} rows x {len(df_spark.columns)} cols')

# Drop audit + leakage columns in Spark (only those actually present)
cols_audit = ['_execution_id', '_data_inclusao', '_data_alteracao_silver', 'DT_PROCESSAMENTO']
cols_drop = [c for c in cols_audit + LEAKAGE_BLACKLIST if c in df_spark.columns]
if cols_drop:
    df_spark = df_spark.drop(*cols_drop)
    print(f'Drop {len(cols_drop)} colunas (audit+leakage): {cols_drop}')

# Keep only rows with FLAG_INSTALACAO == 1 (filtered in Spark), then drop the flag
# NOTE(review): n_pos is derived as n_original minus the rows with flag == 0;
# rows with a null flag are also removed by the == 1 filter but are not counted
# here — confirm the flag is never null.
n_reprovados = 0
if 'FLAG_INSTALACAO' in df_spark.columns:
    n_reprovados = df_spark.filter(F.col('FLAG_INSTALACAO') == 0).count()
    df_spark = df_spark.filter(F.col('FLAG_INSTALACAO') == 1).drop('FLAG_INSTALACAO')
    n_pos = n_original - n_reprovados
    print(f'FLAG_INSTALACAO: {n_original:,} -> {n_pos:,} ({n_reprovados:,} reprovados removidos)')
else:
    n_pos = n_original

# Downcast types in a single .select() (keeps the query plan flat)
# NOTE(review): Long -> Int narrows 64-bit values to 32 bits — confirm that no
# LongType column (e.g. an identifier) can exceed the 32-bit range.
cast_exprs = []
n_double, n_long = 0, 0
for field in df_spark.schema.fields:
    if isinstance(field.dataType, DoubleType):
        cast_exprs.append(F.col(field.name).cast(FloatType()).alias(field.name))
        n_double += 1
    elif isinstance(field.dataType, LongType):
        cast_exprs.append(F.col(field.name).cast(IntegerType()).alias(field.name))
        n_long += 1
    else:
        cast_exprs.append(F.col(field.name))
df_spark = df_spark.select(*cast_exprs)
print(f'Cast tipos: {n_double} Double->Float, {n_long} Long->Int')

# Chunked conversion: one toPandas() call per SAFRA instead of one for the whole table
safras_disponiveis = sorted([row.SAFRA for row in df_spark.select('SAFRA').distinct().collect()])
print(f'\nSAFRAs: {safras_disponiveis} | Colunas: {len(df_spark.columns)}')
print('Convertendo por SAFRA...')

chunks = []
for safra in safras_disponiveis:
    chunk = df_spark.filter(F.col('SAFRA') == safra).toPandas()
    mem_mb = chunk.memory_usage(deep=True).sum() / 1e6
    print(f'  SAFRA {safra}: {len(chunk):,} rows | {mem_mb:.0f} MB')
    chunks.append(chunk)
    gc.collect()

# Stitch the per-SAFRA frames together under a fresh 0..n-1 index
df = pd.concat(chunks, ignore_index=True)
del chunks
gc.collect()

print(f'\nDataset carregado:')
print(f'  Shape: {df.shape}')
print(f'  Memory: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB')

## 4. Limpeza de Dados (Pandas puro)

In [None]:
# =============================================================================
# 4.1 FUNCOES DE LIMPEZA
# =============================================================================

def clean_empty_keys(df):
    """Return only the rows in which both key columns, NUM_CPF and SAFRA, are filled."""
    key_columns = ['NUM_CPF', 'SAFRA']
    has_all_keys = df[key_columns].notna().all(axis=1)
    return df[has_all_keys]

def convert_cep3_uf_regiao(df):
    """Replace CEP_3_digitos with the derived UF (state) and REGIAO (macro-region).

    The state is resolved from the first two digits of the CEP prefix using the
    Correios CEP bands; where a 2-digit band is shared between states, the
    third digit (when available) decides. Unmapped or missing prefixes become
    'OUTROS' in both columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain a 'CEP_3_digitos' column (string or numeric).

    Returns
    -------
    pandas.DataFrame
        `df` itself when the column is absent; otherwise a new frame without
        'CEP_3_digitos' and with 'UF' and 'REGIAO' added. The input frame is
        not modified.
    """
    if 'CEP_3_digitos' not in df.columns:
        return df

    # Inclusive 2-digit prefix bands per state. Compared with a hand-written
    # table this covers the whole of SP (01-19) and RJ (20-28) and assigns
    # 56 -> PE, 63 -> CE, 68 -> PA and 72/73 -> DF.
    uf_bands = [
        (1, 19, 'SP'), (20, 28, 'RJ'), (29, 29, 'ES'), (30, 39, 'MG'),
        (40, 48, 'BA'), (49, 49, 'SE'), (50, 56, 'PE'), (57, 57, 'AL'),
        (58, 58, 'PB'), (59, 59, 'RN'), (60, 63, 'CE'), (64, 64, 'PI'),
        (65, 65, 'MA'), (66, 68, 'PA'), (69, 69, 'AM'), (70, 73, 'DF'),
        (74, 76, 'GO'), (77, 77, 'TO'), (78, 78, 'MT'), (79, 79, 'MS'),
        (80, 87, 'PR'), (88, 89, 'SC'), (90, 99, 'RS'),
    ]
    uf_by_prefix2 = {
        f'{prefix:02d}': uf
        for first, last, uf in uf_bands
        for prefix in range(first, last + 1)
    }
    # 3-digit prefixes whose state differs from the rest of their 2-digit band.
    uf_by_prefix3 = {
        '689': 'AP', '693': 'RR', '699': 'AC',
        '728': 'GO', '729': 'GO', '737': 'GO', '738': 'GO', '739': 'GO',
        '768': 'RO', '769': 'RO',
    }
    regiao_by_uf = {
        'SP': 'SUDESTE', 'RJ': 'SUDESTE', 'ES': 'SUDESTE', 'MG': 'SUDESTE',
        'BA': 'NORDESTE', 'SE': 'NORDESTE', 'PE': 'NORDESTE', 'AL': 'NORDESTE',
        'PB': 'NORDESTE', 'RN': 'NORDESTE', 'CE': 'NORDESTE', 'PI': 'NORDESTE',
        'MA': 'NORDESTE',
        'PA': 'NORTE', 'AP': 'NORTE', 'AM': 'NORTE', 'RR': 'NORTE',
        'AC': 'NORTE', 'RO': 'NORTE', 'TO': 'NORTE',
        'DF': 'CENTRO-OESTE', 'GO': 'CENTRO-OESTE', 'MT': 'CENTRO-OESTE',
        'MS': 'CENTRO-OESTE',
        'PR': 'SUL', 'SC': 'SUL', 'RS': 'SUL',
    }

    raw = df['CEP_3_digitos']
    if pd.api.types.is_numeric_dtype(raw):
        # Numeric storage drops leading zeros (12 stands for "012"), so the
        # value is restored to three digits before the prefix is taken.
        as_int = pd.to_numeric(raw, errors='coerce').replace([np.inf, -np.inf], np.nan).round().astype('Int64')
        cep = as_int.astype(str).str.zfill(3)
    else:
        cep = raw.astype(str).str.strip()

    uf = cep.str[:3].map(uf_by_prefix3)
    uf = uf.fillna(cep.str[:2].map(uf_by_prefix2)).fillna('OUTROS')
    regiao = uf.map(regiao_by_uf).fillna('OUTROS')

    # drop + assign builds a new frame, leaving the caller's frame untouched.
    return df.drop(columns=['CEP_3_digitos']).assign(UF=uf, REGIAO=regiao)

def adjust_and_drop_date_cols(df):
    """Turn date columns into day-difference features and drop the raw dates.

    The reference date is the SAFRA parsed as '%Y%m'. When present, 'var_12'
    (parsed as '%d/%m/%Y') yields DIAS_VAR_12 and 'PAG_DT_PRIMEIRA_FATURA'
    yields PAG_DIAS_DESDE_PRIMEIRA_FATURA, both as reference date minus the
    column, in days. Unparseable dates are coerced to missing. The helper
    columns are written onto `df` itself before the returned frame drops every
    datetime column.
    """
    has_var_12 = 'var_12' in df.columns
    has_first_invoice = 'PAG_DT_PRIMEIRA_FATURA' in df.columns

    if has_var_12:
        df['var_12'] = pd.to_datetime(df['var_12'], format='%d/%m/%Y', errors='coerce')

    safra_as_text = df['SAFRA'].astype(str)
    df['DATA_REF_SAFRA'] = pd.to_datetime(safra_as_text, format='%Y%m')

    if has_var_12:
        delta_var_12 = df['DATA_REF_SAFRA'] - df['var_12']
        df['DIAS_VAR_12'] = delta_var_12.dt.days

    if has_first_invoice:
        df['PAG_DT_PRIMEIRA_FATURA'] = pd.to_datetime(df['PAG_DT_PRIMEIRA_FATURA'], errors='coerce')
        delta_invoice = df['DATA_REF_SAFRA'] - df['PAG_DT_PRIMEIRA_FATURA']
        df['PAG_DIAS_DESDE_PRIMEIRA_FATURA'] = delta_invoice.dt.days

    # Every datetime-typed column goes, plus the reference date by name.
    to_remove = list(df.select_dtypes(include=['datetime64', 'datetimetz']).columns)
    to_remove += ['DATA_REF_SAFRA']
    present = [col for col in to_remove if col in df.columns]
    return df.drop(columns=present)

def remove_high_missing(df, threshold=0.75):
    """Drop every column whose share of missing values is at least `threshold`.

    Prints how many columns were removed and returns the reduced frame.
    """
    missing_share = df.isna().mean()
    sparse_cols = [col for col, share in missing_share.items() if share >= threshold]
    print(f'  High missing (>= {threshold:.0%}): {len(sparse_cols)} colunas removidas')
    return df.drop(columns=sparse_cols)

def remove_low_cardinality(df):
    """Drop columns that hold at most one distinct non-missing value.

    Prints how many columns were removed and returns the reduced frame.
    """
    distinct_counts = df.nunique()
    constant_cols = distinct_counts[distinct_counts <= 1].index.tolist()
    print(f'  Low cardinality (== 1): {len(constant_cols)} colunas removidas')
    return df.drop(columns=constant_cols)

def remove_high_correlation(df, threshold=0.8, safras_train=None, protected_cols=('SAFRA', 'NUM_CPF')):
    """Greedily drop numeric columns whose absolute correlation reaches `threshold`.

    Correlations are measured on a 25% sample stratified by (SAFRA, FPD),
    optionally restricted to `safras_train` so later periods do not influence
    the choice. While any pair in the upper triangle of the correlation matrix
    is at or above the threshold, the column holding the largest remaining
    value is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SAFRA' and 'FPD'.
    threshold : float
        Absolute correlation at or above which a column is removed.
    safras_train : list or None
        SAFRA values used to measure correlation; None uses every row.
    protected_cols : iterable of str
        Columns that are never candidates for removal. The target 'FPD' is
        always protected; keys such as SAFRA are protected by default because
        later steps index the frame by them.

    Returns
    -------
    pandas.DataFrame
        `df` without the removed columns (all rows kept).
    """
    base = df[df['SAFRA'].isin(safras_train)] if safras_train is not None else df

    # GroupBy.sample keeps all columns and avoids the deprecated
    # groupby.apply-on-grouping-columns pattern.
    sampled = base.groupby(['SAFRA', 'FPD']).sample(frac=0.25, random_state=42)

    excluded = {'FPD', *protected_cols}
    numeric = sampled.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns
    candidates = [c for c in numeric if c not in excluded]

    corr_matrix = sampled[candidates].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = []
    while True:
        col_max = upper.max()
        max_corr = col_max.max()
        # Written as "not >=" so an all-NaN matrix (zero or one candidate left)
        # ends the loop instead of falling through to an empty lookup.
        if not (max_corr >= threshold):
            break
        col_to_drop = col_max.sort_values(ascending=False).index[0]
        to_drop.append(col_to_drop)
        upper = upper.drop(index=col_to_drop, columns=col_to_drop)

    print(f'  High correlation (>= {threshold}): {len(to_drop)} colunas removidas')
    return df.drop(columns=to_drop)

def remove_misused_columns(df):
    """Drop a fixed list of columns that must not be used as features.

    Prints the columns actually found (if any) and returns the frame without them.
    """
    present = []
    for name in ('PROD', 'flag_mig2', 'FAT_VLR_FPD', 'FAT_FLAG_MIG2_AQUISICAO'):
        if name in df.columns:
            present.append(name)
    if len(present) > 0:
        print(f'  Misused columns removed: {present}')
    return df.drop(columns=present, errors='ignore')


print('Funcoes de limpeza carregadas')

In [None]:
# =============================================================================
# 4.2 APPLY CLEANING STEPS
# =============================================================================
# First four entries of the configured SAFRAS list; used below as the
# population on which feature correlation is measured.
# presumably 202410-202501 (train + validation) — confirm against pipeline_config
safras_train_val = SAFRAS[:4]

print('Aplicando limpezas...')
print(f'Shape original: {df.shape}')

# Each step returns a frame that replaces `df`, so this cell is not idempotent:
# re-running it requires reloading `df` first.
df = clean_empty_keys(df)
df = convert_cep3_uf_regiao(df)
df = adjust_and_drop_date_cols(df)
df = remove_high_missing(df)
df = remove_low_cardinality(df)
df = remove_high_correlation(df, threshold=0.8, safras_train=safras_train_val)
df = remove_misused_columns(df)

print(f'Shape apos limpezas: {df.shape}')

## 5. Split Temporal e Amostragem Estratificada

In [None]:
# =============================================================================
# 5. TEMPORAL SPLIT + STRATIFIED SAMPLING
# =============================================================================
safras_ord = sorted(df['SAFRA'].unique())
safras_train_oos = safras_ord[:4]  # 202410-202501
safras_oot = safras_ord[4:]        # 202502-202503

# Exact duplicate rows are removed BEFORE sampling, so the same record can never
# end up in both the training sample and the OOS hold-out.
df_4_safras = df[df['SAFRA'].isin(safras_train_oos)].drop_duplicates()
df_oot_full = df[df['SAFRA'].isin(safras_oot)].drop_duplicates()

# Stratified 25% sample per (SAFRA, FPD). GroupBy.sample keeps every column and
# the original row labels; groupby.apply operating on the grouping columns is
# deprecated since pandas 2.2.
df_sample = df_4_safras.groupby(['SAFRA', 'FPD']).sample(frac=0.25, random_state=42)
# OOS = the rows of the same four SAFRAs that were not sampled.
df_oos = df_4_safras.drop(df_sample.index)

df_sample = df_sample.reset_index(drop=True)
df_oos = df_oos.reset_index(drop=True)
df_oot = df_oot_full.reset_index(drop=True)

# Release the full frame and the intermediate slices.
del df, df_4_safras, df_oot_full
gc.collect()

print(f'Sample (train+val): {df_sample.shape}')
print(f'OOS:                {df_oos.shape}')
print(f'OOT:                {df_oot.shape}')

# Volume check: row counts per (SAFRA, FPD) in each split
for name, data in [('Sample', df_sample), ('OOS', df_oos), ('OOT', df_oot)]:
    print(f'\n--- {name} ---')
    print(data[['SAFRA', 'FPD']].value_counts().sort_index().to_string())

In [None]:
# =============================================================================
# 6. TRAIN / VALIDATION SPLIT AND X / Y SEPARATION
# =============================================================================
# Both lists select rows from the 25% stratified sample only.
safras_train = [202410, 202411, 202412]
safras_val = [202501]

df_train = df_sample[df_sample['SAFRA'].isin(safras_train)]
df_val = df_sample[df_sample['SAFRA'].isin(safras_val)]

X_train = df_train.drop(columns=['FPD'])
y_train = df_train['FPD']
X_val = df_val.drop(columns=['FPD'])
y_val = df_val['FPD']

# Train + validation stacked under a fresh index; used for the final fits.
X_train_final = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_train_final = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)

X_oos_agg = df_oos.drop(columns=['FPD'])
y_oos_agg = df_oos['FPD']
X_oot_agg = df_oot.drop(columns=['FPD'])
y_oot_agg = df_oot['FPD']

# Numeric and categorical feature lists; the SAFRA and NUM_CPF keys are excluded
# NOTE(review): NUM_CPF is only excluded from the categorical list — if it is
# stored as a number it would be picked up as a numeric feature; confirm its dtype.
num_features = [n for n in X_train.select_dtypes(include=['int32','int64','float32','float64']).columns if n != 'SAFRA']
cat_features = [c for c in X_train.select_dtypes(include=['object','category']).columns if c != 'NUM_CPF']

print(f'X_train_final: {X_train_final.shape}')
print(f'X_oos: {X_oos_agg.shape}')
print(f'X_oot: {X_oot_agg.shape}')
print(f'Numericas: {len(num_features)} | Categoricas: {len(cat_features)}')

## 7. Pipeline LGBM + GridSearch + Treino

In [None]:
# =============================================================================
# 7. PIPELINE + GRIDSEARCH + FINAL FIT
# =============================================================================
# Numeric columns: median imputation. Categorical columns: most-frequent
# imputation followed by a normalized count encoding.
numeric_pipe = Pipeline([('imputer', SimpleImputer(strategy='median'))])
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)),
])
# remainder='drop' discards every column not listed (e.g. the NUM_CPF / SAFRA keys).
preprocess_lgbm = ColumnTransformer([
    ('num', numeric_pipe, num_features),
    ('cat', categorical_pipe, cat_features),
], remainder='drop')

pipeline_LGBM = Pipeline([
    ('prep', preprocess_lgbm),
    ('model', LGBMClassifier(
        objective='binary', boosting_type='gbdt',
        learning_rate=0.05, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, verbosity=-1,
    )),
])

# GridSearch: 2 x 2 grid scored by ROC AUC over 4 shuffled stratified folds
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
param_grid_LGBM = {
    'model__n_estimators': [250, 500],
    'model__max_depth': [4, 7],
}
grid_LGBM = GridSearchCV(
    pipeline_LGBM, param_grid=param_grid_LGBM,
    scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3, error_score='raise',
)
# NOTE(review): the search is fitted on the validation SAFRA only (X_val), with
# random folds inside it; the training SAFRAs take no part in hyperparameter
# selection — confirm this is intended rather than a shortcut.
# NOTE(review): n_jobs=-1 is set on both the search and the model — check for
# CPU oversubscription on the driver.
grid_LGBM.fit(X_val, y_val)
print(f'Melhores HP: {grid_LGBM.best_params_}')
print(f'Melhor AUC:  {grid_LGBM.best_score_:.5f}')

# Final fit on train + validation using the best parameters found
pipeline_LGBM.set_params(**grid_LGBM.best_params_)
pipeline_LGBM.fit(X_train_final, y_train_final)
print('LGBM treinado com train+val')

In [None]:
# =============================================================================
# 8. FUNCOES DE AVALIACAO + AVALIACAO BASELINE + MLFLOW
# =============================================================================

def ks_stat(y_true, y_score):
    """Kolmogorov-Smirnov separation between the score distributions of bads and goods.

    Parameters
    ----------
    y_true : array-like or pandas.Series
        Binary labels (1 = bad, 0 = good).
    y_score : array-like
        Model scores, same length as `y_true`.

    Returns
    -------
    float
        Maximum absolute gap between the cumulative bad and cumulative good
        shares when walking the scores in ascending order. NaN when the input
        is empty or contains only one class.

    The gap is evaluated only after the last row of each distinct score: rows
    that share a score cannot be separated by any threshold, so measuring
    inside a tie would make the result depend on the arbitrary order of the
    tied rows and overstate the KS.
    """
    labels = np.asarray(y_true.values if hasattr(y_true, 'values') else y_true, dtype=float)
    scores = np.asarray(y_score, dtype=float)

    n_bad = labels.sum()
    n_good = (1 - labels).sum()
    if labels.size == 0 or n_bad == 0 or n_good == 0:
        return float('nan')

    order = np.argsort(scores, kind='mergesort')
    sorted_scores = scores[order]
    sorted_labels = labels[order]

    cum_bad = np.cumsum(sorted_labels) / n_bad
    cum_good = np.cumsum(1 - sorted_labels) / n_good

    # True at the last position of every run of equal scores.
    is_boundary = np.r_[sorted_scores[1:] != sorted_scores[:-1], True]
    return float(np.max(np.abs(cum_bad[is_boundary] - cum_good[is_boundary])))

def evaluation_auc_ks(X, y, pipe, name='', verbose=True):
    """Score `X` with `pipe` and return (AUC, KS), each rounded to 5 decimals.

    The positive-class probability (second column of predict_proba) is used as
    the score. When `verbose` is true the two metrics are printed with `name`.
    """
    positive_scores = pipe.predict_proba(X)[:, 1]
    metrics = (
        round(roc_auc_score(y, positive_scores), 5),
        round(ks_stat(y, positive_scores), 5),
    )
    if verbose:
        print(f'  {name}: AUC={metrics[0]}, KS={metrics[1]}')
    return metrics

def filter_xy_by_safra(X, y, list_safras):
    """Return the rows of `X` whose SAFRA is in `list_safras`, with the matching `y` values.

    `y` is selected by the index labels of the filtered `X`, so both inputs
    must share the same index.
    """
    X_selected = X.loc[X['SAFRA'].isin(list_safras)]
    return X_selected, y.loc[X_selected.index]

def _sanitize_mlflow_key(key):
    safe = re.sub(r'[^a-zA-Z0-9_\-]', '_', key)
    return re.sub(r'_+', '_', safe).strip('_')

# Map of evaluation splits: label -> SAFRA values scored under that label.
# The label prefix (TREINO / OOS / anything else) selects the dataset in
# generate_map_step_data.
dict_safras = {
    'TREINO - 202410': [202410],
    'TREINO - 202411': [202411],
    'TREINO - 202412': [202412],
    'TREINO / VAL - 202501': [202501],
    'TREINO (CONS)': [202410, 202411, 202412, 202501],
    'OOS - 202410': [202410],
    'OOS - 202411': [202411],
    'OOS - 202412': [202412],
    'OOS - 202501': [202501],
    'OOS (CONS)': [202410, 202411, 202412, 202501],
    'OOT - 202502': [202502],
    'OOT - 202503': [202503],
    'OOT GERAL (CONS)': [202502, 202503],
}


def generate_map_step_data(X_train, y_train, X_oos, y_oos, X_oot, y_oot):
    """Pair every label of `dict_safras` with the dataset it should be scored on.

    Labels containing 'TREINO' get the training data, labels containing 'OOS'
    the out-of-sample data, and all remaining labels the out-of-time data.
    Returns {label: {'X': features, 'Y': target}}.
    """
    def _dataset_for(label):
        if 'TREINO' in label:
            return X_train, y_train
        if 'OOS' in label:
            return X_oos, y_oos
        return X_oot, y_oot

    return {label: dict(zip(('X', 'Y'), _dataset_for(label))) for label in dict_safras}

# Baseline LGBM evaluation + MLflow logging
with mlflow.start_run(run_name='LightGBM_Baseline_v5') as run_lgbm:
    params = pipeline_LGBM.named_steps['model'].get_params()
    mlflow.log_param('model_type', 'LightGBM')
    mlflow.log_param('n_estimators', params.get('n_estimators'))
    mlflow.log_param('max_depth', params.get('max_depth'))
    mlflow.log_param('learning_rate', params.get('learning_rate'))
    # NOTE(review): this counts every column of X_train_final, including the
    # NUM_CPF / SAFRA keys that the preprocessor drops — confirm the intended count.
    mlflow.log_param('n_features', len(X_train_final.columns))
    mlflow.log_param('n_samples_train', len(X_train_final))

    print('Avaliacao LGBM baseline por base:')
    map_data = generate_map_step_data(X_train_final, y_train_final, X_oos_agg, y_oos_agg, X_oot_agg, y_oot_agg)
    # One AUC / KS pair per split label, logged under a sanitized metric name.
    for key in dict_safras:
        X_f, y_f = filter_xy_by_safra(map_data[key]['X'], map_data[key]['Y'], dict_safras[key])
        auc, ks = evaluation_auc_ks(X_f, y_f, pipeline_LGBM, key)
        safe_key = _sanitize_mlflow_key(key)
        mlflow.log_metric(f'LGBM_AUC_{safe_key}', auc)
        mlflow.log_metric(f'LGBM_KS_{safe_key}', ks)

    mlflow.sklearn.log_model(pipeline_LGBM, 'model_lightgbm_baseline')
    print(f'\nMLflow Run ID: {run_lgbm.info.run_id}')

## 9. Feature Selection (SHAP TreeExplainer)

Mede contribuicao real de cada feature para a predicao de FPD, capturando interacoes.
Pipeline: Treinar LGBM -> SHAP values -> Ranking global -> Selecionar 90% cumulativo.

In [None]:
# =============================================================================
# 9.1 TREINAR LGBM PARA SHAP + CALCULAR SHAP VALUES
# =============================================================================
def _var_num(col):
    m = re.search(r'var_(\d+)', col)
    return int(m.group(1)) if m else -1

# Columns left out of the SHAP model: names containing 'var_' whose number is
# <= 25, plus a fixed list of columns when present.
# NOTE: a name containing 'var_' with no digits after it yields -1 from _var_num
# and is therefore also treated as part of this group.
feats_cadastro = [x for x in X_train_final.columns if 'var_' in x and _var_num(x) <= 25]
feats_cadastro += [c for c in ['STATUSRF', 'UF', 'REGIAO', 'DIAS_VAR_12'] if c in X_train_final.columns]

# Candidate features: everything except the group above and the two key columns.
feats_to_use = [c for c in X_train_final.columns if c not in feats_cadastro and c not in ['NUM_CPF', 'SAFRA']]
X_shap = X_train_final[feats_to_use].copy()

num_shap = [n for n in X_shap.select_dtypes(include=['int32','int64','float32','float64']).columns]
cat_shap = [c for c in X_shap.select_dtypes(include=['object','category']).columns]

# Same preprocessing recipe as the baseline pipeline, restricted to the candidates.
prep_shap = ColumnTransformer([
    ('num', Pipeline([('imputer', SimpleImputer(strategy='median'))]), num_shap),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)),
    ]), cat_shap),
], remainder='drop')

# Fixed hyperparameters here (not the grid-search result).
pipe_shap = Pipeline([
    ('prep', prep_shap),
    ('model', LGBMClassifier(
        objective='binary', boosting_type='gbdt', learning_rate=0.05,
        n_estimators=300, max_depth=7, colsample_bytree=0.8, subsample=0.8,
        random_state=42, n_jobs=-1, verbosity=-1,
    )),
])
pipe_shap.fit(X_shap, y_train_final)

# SHAP values are computed on the preprocessed training matrix
X_shap_transformed = pipe_shap.named_steps['prep'].transform(X_shap)
try:
    transformed_names = pipe_shap.named_steps['prep'].get_feature_names_out()
except Exception:
    # Fallback: assume the transformer emits the numeric columns followed by the
    # categorical ones, one output column per input column.
    transformed_names = num_shap + cat_shap

explainer = shap.TreeExplainer(pipe_shap.named_steps['model'])
shap_values = explainer.shap_values(X_shap_transformed)
# When a list is returned, take its second entry (index 1); otherwise use the array as is.
shap_vals = shap_values[1] if isinstance(shap_values, list) else shap_values

print(f'SHAP values: {shap_vals.shape}')

# Ranking
# Input column order assumed for positional transformed names (see _map_transformed_to_original).
_original_features_ordered = num_shap + cat_shap

def _map_transformed_to_original(name):
    """Translate a transformed column name back to the original feature name.

    Any '<step>__' prefix is removed first. If what remains is an integer that
    is a valid position in `_original_features_ordered`, the feature at that
    position is returned; otherwise the stripped name itself is returned.
    """
    _, separator, tail = name.partition('__')
    stripped = tail if separator else name
    try:
        position = int(stripped)
    except ValueError:
        return stripped
    if 0 <= position < len(_original_features_ordered):
        return _original_features_ordered[position]
    return stripped

# Global importance: mean absolute SHAP value per transformed column, sorted
# from most to least important.
mean_abs_shap = np.abs(shap_vals).mean(axis=0)
df_shap_ranking = pd.DataFrame({
    'feature_transformed': list(transformed_names),
    'mean_abs_shap': mean_abs_shap,
}).sort_values('mean_abs_shap', ascending=False).reset_index(drop=True)

# Original (pre-transformation) feature name for each ranked column.
df_shap_ranking['feature'] = df_shap_ranking['feature_transformed'].apply(_map_transformed_to_original)

def get_book_label(feat):
    """Return the feature-book label for `feat`, decided by its name prefix."""
    prefix_to_book = (
        ('REC_', 'Recarga (REC_)'),
        ('PAG_', 'Pagamento (PAG_)'),
        ('FAT_', 'Faturamento (FAT_)'),
    )
    for prefix, book in prefix_to_book:
        if feat.startswith(prefix):
            return book
    return 'Base (Telco+Score)'

# Book label, share of total importance, running share and 1-based rank.
# The frame is already sorted by importance, so cumsum gives the Pareto curve.
df_shap_ranking['book'] = df_shap_ranking['feature'].apply(get_book_label)
total_shap = df_shap_ranking['mean_abs_shap'].sum()
df_shap_ranking['pct_importance'] = df_shap_ranking['mean_abs_shap'] / total_shap
df_shap_ranking['cumulative_pct'] = df_shap_ranking['pct_importance'].cumsum()
df_shap_ranking['rank'] = range(1, len(df_shap_ranking) + 1)

print(f'\nTop 15 Features (SHAP):')
print(df_shap_ranking[['rank','feature','book','mean_abs_shap','pct_importance']].head(15).to_string(index=False))

In [None]:
# =============================================================================
# 9.2 VISUALIZACOES SHAP + PARETO 90% + EXPORT
# =============================================================================
# Plot colour of each feature book.
BOOK_COLORS = {
    'Base (Telco+Score)': '#607D8B',
    'Recarga (REC_)': '#2196F3',
    'Pagamento (PAG_)': '#FF9800',
    'Faturamento (FAT_)': '#9C27B0',
}
book_names = [*BOOK_COLORS]


def get_feature_color(feat):
    """Return the book colour for `feat`.

    A book matches when the name starts with its prefix or contains the prefix
    right after a '__' separator; anything else gets the base-book colour.
    """
    tagged_books = (
        ('REC_', 'Recarga (REC_)'),
        ('PAG_', 'Pagamento (PAG_)'),
        ('FAT_', 'Faturamento (FAT_)'),
    )
    for tag, book in tagged_books:
        if feat.startswith(tag) or f'__{tag}' in feat:
            return BOOK_COLORS[book]
    return BOOK_COLORS['Base (Telco+Score)']

# SHAP Beeswarm: top 40 transformed features, saved to the v5 output directory
fig, ax = plt.subplots(figsize=(12, 14))
shap.summary_plot(shap_vals, X_shap_transformed,
                  feature_names=list(transformed_names), max_display=40, show=False)
plt.title('SHAP Summary Plot - Top 40 Features (FPD)', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR_V5}/shap_summary_beeswarm.png', dpi=DPI, bbox_inches='tight')
plt.show()

# Top 30 by Book: horizontal bars coloured by feature book
top30 = df_shap_ranking.head(30)
colors_30 = [get_feature_color(f) for f in top30['feature']]
fig, ax = plt.subplots(figsize=(12, 10))
# Descending y positions put the most important feature at the top of the chart.
ax.barh(range(len(top30)-1, -1, -1), top30['mean_abs_shap'].values,
        color=colors_30, alpha=0.85, edgecolor='white', linewidth=0.5)
ax.set_yticks(range(len(top30)-1, -1, -1))
ax.set_yticklabels(top30['feature'].values, fontsize=9)
ax.set_xlabel('mean(|SHAP value|)', fontsize=11)
ax.set_title('Top 30 Features por Importancia SHAP', fontsize=14, fontweight='bold')
legend_elements = [Patch(facecolor=c, label=l) for l, c in BOOK_COLORS.items()]
ax.legend(handles=legend_elements, loc='lower right', fontsize=10)
ax.grid(True, alpha=0.2, axis='x')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR_V5}/shap_top30_by_book.png', dpi=DPI, bbox_inches='tight')
plt.show()

# Pareto 90%: individual importance (bars) and cumulative importance (line)
CUMULATIVE_THRESHOLD = 0.90
fig, ax = plt.subplots(figsize=(14, 6))
x_range = range(1, len(df_shap_ranking) + 1)
colors_cum = [get_feature_color(f) for f in df_shap_ranking['feature']]
ax.bar(x_range, df_shap_ranking['pct_importance'].values, color=colors_cum, alpha=0.7, width=1.0)
ax2_p = ax.twinx()
ax2_p.plot(x_range, df_shap_ranking['cumulative_pct'].values, color='red', linewidth=2)
ax2_p.axhline(y=CUMULATIVE_THRESHOLD, color='red', linestyle='--', alpha=0.5)
# Number of ranked features whose cumulative share is still <= the threshold.
# NOTE(review): the selection step further down also keeps the first feature
# that crosses the threshold, so the count shown here can be one lower than
# the number of features actually selected.
n_90 = (df_shap_ranking['cumulative_pct'] <= CUMULATIVE_THRESHOLD).sum()
ax2_p.axvline(x=n_90, color='green', linestyle='--', alpha=0.7)
ax2_p.annotate(f'{n_90} features = 90%', xy=(n_90, 0.90), xytext=(n_90+20, 0.80),
              arrowprops=dict(arrowstyle='->', color='green'), fontsize=11, fontweight='bold', color='green')
ax.set_xlabel('Feature (rank SHAP)'); ax.set_ylabel('Importancia Individual (%)')
ax2_p.set_ylabel('Cumulativo (%)')
ax.set_title(f'Pareto — {n_90} features capturam 90% importancia SHAP', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR_V5}/shap_pareto_cumulative.png', dpi=DPI, bbox_inches='tight')
plt.show()

# Select features: every ranked feature up to the cumulative threshold, plus
# the first one that crosses it (so the selection reaches at least the threshold).
selected_mask = df_shap_ranking['cumulative_pct'] <= CUMULATIVE_THRESHOLD
if not selected_mask.all():
    # The ranking has a 0..n-1 index, so this label doubles as a position for iloc.
    first_over = selected_mask[~selected_mask].index[0]
    selected_mask.iloc[:first_over + 1] = True
final_set_features = df_shap_ranking[selected_mask]['feature'].unique().tolist()
print(f'Features selecionadas (SHAP >= 90%): {len(final_set_features)}')

# Export SHAP artifacts to a local scratch directory before logging them
shap_dir = '/tmp/shap_artifacts'
os.makedirs(shap_dir, exist_ok=True)
df_shap_ranking.to_csv(f'{shap_dir}/shap_feature_ranking.csv', index=False)
with open(f'{shap_dir}/selected_features_shap.pkl', 'wb') as f:
    pickle.dump(final_set_features, f)

# Dedicated MLflow run holding the feature-selection outputs.
with mlflow.start_run(run_name='SHAP_Feature_Selection_v5'):
    mlflow.set_tag('task', 'feature_selection')
    mlflow.log_param('n_features_total', len(feats_to_use))
    mlflow.log_param('n_features_selected', len(final_set_features))
    # Figures are logged only if the file exists on disk.
    for fig_path in [f'{OUTPUT_DIR_V5}/shap_summary_beeswarm.png',
                     f'{OUTPUT_DIR_V5}/shap_top30_by_book.png',
                     f'{OUTPUT_DIR_V5}/shap_pareto_cumulative.png']:
        if os.path.exists(fig_path):
            mlflow.log_artifact(fig_path, 'shap_plots')
    mlflow.log_artifact(f'{shap_dir}/shap_feature_ranking.csv', 'feature_selection')
    mlflow.log_artifact(f'{shap_dir}/selected_features_shap.pkl', 'feature_selection')

## 10. Modelo Final com Features Selecionadas

In [None]:
# =============================================================================
# 10. FINAL LGBM FIT WITH SHAP FEATURES + EVALUATION + MLFLOW
# =============================================================================
# The key columns are kept in the frames (needed for per-SAFRA filtering) but
# are excluded from the feature lists below.
dimensions = ['NUM_CPF', 'SAFRA']
final_features_with_dims = final_set_features + [d for d in dimensions if d not in final_set_features]

X_tr_final_fs = X_train_final[final_features_with_dims]
X_oos_fs = X_oos_agg[final_features_with_dims]
X_oot_fs = X_oot_agg[final_features_with_dims]

nf_final = [n for n in X_tr_final_fs.select_dtypes(include=['int32','int64','float32','float64']).columns if n != 'SAFRA']
cf_final = [c for c in X_tr_final_fs.select_dtypes(include=['object','category']).columns if c != 'NUM_CPF']

# Same preprocessing recipe as the baseline, restricted to the selected features.
prep_final = ColumnTransformer([
    ('num', Pipeline([('imputer', SimpleImputer(strategy='median'))]), nf_final),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)),
    ]), cf_final),
], remainder='drop')

# max_depth and n_estimators are reused from the baseline grid search.
pipeline_LGBM_final = Pipeline([
    ('prep', prep_final),
    ('model', LGBMClassifier(
        objective='binary', boosting_type='gbdt', learning_rate=0.05,
        max_depth=grid_LGBM.best_params_['model__max_depth'],
        n_estimators=grid_LGBM.best_params_['model__n_estimators'],
        colsample_bytree=0.8, random_state=42, n_jobs=-1, verbosity=-1,
    )),
])

# One record per evaluated split; turned into df_results_fs after the run.
list_results_fs = []

with mlflow.start_run(run_name='Final_LGBM_SHAP_v5') as run_final:
    pipeline_LGBM_final.fit(X_tr_final_fs, y_train_final)

    mlflow.log_param('model_type', 'LightGBM')
    mlflow.log_param('n_features', len(final_set_features))
    mlflow.log_param('feature_selection', 'SHAP_TreeExplainer_90pct')
    model_params = pipeline_LGBM_final.named_steps['model'].get_params()
    # Only scalar parameters are logged.
    for k, v in model_params.items():
        if isinstance(v, (int, float, str, bool)):
            mlflow.log_param(f'model__{k}', v)

    print(f"{'='*60}")
    print(f'MODELO FINAL: LGBM ({len(final_set_features)} features SHAP)')
    print(f"{'='*60}")

    map_data = generate_map_step_data(X_tr_final_fs, y_train_final, X_oos_fs, y_oos_agg, X_oot_fs, y_oot_agg)
    # AUC / KS per split label, logged to MLflow and collected for the summary frame.
    for key in dict_safras:
        X_f, y_f = filter_xy_by_safra(map_data[key]['X'], map_data[key]['Y'], dict_safras[key])
        auc, ks = evaluation_auc_ks(X_f, y_f, pipeline_LGBM_final, key)
        safe_key = _sanitize_mlflow_key(key)
        mlflow.log_metric(f'AUC_{safe_key}', auc)
        mlflow.log_metric(f'KS_{safe_key}', ks)
        list_results_fs.append({'MODEL': 'LGBM', 'BASE': key, 'AUC': auc, 'KS': ks})

    mlflow.sklearn.log_model(pipeline_LGBM_final, 'model_final_lgbm_shap')
    run_final_id = run_final.info.run_id
    print(f'\nMLflow Run ID: {run_final_id}')

df_results_fs = pd.DataFrame(list_results_fs)
best_model_pipeline = pipeline_LGBM_final
best_model_name = 'LGBM'
# KS on the consolidated OOT split; falls back to 0 when that row is absent.
ks_oot_row = df_results_fs[df_results_fs['BASE'] == 'OOT GERAL (CONS)']
best_ks_oot = ks_oot_row['KS'].values[0] if len(ks_oot_row) > 0 else 0

print(f'\nLGBM Final: KS OOT = {best_ks_oot:.5f}')

## 11. Swap Analysis — Modelo vs FPD Real

Compara o ranking do modelo (P(FPD=1)) com o ranking real (flag FPD da base score_bureau_movel_full) na **mesma populacao**.

**Definicoes**:
- **Ranking Oracle**: FPD=1 primeiro, depois FPD=0 (desempate por score modelo)
- **Ranking Modelo**: Ordenar por probabilidade P(FPD=1) descendente
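- **Swap-in**: No top X% do MODELO mas NAO no top X% do ORACLE (falsos alarmes)
- **Swap-out**: No top X% do ORACLE mas NAO no top X% do MODELO (maus que escapam)
- **Overlap**: No top X% de AMBOS (acertos do modelo)
- **Atencao**: os dois tops tem o mesmo tamanho (n_top), logo swap-in = swap-out por construcao nesta definicao; o total de maus que escapam do modelo e dado por (total de FPD=1) - (FPD=1 capturados no top X%), ou seja, o complemento do Capture Rate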
- **Capture Rate**: % de todos os FPD=1 capturados no top X% do modelo

In [None]:
# =============================================================================
# 11. CORRECTED SWAP ANALYSIS — MODEL vs REAL FPD (same population)
# =============================================================================
# v5 fixes the v4 bug:
#   v4: compared OOT1 vs OOT2 (DIFFERENT populations) → swap-in = swap-out always
#   v5: compares the MODEL ranking vs the ORACLE ranking (real FPD) on the SAME population
#
# NOTE(review): the model top-N and the oracle top-N both contain exactly n_top
# rows, so len(swap_in) == len(swap_out) is still a mathematical identity here.
# The existing keys are kept unchanged for downstream plots/CSV; 'false_alarms'
# (non-bads inside the model top) and 'missed_bads' (bads outside the model top)
# are added because they follow the definition given in the notebook header and
# are not tied to each other.

print(f'Modelo: {best_model_name}')
print('=' * 60)

scores_oot = best_model_pipeline.predict_proba(X_oot_fs)[:, 1]
df_swap = pd.DataFrame({
    'SAFRA': X_oot_fs['SAFRA'].values,
    'score': scores_oot,
    'FPD': y_oot_agg.values,
})

total_bad = (df_swap['FPD'] == 1).sum()
total_pop = len(df_swap)
print(f'Populacao OOT: {total_pop:,} | Maus (FPD=1): {total_bad:,} ({total_bad/total_pop:.2%})')

# Oracle ordering key: every FPD=1 row outranks every FPD=0 row (the 1e6 offset
# dominates the score), ties inside each class are broken by the model score.
# It does not depend on the cutoff, so it is built once, and as a standalone
# Series so df_swap is never mutated (no cleanup needed if SWAP_CUTOFFS is empty).
oracle_rank = df_swap['FPD'] * 1e6 + df_swap['score']


def _pct_of_top(count, n_top):
    """Return `count` as a percentage of `n_top`, or 0.0 when the top set is empty."""
    return count / n_top * 100 if n_top > 0 else 0.0


swap_results = []
for cutoff in SWAP_CUTOFFS:
    n_top = int(total_pop * cutoff)

    model_top_idx = df_swap.nlargest(n_top, 'score').index
    oracle_top_idx = oracle_rank.nlargest(n_top).index

    overlap_idx = model_top_idx.intersection(oracle_top_idx)
    swap_in_idx = model_top_idx.difference(oracle_top_idx)
    swap_out_idx = oracle_top_idx.difference(model_top_idx)

    top_is_bad = df_swap.loc[model_top_idx, 'FPD'] == 1
    bad_captured = top_is_bad.sum()
    capture_rate = bad_captured / total_bad if total_bad > 0 else 0
    default_rate_model = df_swap.loc[model_top_idx, 'FPD'].mean()
    default_rate_oracle = df_swap.loc[oracle_top_idx, 'FPD'].mean()

    result = {
        'cutoff': f'{cutoff:.0%}',
        'n_top': n_top,
        'overlap': len(overlap_idx),
        'swap_in': len(swap_in_idx),
        'swap_out': len(swap_out_idx),
        'overlap_pct': _pct_of_top(len(overlap_idx), n_top),
        'swap_in_pct': _pct_of_top(len(swap_in_idx), n_top),
        'swap_out_pct': _pct_of_top(len(swap_out_idx), n_top),
        'capture_rate': capture_rate * 100,
        'default_rate_model': default_rate_model * 100,
        'default_rate_oracle': default_rate_oracle * 100,
        # Header-definition counts (independent of the oracle top set).
        'false_alarms': int((~top_is_bad).sum()),
        'missed_bads': int(total_bad - bad_captured),
    }
    swap_results.append(result)

    print(f"\nTop {cutoff:.0%} (n={n_top:,}):")
    print(f"  Overlap (acertos):      {result['overlap']:>6,} ({result['overlap_pct']:.1f}%)")
    print(f"  Swap-in (falso alarme): {result['swap_in']:>6,} ({result['swap_in_pct']:.1f}%)")
    print(f"  Swap-out (escapados):   {result['swap_out']:>6,} ({result['swap_out_pct']:.1f}%)")
    print(f"  Bons no top do modelo:  {result['false_alarms']:>6,}")
    print(f"  Maus fora do top:       {result['missed_bads']:>6,}")
    print(f"  Capture Rate:           {result['capture_rate']:.1f}%")
    print(f"  Default Rate (modelo):  {result['default_rate_model']:.1f}%")
    print(f"  Default Rate (oracle):  {result['default_rate_oracle']:.1f}%")

df_swap_results = pd.DataFrame(swap_results)

print(f'\n=== Resumo Swap Analysis ===')
print(df_swap_results[['cutoff','n_top','overlap_pct','swap_in_pct','swap_out_pct','capture_rate']].to_string(index=False))


## 12. Analise por Decis + Capture Rate + PSI

- **Decis**: Divide populacao em 10 faixas de score, analisa bad rate e lift por faixa
- **Capture Rate**: % de maus (FPD=1) capturados em cada corte cumulativo
- **PSI**: Mede estabilidade da distribuicao de scores entre safras
  - PSI < 0.10: Estavel | 0.10-0.25: Atencao | > 0.25: Critico

In [None]:
# =============================================================================
# 12. ANALISE POR DECIS + PSI
# =============================================================================

def decile_analysis(y_true, y_score, n_bins=10):
    """Summarise bad rate, capture and lift per score quantile band.

    Rows are binned with `pd.qcut` on the score and the band labels are flipped
    so that `decil == 1` holds the highest scores. Duplicate quantile edges are
    dropped, so fewer than `n_bins` bands may be returned.

    Args:
        y_true: Binary target values (1 = bad), one per row.
        y_score: Model scores aligned with `y_true`.
        n_bins: Number of quantile bands requested.

    Returns:
        DataFrame with one row per band, ordered by `decil`, with columns
        decil, n, n_bad, score_min, score_max, score_mean, n_good, bad_rate,
        pct_pop, pct_bad, cum_bad_pct, cum_pop_pct and lift. Percent columns
        are on a 0-100 scale; lift is the band bad rate over the overall one.
    """
    scored = (
        pd.DataFrame({'y': y_true, 'score': y_score})
        .sort_values('score', ascending=False)
        .reset_index(drop=True)
    )

    # qcut numbers bands from the lowest scores upwards; invert so 1 = riskiest.
    band = pd.qcut(scored['score'], n_bins, labels=False, duplicates='drop')
    scored['decil'] = band.max() - band + 1

    result = (
        scored.groupby('decil')
        .agg(
            n=('y', 'count'),
            n_bad=('y', 'sum'),
            score_min=('score', 'min'),
            score_max=('score', 'max'),
            score_mean=('score', 'mean'),
        )
        .sort_index()
        .reset_index()
    )

    total_n = result['n'].sum()
    total_bad = result['n_bad'].sum()

    result['n_good'] = result['n'] - result['n_bad']
    result['bad_rate'] = result['n_bad'] / result['n'] * 100
    result['pct_pop'] = result['n'] / total_n * 100
    result['pct_bad'] = result['n_bad'] / total_bad * 100
    result['cum_bad_pct'] = result['pct_bad'].cumsum()
    result['cum_pop_pct'] = result['pct_pop'].cumsum()

    overall_bad_rate = total_bad / total_n * 100
    result['lift'] = result['bad_rate'] / overall_bad_rate
    return result

def calc_psi(expected, actual, bins=10):
    """Population Stability Index between two score samples.

    Both samples are histogrammed on `bins` equal-width intervals spanning
    [0, 1]; empty intervals are floored at 0.0001 so the logarithm is defined.

    NOTE(review): values outside [0, 1] fall outside every interval but still
    count in the denominator — callers appear to pass probabilities; confirm.

    Args:
        expected: Reference scores.
        actual: Scores to compare against the reference.
        bins: Number of equal-width intervals over [0, 1].

    Returns:
        The PSI as a Python float: sum((act - exp) * ln(act / exp)).
    """
    edges = np.linspace(0, 1, bins + 1)

    def _bin_shares(values):
        # Share of the sample in each interval, with empty intervals floored.
        counts, _ = np.histogram(values, bins=edges)
        shares = counts / len(values)
        return np.where(shares == 0, 0.0001, shares)

    exp_pct = _bin_shares(expected)
    act_pct = _bin_shares(actual)
    contributions = (act_pct - exp_pct) * np.log(act_pct / exp_pct)
    return float(np.sum(contributions))

# Consolidated OOT deciles
print('=== Analise por Decis — OOT Consolidado ===')
dec_oot = decile_analysis(y_oot_agg.values, scores_oot)
oot_decile_cols = ['decil','n','n_bad','bad_rate','cum_bad_pct','lift','score_min','score_max']
print(dec_oot[oot_decile_cols].to_string(index=False))

# Bad rate should not increase when moving from decile 1 (worst) to the last one.
bad_rates = dec_oot['bad_rate'].values
is_monotonic = bool(np.all(bad_rates[:-1] >= bad_rates[1:]))
monotonic_label = "SIM" if is_monotonic else "NAO"
print(f'\nMonotonia bad_rate: {monotonic_label} (decil 1 deve ter maior bad rate)')

# Deciles per SAFRA
safras_oot_list = sorted(df_swap['SAFRA'].unique())
dec_by_safra = {}
for safra in safras_oot_list:
    safra_rows = df_swap[df_swap['SAFRA'] == safra]
    dec = decile_analysis(safra_rows['FPD'].values, safra_rows['score'].values)
    dec_by_safra[safra] = dec
    print(f'\n--- Decis SAFRA {safra} ---')
    print(dec[['decil','n','n_bad','bad_rate','cum_bad_pct','lift']].to_string(index=False))

# PSI
scores_train = best_model_pipeline.predict_proba(X_tr_final_fs)[:, 1]

print(f'\n=== PSI (Population Stability Index) ===')
psi_results = []


def _psi_status(psi):
    """Map a PSI value to its band label (< 0.10 OK, < 0.25 ATENCAO, else CRITICO)."""
    if psi < 0.1:
        return 'OK'
    if psi < 0.25:
        return 'ATENCAO'
    return 'CRITICO'


def _report_psi(label, psi):
    """Print one PSI line and append the matching record to psi_results."""
    status = _psi_status(psi)
    print(f'PSI {label}: {psi:.4f} [{status}]')
    psi_results.append({'comparacao': label, 'psi': psi, 'status': status})


def _scores_of(safra):
    """Scores of the OOT rows that belong to `safra`."""
    return df_swap.loc[df_swap['SAFRA'] == safra, 'score'].values


psi_train_oot = calc_psi(scores_train, scores_oot)
_report_psi('Train vs OOT', psi_train_oot)

for safra in safras_oot_list:
    _report_psi(f'Train vs OOT-{safra}', calc_psi(scores_train, _scores_of(safra)))

# Drift between the first two OOT safras (only those two are compared).
if len(safras_oot_list) >= 2:
    _report_psi(
        'OOT1 vs OOT2',
        calc_psi(_scores_of(safras_oot_list[0]), _scores_of(safras_oot_list[1])),
    )

df_psi = pd.DataFrame(psi_results)


## 13. Visualizacoes Avancadas (16 graficos)

**Performance**: KS Curve, ROC Curve, Precision-Recall, Score Distribution
**Ranking**: Confusion Matrix, Decile Bad Rate, Decile Lift, Cumulative Bad %
**Estabilidade**: KS por Base, Score por SAFRA, PSI, Swap Stacked Bar
**Analise**: Capture Rate, AUC por Split, Feature Importance, Calibracao

In [None]:
# =============================================================================
# 13.1 VISUALIZATIONS — PERFORMANCE (8 plots)
# =============================================================================
# Draws a 2x4 grid of OOT performance plots from y_oot_agg / scores_oot / dec_oot
# and saves it under OUTPUT_DIR_V5. ks_max_val and auc_val computed here are
# reused by the MLflow logging cell further down.
fig, axes = plt.subplots(2, 4, figsize=(28, 12))
fig.suptitle(f'Modelo {best_model_name} — Performance OOT', fontsize=16, fontweight='bold')

# 1. KS Curve
# Rows are sorted by ascending score; KS is the largest absolute gap between the
# cumulative share of bads and the cumulative share of goods.
ax = axes[0, 0]
df_ks = pd.DataFrame({'y': y_oot_agg.values, 'score': scores_oot}).sort_values('score')
df_ks['cum_good'] = (1 - df_ks['y']).cumsum() / (1 - df_ks['y']).sum()
df_ks['cum_bad'] = df_ks['y'].cumsum() / df_ks['y'].sum()
df_ks['ks_diff'] = np.abs(df_ks['cum_bad'] - df_ks['cum_good'])
ks_max_idx = df_ks['ks_diff'].idxmax()
ks_max_val = df_ks.loc[ks_max_idx, 'ks_diff']
x_pct = np.linspace(0, 1, len(df_ks))
ax.plot(x_pct, df_ks['cum_good'].values, label='Bons', color=COLORS['blue'])
ax.plot(x_pct, df_ks['cum_bad'].values, label='Maus', color=COLORS['red'])
# idxmax returns an index label; get_loc converts it to the sorted position.
ks_x = x_pct[df_ks.index.get_loc(ks_max_idx)]
ax.axvline(x=ks_x, color=COLORS['green'], linestyle='--', alpha=0.7)
ax.annotate(f'KS={ks_max_val:.4f}', xy=(ks_x, 0.5), fontsize=10, fontweight='bold', color=COLORS['green'])
ax.set_title('1. KS Curve', fontweight='bold'); ax.set_xlabel('Pop (%)'); ax.set_ylabel('CDF')
ax.legend(fontsize=8); ax.grid(True, alpha=0.3)

# 2. ROC Curve
ax = axes[0, 1]
fpr, tpr, _ = roc_curve(y_oot_agg, scores_oot)
auc_val = roc_auc_score(y_oot_agg, scores_oot)
ax.plot(fpr, tpr, color=COLORS['blue'], linewidth=2, label=f'AUC={auc_val:.4f}')
ax.plot([0,1],[0,1],'k--',alpha=0.3); ax.fill_between(fpr, tpr, alpha=0.1, color=COLORS['blue'])
ax.set_title('2. ROC Curve', fontweight='bold'); ax.set_xlabel('FPR'); ax.set_ylabel('TPR')
ax.legend(fontsize=10); ax.grid(True, alpha=0.3)

# 3. Precision-Recall
# The dashed horizontal line is the OOT bad rate (mean of the target).
ax = axes[0, 2]
prec, rec, _ = precision_recall_curve(y_oot_agg, scores_oot)
ax.plot(rec, prec, color=COLORS['orange'], linewidth=2)
ax.fill_between(rec, prec, alpha=0.1, color=COLORS['orange'])
baseline = y_oot_agg.mean()
ax.axhline(y=baseline, color='gray', linestyle='--', alpha=0.5, label=f'Base={baseline:.3f}')
ax.set_title('3. Precision-Recall', fontweight='bold'); ax.set_xlabel('Recall'); ax.set_ylabel('Precision')
ax.legend(fontsize=8); ax.grid(True, alpha=0.3)

# 4. Score Distribution
# Density-normalised histograms of the score, split by target value.
ax = axes[0, 3]
ax.hist(scores_oot[y_oot_agg.values==0], bins=50, alpha=0.6, label='Bons', color=COLORS['green'], density=True)
ax.hist(scores_oot[y_oot_agg.values==1], bins=50, alpha=0.6, label='Maus', color=COLORS['red'], density=True)
ax.set_title('4. Score Distribution', fontweight='bold'); ax.set_xlabel('P(FPD=1)'); ax.set_ylabel('Dens.')
ax.legend(fontsize=8); ax.grid(True, alpha=0.3)

# 5. Confusion Matrix
# Uses a fixed 0.5 threshold on the score; the cell loops below assume a 2x2 matrix.
ax = axes[1, 0]
y_pred = (scores_oot >= 0.5).astype(int)
cm = confusion_matrix(y_oot_agg, y_pred)
ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title('5. Confusion Matrix (t=0.5)', fontweight='bold')
ax.set_ylabel('Real'); ax.set_xlabel('Predito')
ax.set_xticks([0,1]); ax.set_yticks([0,1])
ax.set_xticklabels(['Bom','Mau']); ax.set_yticklabels(['Bom','Mau'])
for i in range(2):
    for j in range(2):
        # White text on dark cells, black on light ones.
        ax.text(j, i, f'{cm[i,j]:,}', ha='center', va='center',
                color='white' if cm[i,j] > cm.max()/2 else 'black', fontsize=11)

# 6. Decile Bad Rate
# The dashed line is the overall bad rate recomputed from the decile table.
ax = axes[1, 1]
bars = ax.bar(dec_oot['decil'], dec_oot['bad_rate'], color=COLORS['red'], alpha=0.8, edgecolor='white')
avg_br = dec_oot['n_bad'].sum() / dec_oot['n'].sum() * 100
ax.axhline(y=avg_br, color='gray', linestyle='--', alpha=0.7, label=f'Media={avg_br:.1f}%')
ax.set_title('6. Bad Rate por Decil', fontweight='bold'); ax.set_xlabel('Decil (1=pior)')
ax.set_ylabel('Bad Rate (%)'); ax.legend(fontsize=8); ax.grid(True, alpha=0.3, axis='y')
for bar, val in zip(bars, dec_oot['bad_rate']):
    ax.text(bar.get_x()+bar.get_width()/2, val+0.3, f'{val:.1f}%', ha='center', fontsize=7)

# 7. Decile Lift
ax = axes[1, 2]
ax.bar(dec_oot['decil'], dec_oot['lift'], color=COLORS['purple'], alpha=0.8, edgecolor='white')
ax.axhline(y=1.0, color='gray', linestyle='--', alpha=0.7, label='Lift=1')
ax.set_title('7. Lift por Decil', fontweight='bold'); ax.set_xlabel('Decil (1=pior)')
ax.set_ylabel('Lift'); ax.legend(fontsize=8); ax.grid(True, alpha=0.3, axis='y')

# 8. Cumulative Bad %
# Cumulative share of bads vs cumulative share of population, at decile granularity;
# the shaded area is the gap to the diagonal (random ordering).
ax = axes[1, 3]
ax.plot(dec_oot['cum_pop_pct'], dec_oot['cum_bad_pct'], 'ro-', linewidth=2, markersize=6, label='Modelo')
ax.plot([0,100],[0,100],'k--',alpha=0.3, label='Aleatorio')
ax.fill_between(dec_oot['cum_pop_pct'], dec_oot['cum_bad_pct'], dec_oot['cum_pop_pct'],
                alpha=0.1, color=COLORS['red'])
ax.set_title('8. Curva de Captura', fontweight='bold'); ax.set_xlabel('% Pop')
ax.set_ylabel('% Maus Capturados'); ax.legend(fontsize=8); ax.grid(True, alpha=0.3)

# Leave room at the top for the suptitle, then persist the figure before showing it.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig(f'{OUTPUT_DIR_V5}/viz_performance_8plots.png', dpi=DPI, bbox_inches='tight')
plt.show()
print('Graficos 1-8 salvos')


In [None]:
# =============================================================================
# 13.2 VISUALIZATIONS — STABILITY + SWAP + SUMMARY (8 plots)
# =============================================================================
# Opens the second 2x4 figure; the plots below fill its axes one by one.
fig, axes = plt.subplots(2, 4, figsize=(28, 12))
fig.suptitle(f'Modelo {best_model_name} — Estabilidade, Swap & Resumo', fontsize=16, fontweight='bold')

# 9. KS per base
# Keeps only result rows whose BASE name contains 'OOS' or 'OOT' (regex match);
# the dashed line marks a KS of 0.20.
ax = axes[0, 0]
oos_oot_res = df_results_fs[df_results_fs['BASE'].str.contains('OOS|OOT')]
bars = ax.bar(range(len(oos_oot_res)), oos_oot_res['KS'].values, color=COLORS['blue'], edgecolor='white')
ax.set_xticks(range(len(oos_oot_res)))
ax.set_xticklabels(oos_oot_res['BASE'].values, rotation=45, ha='right', fontsize=7)
ax.axhline(y=0.20, color='gray', linestyle='--', alpha=0.5, label='Min KS=0.20')
ax.set_title('9. KS por Base', fontweight='bold'); ax.set_ylabel('KS')
ax.legend(fontsize=8); ax.grid(True, alpha=0.3, axis='y')
for bar, val in zip(bars, oos_oot_res['KS'].values):
    ax.text(bar.get_x()+bar.get_width()/2, val+0.003, f'{val:.3f}', ha='center', fontsize=7)

# 10. Score per SAFRA (boxplot)
# One box per OOT safra. Tick labels are set through the axis instead of the
# boxplot `labels=` keyword (deprecated in Matplotlib 3.9 in favour of
# `tick_labels`), which works on both older and newer releases.
ax = axes[0, 1]
safra_scores = [df_swap.loc[df_swap['SAFRA']==s, 'score'].values for s in safras_oot_list]
bp = ax.boxplot(safra_scores, patch_artist=True)
ax.set_xticklabels([str(s) for s in safras_oot_list])
# Cycle the palette so every box is styled, even with more than two safras.
box_palette = [COLORS['blue'], COLORS['orange']]
for box_pos, patch in enumerate(bp['boxes']):
    patch.set_facecolor(box_palette[box_pos % len(box_palette)])
    patch.set_alpha(0.6)
ax.set_title('10. Score por SAFRA OOT', fontweight='bold')
ax.set_xlabel('SAFRA'); ax.set_ylabel('Score'); ax.grid(True, alpha=0.3, axis='y')

# 11. PSI Bar Chart
# One horizontal bar per PSI comparison, coloured by band
# (< 0.10 green, < 0.25 orange, otherwise red); dashed lines mark both thresholds.
ax = axes[0, 2]
psi_colors = [COLORS['green'] if r['psi']<0.1 else COLORS['orange'] if r['psi']<0.25 else COLORS['red']
              for r in psi_results]
ax.barh(range(len(psi_results)), [r['psi'] for r in psi_results], color=psi_colors, edgecolor='white')
ax.set_yticks(range(len(psi_results)))
ax.set_yticklabels([r['comparacao'] for r in psi_results], fontsize=8)
ax.axvline(x=0.10, color='orange', linestyle='--', alpha=0.5, label='Atencao')
ax.axvline(x=0.25, color='red', linestyle='--', alpha=0.5, label='Critico')
ax.set_title('11. PSI', fontweight='bold'); ax.set_xlabel('PSI')
ax.legend(fontsize=7); ax.grid(True, alpha=0.3, axis='x')

# 12. Swap Stacked Bar
# Stacks overlap, swap-in and swap-out percentages per cutoff.
# NOTE(review): overlap and swap-in together make up the whole model top, so the
# stack reaches 100% plus swap_out_pct — the y-axis is not a share that sums to 100.
ax = axes[0, 3]
x_sw = range(len(df_swap_results))
ax.bar(x_sw, df_swap_results['overlap_pct'], label='Overlap', color=COLORS['green'], alpha=0.8)
ax.bar(x_sw, df_swap_results['swap_in_pct'], bottom=df_swap_results['overlap_pct'],
       label='Swap-in', color=COLORS['orange'], alpha=0.8)
ax.bar(x_sw, df_swap_results['swap_out_pct'],
       bottom=df_swap_results['overlap_pct']+df_swap_results['swap_in_pct'],
       label='Swap-out', color=COLORS['red'], alpha=0.8)
ax.set_xticks(x_sw); ax.set_xticklabels(df_swap_results['cutoff'], fontsize=9)
ax.set_title('12. Swap Analysis', fontweight='bold'); ax.set_xlabel('Top %')
ax.set_ylabel('% do Top'); ax.legend(fontsize=7); ax.grid(True, alpha=0.3, axis='y')

# 13. Capture Rate Curve
# Share of all bads (FPD=1) found in the top p% of the population by score,
# for p = 1..100.
ax = axes[1, 0]
pct_points = np.arange(1, 101)            # integer percent grid, no float drift
pcts = pct_points / 100.0

# One stable descending sort plus a cumulative sum replaces one nlargest() pass
# per percent; cum_bad[k] is the number of bads among the k highest scores
# (ties keep their original row order, matching nlargest's keep='first').
bad_by_rank = (df_swap.sort_values('score', ascending=False, kind='stable')['FPD'] == 1).to_numpy()
cum_bad = np.concatenate(([0], np.cumsum(bad_by_rank)))


def _capture_pct(n_rows):
    """Percent of all bads contained in the `n_rows` highest-scored rows."""
    return cum_bad[n_rows] / total_bad * 100 if total_bad > 0 else 0.0


# Integer floor of total_pop * p% — same row counts as int(total_pop * p) without
# float truncation surprises.
capture_rates = [_capture_pct(n) for n in (total_pop * pct_points) // 100]

ax.plot(pcts*100, capture_rates, color=COLORS['red'], linewidth=2, label='Modelo')
ax.plot([0,100],[0,100],'k--',alpha=0.3, label='Aleatorio')
ax.fill_between(pcts*100, capture_rates, pcts*100, alpha=0.1, color=COLORS['red'])
for cutoff in SWAP_CUTOFFS:
    # Evaluate the marker at the cutoff's own row count rather than looking it up
    # by list position: int(cutoff*100)-1 becomes -1 (i.e. the 100% point) for
    # cutoffs below 1% and is off by one for values such as 0.29 or 0.57.
    rate_at_cutoff = _capture_pct(int(total_pop * cutoff))
    ax.plot(cutoff*100, rate_at_cutoff, 'go', markersize=8)
    ax.annotate(f'{rate_at_cutoff:.0f}%', xy=(cutoff*100, rate_at_cutoff),
               xytext=(5,5), textcoords='offset points', fontsize=8, color=COLORS['green'])
ax.set_title('13. Capture Rate', fontweight='bold'); ax.set_xlabel('% Pop Avaliada')
ax.set_ylabel('% Maus Capturados'); ax.legend(fontsize=8); ax.grid(True, alpha=0.3)

# 14. AUC per split
# Averages AUC over the result rows of each split family. Rows whose BASE name
# contains 'CONS' are left out; the first matching family (TREINO, OOS, OOT) wins.
ax = axes[1, 1]
groups = {'TREINO': [], 'OOS': [], 'OOT': []}
for _, row in df_results_fs.iterrows():
    if 'TREINO' in row['BASE'] and 'CONS' not in row['BASE']:
        groups['TREINO'].append(row['AUC'])
    elif 'OOS' in row['BASE'] and 'CONS' not in row['BASE']:
        groups['OOS'].append(row['AUC'])
    elif 'OOT' in row['BASE'] and 'CONS' not in row['BASE']:
        groups['OOT'].append(row['AUC'])
# A family with no rows is plotted as 0.
gm = {k: np.mean(v) if v else 0 for k, v in groups.items()}
bars = ax.bar(gm.keys(), gm.values(), color=[COLORS['blue'],COLORS['orange'],COLORS['red']],
              alpha=0.8, edgecolor='white')
ax.axhline(y=0.65, color='gray', linestyle='--', alpha=0.5, label='Min AUC')
ax.set_title('14. AUC Medio por Split', fontweight='bold'); ax.set_ylabel('AUC')
ax.legend(fontsize=8); ax.grid(True, alpha=0.3, axis='y')
for bar, val in zip(bars, gm.values()):
    ax.text(bar.get_x()+bar.get_width()/2, val+0.002, f'{val:.4f}', ha='center', fontsize=9)

# 15. Feature Importance (top 20)
ax = axes[1, 2]
lgbm_final = best_model_pipeline.named_steps['model']
try:
    feat_names_final = best_model_pipeline.named_steps['prep'].get_feature_names_out()
except Exception:
    # Fallback when the preprocessing step cannot report output names.
    # NOTE(review): assumes nf_final + cf_final matches the transformed column order — confirm.
    feat_names_final = nf_final + cf_final
df_imp = pd.DataFrame({'feature': feat_names_final, 'importance': lgbm_final.feature_importances_})
df_imp = df_imp.sort_values('importance', ascending=False).head(20)
# Names of the form '<prefix>__<name>' are reduced to the part after the last '__' before colouring.
colors_imp = [get_feature_color(f.split('__')[-1] if '__' in f else f) for f in df_imp['feature']]
# Reversed y positions put the most important feature at the top of the chart.
ax.barh(range(len(df_imp)-1,-1,-1), df_imp['importance'].values, color=colors_imp, alpha=0.85)
ax.set_yticks(range(len(df_imp)-1,-1,-1))
ax.set_yticklabels(df_imp['feature'].values, fontsize=7)
ax.set_title('15. Top 20 Feature Importance', fontweight='bold')
ax.set_xlabel('Importance'); ax.grid(True, alpha=0.3, axis='x')

# 16. Calibration
# Mean predicted score vs observed bad rate inside up to 10 score-quantile bins;
# the dashed diagonal is the perfectly calibrated reference.
ax = axes[1, 3]
df_cal = pd.DataFrame({'y': y_oot_agg.values, 'score': scores_oot})
df_cal['bin'] = pd.qcut(df_cal['score'], 10, duplicates='drop')
cal = df_cal.groupby('bin', observed=True).agg(pred=('score','mean'), obs=('y','mean')).reset_index()
ax.plot(cal['pred'], cal['obs'], 'bo-', markersize=8, label='Modelo')
ax.plot([0, cal['pred'].max()], [0, cal['pred'].max()], 'k--', alpha=0.3, label='Perfeito')
ax.set_title('16. Calibracao', fontweight='bold'); ax.set_xlabel('Score Predito')
ax.set_ylabel('Bad Rate Obs'); ax.legend(fontsize=8); ax.grid(True, alpha=0.3)

# Leave room at the top for the suptitle, then persist the figure before showing it.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig(f'{OUTPUT_DIR_V5}/viz_stability_swap_8plots.png', dpi=DPI, bbox_inches='tight')
plt.show()
print('Graficos 9-16 salvos')


In [None]:
# =============================================================================
# 13.3 LOG VISUALIZATIONS TO MLFLOW
# =============================================================================
import glob as glob_mod

with mlflow.start_run(run_name='Final_Visualizations_v5') as run_viz:
    for tag_key, tag_value in (('task', 'visualization'), ('version', 'v5')):
        mlflow.set_tag(tag_key, tag_value)

    # Every PNG currently in the output directory is attached to the run.
    png_pattern = f'{OUTPUT_DIR_V5}/*.png'
    for fig_path in glob_mod.glob(png_pattern):
        mlflow.log_artifact(fig_path, 'plots_v5')

    # Result tables: write each one as CSV, then attach it under its own folder.
    table_exports = (
        (df_swap_results, 'swap_analysis_results.csv', 'swap_analysis'),
        (dec_oot, 'decile_analysis_oot.csv', 'decile_analysis'),
        (df_psi, 'psi_results.csv', 'psi'),
    )
    for table, file_name, artifact_dir in table_exports:
        csv_path = f'{OUTPUT_DIR_V5}/{file_name}'
        table.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path, artifact_dir)

    # Headline OOT metrics; KS and AUC come from the performance plotting cell.
    headline_metrics = (
        ('ks_oot', ks_max_val),
        ('auc_oot', auc_val),
        ('gini_oot', (2 * auc_val - 1) * 100),
        ('psi_train_vs_oot', psi_train_oot),
    )
    for metric_name, metric_value in headline_metrics:
        mlflow.log_metric(metric_name, metric_value)

    print(f'MLflow Run ID (viz): {run_viz.info.run_id}')
    print(f'Artifacts: {OUTPUT_DIR_V5}/')
    print(f'16 visualizacoes logadas no MLflow')


## 14. Export do Modelo

Export do modelo final via MLflow Model Registry + pickle local.
Promove automaticamente para Production no Registry do Fabric.

In [None]:
# =============================================================================
# 14. EXPORT THE MODEL TO THE MLFLOW REGISTRY
# =============================================================================
# Scores OOT/OOS once, assembles the metrics dictionary, exports the pipeline
# through the project's export_model helper and promotes the registered model.
sys.path.insert(0, '/lakehouse/default/Files/projeto-final/5-treinamento-modelos')
from export_model import export_model, promote_to_production

feature_names_export = [f for f in final_set_features if f not in ['NUM_CPF', 'SAFRA']]
X_oot_exp = X_oot_fs.drop(columns=['NUM_CPF', 'SAFRA'], errors='ignore')

print(f'Modelo: {best_model_name} (SHAP feature selection)')
print(f'Features: {len(feature_names_export)}')
print(f'KS OOT: {best_ks_oot:.5f}')
print('=' * 60)

# Score each base a single time and reuse the probabilities for both KS and AUC
# (previously predict_proba ran twice per base, i.e. four full inference passes).
proba_oot = best_model_pipeline.predict_proba(X_oot_fs)[:, 1]
proba_oos = best_model_pipeline.predict_proba(X_oos_fs)[:, 1]

ks_oot_final = ks_stat(y_oot_agg, proba_oot)
auc_oot_final = roc_auc_score(y_oot_agg, proba_oot)
ks_oos_final = ks_stat(y_oos_agg, proba_oos)
auc_oos_final = roc_auc_score(y_oos_agg, proba_oos)

metrics = {
    'ks_oot': ks_oot_final, 'auc_oot': auc_oot_final,
    'ks_oos': ks_oos_final, 'auc_oos': auc_oos_final,
    'gini_oot': (2 * auc_oot_final - 1) * 100,
    'gini_oos': (2 * auc_oos_final - 1) * 100,
    'psi_train_vs_oot': psi_train_oot,
    'n_features': len(feature_names_export),
    'version': 'v5',
}

# Extra metrics for safra 202501 alone, only when the OOS base contains it.
X_oos_501, y_oos_501 = filter_xy_by_safra(X_oos_fs, y_oos_agg, [202501])
if len(y_oos_501) > 0:
    proba_501 = best_model_pipeline.predict_proba(X_oos_501)[:, 1]
    metrics['ks_oos_202501'] = ks_stat(y_oos_501, proba_501)
    metrics['auc_oos_202501'] = roc_auc_score(y_oos_501, proba_501)
    metrics['gini_oos_202501'] = (2 * metrics['auc_oos_202501'] - 1) * 100

result = export_model(
    pipeline=best_model_pipeline, model_name='lgbm_baseline_v5',
    X_test=X_oot_exp, y_test=y_oot_agg,
    feature_names=feature_names_export,
    metrics_dict=metrics,
)

print(f"\nModelo exportado: {result['registered_name']}")
print(f"  MLflow Run ID: {result['mlflow_run_id']}")
print(f"  PKL: {result['pkl_path']}")
print(f"  KS OOT: {ks_oot_final:.5f}")
print(f"  KS OOS: {ks_oos_final:.5f}")
print(f"  Gini OOT: {metrics['gini_oot']:.1f}%")
print(f"  PSI: {psi_train_oot:.4f}")

# NOTE(review): promotion is unconditional — no KS/AUC/PSI gate is checked in
# this cell before the model goes to Production; confirm that is intended.
print(f"\n{'='*60}")
print('Promovendo modelo para Production...')
version = promote_to_production(result['registered_name'])
print(f"Modelo {result['registered_name']} v{version} em Production!")
print(f"{'='*60}")
print('\nExport + Promote concluido!')
print('v5 completo — swap corrigido, 16 visualizacoes, decis, PSI')
