In [None]:
%%configure

{
    "conf": {
        "spark.driver.maxResultSize": "8g",
        "spark.driver.memory": "54g",
        "spark.driver.cores": 8,
        "spark.executor.instances": 0,
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.sql.execution.arrow.pyspark.selfDestruct.enabled": "true"
    }
}

# Feature Selection com SHAP — Credit Risk FPD

**Objetivo**: Identificar as features mais relevantes para predizer FPD usando SHAP
(SHapley Additive exPlanations), gerando rankings por book e visualizacoes para
apresentacao.

### Metodologia

1. **Treinar LGBM baseline** com todas as features disponiveis
2. **SHAP TreeExplainer** — calcula contribuicao real de cada feature (captura interacoes)
3. **Rankings por book**: REC_ (Recarga), PAG_ (Pagamento), FAT_ (Faturamento), Base
4. **Selecao**: Features que acumulam 90% da importancia SHAP total
5. **Comparacao** com metodo atual (IV + L1 + correlacao)

### Por que SHAP?

- **IV** e univariado — nao captura interacoes entre features
- **LGBM feature_importances_** e enviesado por cardinalidade e numero de splits
- **SHAP** mede a contribuicao marginal real (teoria dos jogos), captura direcao e interacoes
- **TreeExplainer** e $O(TLD^2)$ (T arvores, L folhas, D profundidade) — rapido, nao re-prediz (~5 min para nosso dataset)

### Impacto

- Tempo extra de execucao: ~5-8 min (negligivel vs pipeline total)
- Entregaveis: 6 graficos de apresentacao + lista de features selecionadas

In [None]:
# =============================================================================
# 1. IMPORTS E CONFIGURACAO
# =============================================================================
import gc
import pickle
import os
import pandas as pd
import numpy as np
import shap
import mlflow
import mlflow.sklearn
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import warnings

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from category_encoders import CountEncoder
from scipy.stats import ks_2samp
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, IntegerType, DoubleType, LongType

# Silence every Python warning for the rest of the session (library
# deprecation notices included) so cell output stays short.
warnings.filterwarnings('ignore')

# Centralized configuration: make the project folder importable, then pull the
# shared pipeline constants from it.
# NOTE(review): SAFRAS and TARGET_COLUMNS are imported but not referenced in the
# cells visible in this notebook — confirm whether they are still needed.
import sys; sys.path.insert(0, '/lakehouse/default/Files/projeto-final')
from config.pipeline_config import (
    PATH_FEATURE_STORE, EXPERIMENT_NAME, SAFRAS,
    LEAKAGE_BLACKLIST, TARGET_COLUMNS
)

# Notebook-wide logger emitting timestamped INFO messages.
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%H:%M:%S')
logger = logging.getLogger('feature_selection_shap')

# Local directory where plots, CSVs and the pickle are written before they are
# logged to MLflow in section 18. exist_ok=True keeps files from earlier runs.
ARTIFACTS_DIR = '/tmp/shap_feature_selection'
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

print('Imports OK')

In [None]:
# =============================================================================
# 2. MLFLOW SETUP
# =============================================================================
# Select the experiment named in the shared config and turn autologging off:
# this notebook logs params, metrics and artifacts explicitly in section 18.
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.autolog(disable=True)

print(f'MLflow experiment: {EXPERIMENT_NAME}')

In [None]:
# =============================================================================
# 3. OPTIMIZED READ OF THE GOLD FEATURE STORE
# =============================================================================
# Reads the Delta feature store, drops audit/leakage columns, keeps only rows
# with FLAG_INSTALACAO == 1, downcasts 64-bit numerics and converts to pandas
# one SAFRA at a time, so each Arrow transfer only carries a single partition.
logger.info('Carregando feature store...')

# ---- 3.1 Arrow: the same two settings requested in %%configure, re-applied on
#          the live session ----
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
spark.conf.set('spark.sql.execution.arrow.pyspark.selfDestruct.enabled', 'true')

df_spark = spark.read.format('delta').load(PATH_FEATURE_STORE)
n_original = df_spark.count()
print(f'Original: {n_original:,} rows x {len(df_spark.columns)} cols')

# ---- 3.2 Drop audit + leakage columns (only those actually present) ----
cols_audit = ['_execution_id', '_data_inclusao', '_data_alteracao_silver', 'DT_PROCESSAMENTO']
cols_drop = [c for c in cols_audit + LEAKAGE_BLACKLIST if c in df_spark.columns]
if cols_drop:
    df_spark = df_spark.drop(*cols_drop)
    print(f'Drop {len(cols_drop)} colunas (audit+leakage): {cols_drop}')

# ---- 3.3 Keep only FLAG_INSTALACAO == 1 ----
if 'FLAG_INSTALACAO' in df_spark.columns:
    df_spark = df_spark.filter(F.col('FLAG_INSTALACAO') == 1).drop('FLAG_INSTALACAO')
    # Count what actually survived the filter. Deriving it as
    # n_original - count(flag == 0) overstates the surviving rows whenever the
    # flag is NULL or holds any value other than 0/1, because the == 1 filter
    # removes those rows as well.
    n_pos = df_spark.count()
    n_reprovados = n_original - n_pos
    print(f'FLAG_INSTALACAO: {n_original:,} -> {n_pos:,} ({n_reprovados:,} reprovados removidos)')

# ---- 3.4 Downcast via a single .select(): double -> float, long -> int ----
# NOTE(review): LongType -> IntegerType cannot represent values above
# 2**31 - 1. If any long column (e.g. an 11-digit identifier such as NUM_CPF,
# should it be stored as long) exceeds that range the cast will not preserve
# it — confirm against the feature store schema.
cast_exprs = []
for field in df_spark.schema.fields:
    if isinstance(field.dataType, DoubleType):
        cast_exprs.append(F.col(field.name).cast(FloatType()).alias(field.name))
    elif isinstance(field.dataType, LongType):
        cast_exprs.append(F.col(field.name).cast(IntegerType()).alias(field.name))
    else:
        cast_exprs.append(F.col(field.name))
df_spark = df_spark.select(*cast_exprs)

# ---- 3.5 Chunked conversion to pandas, one SAFRA per toPandas() call ----
safras_disponiveis = sorted([row.SAFRA for row in df_spark.select('SAFRA').distinct().collect()])
print(f'SAFRAs: {safras_disponiveis} | Colunas: {len(df_spark.columns)}')

chunks = []
for safra in safras_disponiveis:
    chunk = df_spark.filter(F.col('SAFRA') == safra).toPandas()
    print(f'  SAFRA {safra}: {len(chunk):,} rows')
    chunks.append(chunk)
    gc.collect()

df = pd.concat(chunks, ignore_index=True)
# Release the per-SAFRA frames now that they are merged into df.
del chunks
gc.collect()

print(f'\nDataset: {df.shape}')
print(f'Memory: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB')

In [None]:
# =============================================================================
# 4. BASIC CLEANING (same criteria as v3)
# =============================================================================
# Note: df is reassigned in place at every step, so this cell is meant to run
# once per load of section 3.
print(f'Shape pre-limpeza: {df.shape}')

# 4.1 Drop rows whose keys (NUM_CPF, SAFRA) are missing
df = df.dropna(subset=['NUM_CPF', 'SAFRA'])

# 4.2 Drop columns whose null share is 75% or more (comparison is >=, not >)
null_pct = df.isnull().mean()
cols_high_missing = null_pct[null_pct >= 0.75].index.tolist()
df = df.drop(columns=cols_high_missing)
print(f'High missing (>= 75%): {len(cols_high_missing)} colunas removidas')

# 4.3 Drop columns with at most one distinct value.
# nunique() ignores NaN, so a column holding one value plus nulls is dropped too.
low_card = [c for c in df.columns if df[c].nunique() <= 1]
df = df.drop(columns=low_card)
print(f'Low cardinality (== 1): {len(low_card)} colunas removidas')

# 4.4 Drop known leakage / misused columns when they are present
misused = ['PROD', 'flag_mig2', 'FAT_VLR_FPD', 'FAT_FLAG_MIG2_AQUISICAO']
existing_misused = [c for c in misused if c in df.columns]
if existing_misused:
    df = df.drop(columns=existing_misused)
    print(f'Leakage removidos: {existing_misused}')

print(f'Shape pos-limpeza: {df.shape}')

In [None]:
# =============================================================================
# 5. TEMPORAL SPLIT + X/Y PREPARATION
# =============================================================================
# Columns that are keys, targets or raw dates and must never be fed to the model.
NON_FEATURE_COLS = [
    'NUM_CPF', 'SAFRA', 'FPD', 'TARGET_SCORE_01', 'TARGET_SCORE_02',
    'DATADENASCIMENTO',
]

# Training cohorts vs. out-of-time (OOT) cohorts, in YYYYMM form.
safras_train = [202410, 202411, 202412, 202501]
safras_oot = [202502, 202503]

df_train_full = df[df['SAFRA'].isin(safras_train)].copy()
df_oot_full = df[df['SAFRA'].isin(safras_oot)].copy()

# Rows without a target cannot be used for fitting or evaluation.
df_train_full = df_train_full.dropna(subset=['FPD'])
df_oot_full = df_oot_full.dropna(subset=['FPD'])

# Fail here with a clear message instead of deep inside the model fit, e.g. when
# SAFRA's dtype does not match the integer lists above.
if df_train_full.empty or df_oot_full.empty:
    raise ValueError(
        f'Split temporal vazio: train={len(df_train_full)} rows, oot={len(df_oot_full)} rows. '
        f'Verifique o dtype/valores de SAFRA (train={safras_train}, oot={safras_oot}).'
    )

# 25% sample stratified by (SAFRA, FPD) to speed up SHAP.
# GroupBy.sample replaces groupby.apply(lambda g: g.sample(...)): applying a
# function that reads the grouping columns is deprecated in recent pandas, and
# the downstream code needs SAFRA/FPD to remain in the result. The sample is
# still reproducible (fixed seed) but is not row-for-row the one the apply-based
# form would have drawn.
df_sample = (
    df_train_full
    .groupby(['SAFRA', 'FPD'])
    .sample(frac=0.25, random_state=42)
    .reset_index(drop=True)
)

# Split into features (X) and target (y)
feature_cols = [c for c in df_sample.columns if c not in NON_FEATURE_COLS]
X_train = df_sample[feature_cols].copy()
y_train = df_sample['FPD'].astype(int).copy()

X_oot = df_oot_full[feature_cols].copy()
y_oot = df_oot_full['FPD'].astype(int).copy()

# Free memory: the full frames are no longer needed once X/y exist.
del df, df_train_full, df_oot_full
gc.collect()

print(f'X_train (25% sample): {X_train.shape}')
print(f'y_train FPD rate: {y_train.mean():.4f}')
print(f'X_oot: {X_oot.shape}')
print(f'y_oot FPD rate: {y_oot.mean():.4f}')
print(f'Features: {len(feature_cols)}')

In [None]:
# =============================================================================
# 6. CLASSIFICAR FEATURES POR BOOK (PREFIXO)
# =============================================================================
def classify_features_by_book(columns):
    """Group feature names by source book according to their name prefix.

    Args:
        columns: iterable of feature-name strings.

    Returns:
        dict mapping each book label to the list of names assigned to it, in
        input order. Names starting with ``REC_``, ``PAG_`` or ``FAT_`` go to
        the matching book; every other name goes to ``'Base (Cadastro+Telco)'``.
        All four keys are always present, even when their list is empty.
    """
    base_label = 'Base (Cadastro+Telco)'
    prefix_to_label = (
        ('REC_', 'Recarga (REC_)'),
        ('PAG_', 'Pagamento (PAG_)'),
        ('FAT_', 'Faturamento (FAT_)'),
    )

    # Base first, then the books in prefix order: same key order as the labels
    # used elsewhere in the notebook.
    groups = {base_label: []}
    groups.update({label: [] for _, label in prefix_to_label})

    for name in columns:
        label = next(
            (lbl for prefix, lbl in prefix_to_label if name.startswith(prefix)),
            base_label,
        )
        groups[label].append(name)
    return groups

# Book -> feature-name lists for the candidate features; printed as a summary below.
feature_groups = classify_features_by_book(feature_cols)

# Color map per book (hex colors shared by every plot and legend)
BOOK_COLORS = {
    'Base (Cadastro+Telco)': '#607D8B',
    'Recarga (REC_)': '#2196F3',
    'Pagamento (PAG_)': '#FF9800',
    'Faturamento (FAT_)': '#9C27B0',
}

def get_feature_color(feat_name):
    """Return the hex color of the book a feature belongs to.

    A feature matches a book when its name starts with the book prefix
    (``REC_``, ``PAG_``, ``FAT_``) or contains that prefix right after a
    ``__`` separator (the ``num__``/``cat__`` style of transformed names).
    Books are tested in the order REC, PAG, FAT; anything else gets the
    Base color.
    """
    ordered_books = (
        ('REC_', 'Recarga (REC_)'),
        ('PAG_', 'Pagamento (PAG_)'),
        ('FAT_', 'Faturamento (FAT_)'),
    )
    for token, book in ordered_books:
        if feat_name.startswith(token) or ('__' + token) in feat_name:
            return BOOK_COLORS[book]
    return BOOK_COLORS['Base (Cadastro+Telco)']

# Summary: number of candidate features per book.
print('Feature Groups:')
for group, cols in feature_groups.items():
    print(f'  {group}: {len(cols)} features')

In [None]:
# =============================================================================
# 7. TRAIN LGBM BASELINE (ALL FEATURES)
# =============================================================================
logger.info('Treinando LGBM baseline para SHAP...')

# Split the columns by dtype: numeric ones are median-imputed, object/category
# ones are mode-imputed and then count-encoded.
num_features = X_train.select_dtypes(include=['int32','int64','float32','float64']).columns.tolist()
cat_features = X_train.select_dtypes(include=['object','category']).columns.tolist()

# The ColumnTransformer below uses remainder='drop', so any column whose dtype is
# in neither list (e.g. int8/int16, bool, datetime) never reaches the model and
# can never receive a SHAP value. Surface that instead of dropping silently.
_routed = set(num_features) | set(cat_features)
ignored_features = [c for c in X_train.columns if c not in _routed]
if ignored_features:
    logger.warning(
        '%d colunas com dtype nao suportado serao ignoradas pelo modelo: %s',
        len(ignored_features),
        {c: str(X_train[c].dtype) for c in ignored_features},
    )

num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median'))])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)),
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features),
], remainder='drop')

# NOTE(review): in LightGBM, subsample (bagging_fraction) only takes effect when
# subsample_freq > 0, and the default is 0 — confirm whether row bagging was
# intended. Left unchanged so the baseline stays comparable with section 17.
lgbm_model = LGBMClassifier(
    objective='binary', boosting_type='gbdt',
    learning_rate=0.05, n_estimators=300, max_depth=7,
    colsample_bytree=0.8, subsample=0.8,
    random_state=42, n_jobs=-1, verbosity=-1,
)

pipeline_lgbm = Pipeline([('prep', preprocessor), ('model', lgbm_model)])
pipeline_lgbm.fit(X_train, y_train)

# Metricas baseline
def ks_stat(y_true, y_score):
    """Kolmogorov-Smirnov statistic between positive and negative score samples.

    Args:
        y_true: labels; entries equal to 1 are positives, equal to 0 negatives.
        y_score: scores, indexable by the boolean masks built from ``y_true``.

    Returns:
        The ``statistic`` of ``scipy.stats.ks_2samp`` for the two score samples.
    """
    scores_pos, scores_neg = y_score[y_true == 1], y_score[y_true == 0]
    ks_result = ks_2samp(scores_pos, scores_neg)
    return ks_result.statistic

# Positive-class (FPD=1) probabilities on the training sample and on OOT.
scores_train = pipeline_lgbm.predict_proba(X_train)[:, 1]
scores_oot = pipeline_lgbm.predict_proba(X_oot)[:, 1]

# AUC and KS for both sets; the train figures are in-sample, so the OOT ones are
# the comparable reference (section 17 reports deltas against these variables).
auc_train = roc_auc_score(y_train, scores_train)
ks_train = ks_stat(y_train, scores_train)
auc_oot = roc_auc_score(y_oot, scores_oot)
ks_oot = ks_stat(y_oot, scores_oot)

print(f'LGBM Baseline (todas features):')
print(f'  Train — AUC: {auc_train:.4f}, KS: {ks_train:.4f}')
print(f'  OOT   — AUC: {auc_oot:.4f}, KS: {ks_oot:.4f}')
print(f'  Features: {len(num_features)} num + {len(cat_features)} cat = {len(num_features)+len(cat_features)}')

logger.info('LGBM baseline treinado')

In [None]:
# =============================================================================
# 8. SHAP TREE EXPLAINER
# =============================================================================
logger.info('Calculando SHAP values (TreeExplainer)...')

# Bring X to the post-preprocessing representation the LGBM step was fit on.
X_train_transformed = pipeline_lgbm.named_steps['prep'].transform(X_train)

# Feature names after the transformation. If the preprocessor cannot provide
# them, fall back to the input order used by the ColumnTransformer (num, then
# cat) — and say so, since the fallback names carry no num__/cat__ prefix.
try:
    transformed_names = pipeline_lgbm.named_steps['prep'].get_feature_names_out()
except Exception as exc:
    logger.warning(
        'get_feature_names_out() falhou (%s: %s); usando num_features + cat_features',
        type(exc).__name__, exc,
    )
    transformed_names = num_features + cat_features

# SHAP TreeExplainer (fast path for tree-based models)
explainer = shap.TreeExplainer(pipeline_lgbm.named_steps['model'])
shap_values = explainer.shap_values(X_train_transformed)

# Normalize the output to a 2-D (rows, features) array for the positive class
# (FPD=1). Depending on the SHAP version a binary classifier may yield:
#   - a list [class_0, class_1],
#   - a 3-D array (rows, features, classes),
#   - or already a 2-D array.
if isinstance(shap_values, list):
    shap_vals = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_vals = shap_values[:, :, 1]
else:
    shap_vals = shap_values

# The ranking in section 9 pairs column i of shap_vals with name i; stop here
# with a readable message if the two ever disagree.
if shap_vals.shape[1] != len(transformed_names):
    raise ValueError(
        f'SHAP retornou {shap_vals.shape[1]} colunas, mas ha '
        f'{len(transformed_names)} nomes de features transformadas'
    )

print(f'SHAP values shape: {shap_vals.shape}')
print(f'Features transformadas: {len(transformed_names)}')

logger.info('SHAP values calculados')

In [None]:
# =============================================================================
# 9. GLOBAL RANKING — mean(|SHAP|)
# =============================================================================
# One importance value per transformed feature: the mean absolute SHAP value
# over all rows of the training sample.
mean_abs_shap = np.abs(shap_vals).mean(axis=0)

# Sorted descending and re-indexed 0..n-1, so row position equals rank - 1.
df_shap_ranking = pd.DataFrame({
    'feature_transformed': list(transformed_names),
    'mean_abs_shap': mean_abs_shap,
}).sort_values('mean_abs_shap', ascending=False).reset_index(drop=True)

# Map back to the original column name by removing everything up to the first
# '__' (the num__/cat__ prefix). Names without '__' are kept unchanged.
df_shap_ranking['feature'] = df_shap_ranking['feature_transformed'].apply(
    lambda x: x.split('__', 1)[-1] if '__' in x else x
)

# Classificar por book
def get_book_label(feat):
    """Return the book label for an original (un-prefixed) feature name.

    Names starting with ``REC_``, ``PAG_`` or ``FAT_`` map to the matching
    book; any other name maps to ``'Base (Cadastro+Telco)'``.
    """
    labels_by_prefix = (
        ('REC_', 'Recarga (REC_)'),
        ('PAG_', 'Pagamento (PAG_)'),
        ('FAT_', 'Faturamento (FAT_)'),
    )
    for prefix, label in labels_by_prefix:
        if feat.startswith(prefix):
            return label
    return 'Base (Cadastro+Telco)'

df_shap_ranking['book'] = df_shap_ranking['feature'].apply(get_book_label)

# Cumulative importance: each feature's share of the total mean(|SHAP|) and the
# running sum of those shares in rank order (values are fractions in [0, 1]).
total_shap = df_shap_ranking['mean_abs_shap'].sum()
df_shap_ranking['pct_importance'] = df_shap_ranking['mean_abs_shap'] / total_shap
df_shap_ranking['cumulative_pct'] = df_shap_ranking['pct_importance'].cumsum()
df_shap_ranking['rank'] = range(1, len(df_shap_ranking) + 1)

print('Top 30 Features (SHAP):')
print(df_shap_ranking[['rank', 'feature', 'book', 'mean_abs_shap', 'pct_importance', 'cumulative_pct']].head(30).to_string(index=False))

# Contribution per book: summed importance of the book's features, expressed as
# a percentage (0-100) of the total.
book_contribution = df_shap_ranking.groupby('book')['mean_abs_shap'].sum()
book_contribution_pct = (book_contribution / total_shap * 100).sort_values(ascending=False)

print(f'\nContribuicao por Book:')
for book, pct in book_contribution_pct.items():
    n_feats = len(df_shap_ranking[df_shap_ranking['book'] == book])
    print(f'  {book}: {pct:.1f}% ({n_feats} features)')

## Visualizacoes para Apresentacao

In [None]:
# =============================================================================
# 10. PLOT 1 — Global SHAP Summary Plot (Beeswarm)
# =============================================================================
# NOTE(review): shap.summary_plot draws on the current figure and, with its
# default plot_size, may resize it — confirm that figsize=(12, 14) is what ends
# up in the saved PNG.
fig, ax = plt.subplots(figsize=(12, 14))
shap.summary_plot(
    shap_vals, X_train_transformed,
    feature_names=list(transformed_names),
    max_display=40, show=False,
)
plt.title('SHAP Summary Plot — Top 40 Features (FPD)', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
# Save before plt.show() so the file is written from the still-open figure.
plt.savefig(f'{ARTIFACTS_DIR}/shap_summary_beeswarm.png', dpi=150, bbox_inches='tight')
plt.show()
print('Grafico 1 salvo: shap_summary_beeswarm.png')

In [None]:
# =============================================================================
# 11. PLOT 2 — SHAP Bar Plot, Top 30 (colored by Book)
# =============================================================================
top30 = df_shap_ranking.head(30)
colors = [get_feature_color(f) for f in top30['feature']]

fig, ax = plt.subplots(figsize=(12, 10))
# y positions run from len-1 down to 0 so the top-ranked feature is drawn at the top.
bars = ax.barh(
    range(len(top30) - 1, -1, -1),
    top30['mean_abs_shap'].values,
    color=colors, alpha=0.85, edgecolor='white', linewidth=0.5,
)
ax.set_yticks(range(len(top30) - 1, -1, -1))
ax.set_yticklabels(top30['feature'].values, fontsize=9)
ax.set_xlabel('mean(|SHAP value|)', fontsize=11)
ax.set_title('Top 30 Features por Importancia SHAP — Colorido por Book', fontsize=14, fontweight='bold')

# Annotate each bar with the feature's individual share of total importance
# (pct_importance, not the cumulative share), placed just right of the bar end.
for i, (_, row) in enumerate(top30.iterrows()):
    ax.text(row['mean_abs_shap'] + max(top30['mean_abs_shap']) * 0.01,
            len(top30) - 1 - i,
            f"{row['pct_importance']:.1%}",
            va='center', fontsize=8, color='#333')

# Legend: one patch per book
legend_elements = [Patch(facecolor=c, label=l) for l, c in BOOK_COLORS.items()]
ax.legend(handles=legend_elements, loc='lower right', fontsize=10)
ax.grid(True, alpha=0.2, axis='x')

plt.tight_layout()
plt.savefig(f'{ARTIFACTS_DIR}/shap_top30_by_book.png', dpi=150, bbox_inches='tight')
plt.show()
print('Grafico 2 salvo: shap_top30_by_book.png')

In [None]:
# =============================================================================
# 12. PLOT 3 — Top 15 Features per Book (4 subplots)
# =============================================================================
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Top 15 Features por Book — Importancia SHAP', fontsize=16, fontweight='bold')

# Fixed book order; later cells (13, 16, 18) iterate over this same list.
book_names = ['Base (Cadastro+Telco)', 'Recarga (REC_)', 'Pagamento (PAG_)', 'Faturamento (FAT_)']

for idx, book_name in enumerate(book_names):
    # Row-major placement on the 2x2 grid.
    ax = axes[idx // 2][idx % 2]
    # df_shap_ranking is already sorted by importance, so head(15) is the book's top 15.
    book_data = df_shap_ranking[df_shap_ranking['book'] == book_name].head(15)

    # A book with no features keeps an empty panel that only carries a title.
    if book_data.empty:
        ax.set_title(f'{book_name} — sem features')
        continue

    color = BOOK_COLORS[book_name]
    ax.barh(
        range(len(book_data) - 1, -1, -1),
        book_data['mean_abs_shap'].values,
        color=color, alpha=0.85, edgecolor='white', linewidth=0.5,
    )
    ax.set_yticks(range(len(book_data) - 1, -1, -1))
    ax.set_yticklabels(book_data['feature'].values, fontsize=8)
    ax.set_xlabel('mean(|SHAP value|)', fontsize=9)

    # Title figures cover the whole book, not only the 15 features drawn.
    total_book = df_shap_ranking[df_shap_ranking['book'] == book_name]['mean_abs_shap'].sum()
    pct_global = total_book / total_shap * 100
    n_total = len(df_shap_ranking[df_shap_ranking['book'] == book_name])
    ax.set_title(f'{book_name}\n{n_total} features | {pct_global:.1f}% da importancia global',
                fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.2, axis='x')

    # Annotate each bar with its mean(|SHAP|) value
    for i, (_, row) in enumerate(book_data.iterrows()):
        ax.text(row['mean_abs_shap'] + max(book_data['mean_abs_shap']) * 0.02,
                len(book_data) - 1 - i,
                f"{row['mean_abs_shap']:.4f}",
                va='center', fontsize=7, color='#555')

# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig(f'{ARTIFACTS_DIR}/shap_top15_per_book.png', dpi=150, bbox_inches='tight')
plt.show()
print('Grafico 3 salvo: shap_top15_per_book.png')

In [None]:
# =============================================================================
# 13. PLOT 4 — Aggregated Contribution per Book (pie + stacked bar)
# =============================================================================
# Relies on book_names defined in the section 12 cell.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Contribuicao Agregada por Book', fontsize=14, fontweight='bold')

# 13a. Pie chart: each book's share (0-100) of total mean(|SHAP|)
ax1 = axes[0]
book_pcts = []
book_labels = []
book_colors_list = []
for book_name in book_names:
    total_book = df_shap_ranking[df_shap_ranking['book'] == book_name]['mean_abs_shap'].sum()
    pct = total_book / total_shap * 100
    book_pcts.append(pct)
    n = len(df_shap_ranking[df_shap_ranking['book'] == book_name])
    book_labels.append(f'{book_name}\n({n} vars, {pct:.1f}%)')
    book_colors_list.append(BOOK_COLORS[book_name])

# Wedge labels are suppressed (labels=None); the legend carries the book names.
wedges, texts, autotexts = ax1.pie(
    book_pcts, labels=None, colors=book_colors_list,
    autopct='%1.1f%%', startangle=90, pctdistance=0.7,
    textprops={'fontsize': 11, 'fontweight': 'bold'},
)
ax1.legend(book_labels, loc='center left', bbox_to_anchor=(-0.3, 0.5), fontsize=9)
ax1.set_title('% Importancia SHAP por Book', fontsize=12)

# 13b. Single stacked bar: one segment per book, height = the book's summed
# mean(|SHAP|) (book totals, not individual features)
ax2 = axes[1]
bottom = np.zeros(1)  # running top of the stack
for book_name in book_names:
    total_book = df_shap_ranking[df_shap_ranking['book'] == book_name]['mean_abs_shap'].sum()
    ax2.bar(0, total_book, bottom=bottom, color=BOOK_COLORS[book_name],
           label=book_name, edgecolor='white', linewidth=0.5, width=0.5)
    # Annotate at the middle of the segment; segments of 5% or less are left
    # unlabeled.
    mid = bottom[0] + total_book / 2
    pct = total_book / total_shap * 100
    if pct > 5:
        ax2.text(0, mid, f'{pct:.1f}%', ha='center', va='center',
                fontsize=11, fontweight='bold', color='white')
    bottom += total_book

ax2.set_ylabel('Sum mean(|SHAP|)', fontsize=11)
ax2.set_title('Importancia SHAP Empilhada por Book', fontsize=12)
ax2.set_xticks([0])
ax2.set_xticklabels(['Todas Features'])
ax2.legend(loc='upper right', fontsize=9)
ax2.grid(True, alpha=0.2, axis='y')

plt.tight_layout()
plt.savefig(f'{ARTIFACTS_DIR}/shap_book_contribution.png', dpi=150, bbox_inches='tight')
plt.show()
print('Grafico 4 salvo: shap_book_contribution.png')

In [None]:
# =============================================================================
# 14. PLOT 5 — SHAP Dependence Plots (Top 3 features)
# =============================================================================
# Transformed names locate the column in shap_vals; original names are used for
# the panel title and its book color.
top3_features = df_shap_ranking.head(3)['feature_transformed'].tolist()
top3_names = df_shap_ranking.head(3)['feature'].tolist()

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('SHAP Dependence Plots — Top 3 Features', fontsize=14, fontweight='bold')

for i, (feat_trans, feat_name) in enumerate(zip(top3_features, top3_names)):
    ax = axes[i]
    # Column position of this feature in the transformed matrix.
    feat_idx = list(transformed_names).index(feat_trans)
    shap.dependence_plot(
        feat_idx, shap_vals, X_train_transformed,
        feature_names=list(transformed_names),
        ax=ax, show=False,
    )
    color = get_feature_color(feat_name)
    ax.set_title(f'{feat_name}', fontsize=11, fontweight='bold', color=color)

# Leave the top 6% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.94])
plt.savefig(f'{ARTIFACTS_DIR}/shap_dependence_top3.png', dpi=150, bbox_inches='tight')
plt.show()
print('Grafico 5 salvo: shap_dependence_top3.png')

In [None]:
# =============================================================================
# 15. PLOT 6 — Cumulative Importance Curve (Pareto)
# =============================================================================
from matplotlib.lines import Line2D

# Cut-off drawn on the chart; same value as the selection threshold of section 16.
pareto_threshold = 0.90

fig, ax = plt.subplots(figsize=(14, 6))

x_range = range(1, len(df_shap_ranking) + 1)
colors_cum = [get_feature_color(f) for f in df_shap_ranking['feature']]

# Bars: individual share per feature (left axis). Line: running total (right axis).
ax.bar(x_range, df_shap_ranking['pct_importance'].values, color=colors_cum, alpha=0.7, width=1.0)
ax2 = ax.twinx()
ax2.plot(x_range, df_shap_ranking['cumulative_pct'].values, color='red', linewidth=2, label='Cumulativo')
ax2.axhline(y=pareto_threshold, color='red', linestyle='--', alpha=0.5, label='90% threshold')

# Find the 90% cut-off: the number of top-ranked features needed to REACH the
# threshold. Counting only rows with cumulative_pct <= threshold stops one
# feature short (those rows sum to less than 90%), so the feature that crosses
# the line is included too — the same rule section 16 uses for the selection,
# which keeps the number on this chart equal to the number of selected features.
n_within = int((df_shap_ranking['cumulative_pct'] <= pareto_threshold).sum())
n_90 = min(n_within + 1, len(df_shap_ranking))

ax2.axvline(x=n_90, color='green', linestyle='--', alpha=0.7)
ax2.annotate(f'{n_90} features\n= 90% importancia',
            xy=(n_90, pareto_threshold), xytext=(n_90 + 20, 0.80),
            arrowprops=dict(arrowstyle='->', color='green'),
            fontsize=11, fontweight='bold', color='green')

ax.set_xlabel('Feature (rank por SHAP importance)', fontsize=11)
ax.set_ylabel('Importancia Individual (%)', fontsize=11)
ax2.set_ylabel('Importancia Cumulativa (%)', fontsize=11)
ax.set_title(f'Curva de Pareto — {n_90} features capturam 90% da importancia SHAP',
            fontsize=14, fontweight='bold')

# Legend: one patch per book plus the cumulative line
legend_elements = [Patch(facecolor=c, label=l) for l, c in BOOK_COLORS.items()]
legend_elements.append(Line2D([0], [0], color='red', linewidth=2, label='Cumulativo'))
ax.legend(handles=legend_elements, loc='center right', fontsize=9)

plt.tight_layout()
plt.savefig(f'{ARTIFACTS_DIR}/shap_pareto_cumulative.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Grafico 6 salvo: shap_pareto_cumulative.png')
print(f'\n>>> {n_90} features capturam 90% da importancia SHAP total')

## Selecao e Export

In [None]:
# =============================================================================
# 16. FINAL SELECTION — TOP FEATURES (90% CUMULATIVE)
# =============================================================================
CUMULATIVE_THRESHOLD = 0.90

# True for every feature whose running share is still at or below the threshold.
selected_mask = df_shap_ranking['cumulative_pct'] <= CUMULATIVE_THRESHOLD
# Also include the feature that crosses the threshold (to guarantee >= 90%).
# first_over is an index label used as a position; that is valid because
# df_shap_ranking was given a fresh 0..n-1 index after sorting in section 9.
if not selected_mask.all():
    first_over = selected_mask[~selected_mask].index[0]
    selected_mask.iloc[:first_over + 1] = True

df_selected = df_shap_ranking[selected_mask].copy()
# unique() keeps first-seen order, so the list stays in rank order.
selected_features_shap = df_selected['feature'].unique().tolist()

print(f'Features selecionadas (SHAP >= {CUMULATIVE_THRESHOLD:.0%}): {len(selected_features_shap)}')
print(f'Importancia cumulativa capturada: {df_selected["pct_importance"].sum():.1%}')

# Breakdown per book: selected vs. available (relies on book_names from section 12)
print(f'\nBreakdown por book:')
for book_name in book_names:
    book_sel = df_selected[df_selected['book'] == book_name]
    book_all = df_shap_ranking[df_shap_ranking['book'] == book_name]
    print(f'  {book_name}: {len(book_sel)} / {len(book_all)} selecionadas')

In [None]:
# =============================================================================
# 17. VALIDATION — TRAIN LGBM ON THE SELECTED FEATURES AND COMPARE
# =============================================================================
# Rebuilds the section 7 pipeline with identical hyperparameters on the reduced
# feature set, so metric deltas come from the selection alone.
logger.info('Validando com features selecionadas...')

# Selected features that exist in X_train (names not found there are skipped
# without any message)
selected_in_X = [f for f in selected_features_shap if f in X_train.columns]

X_train_sel = X_train[selected_in_X]
X_oot_sel = X_oot[selected_in_X]

# Same dtype routing as the baseline: numeric vs. object/category columns.
num_sel = [n for n in X_train_sel.select_dtypes(include=['int32','int64','float32','float64']).columns]
cat_sel = [c for c in X_train_sel.select_dtypes(include=['object','category']).columns]

num_pipe_sel = Pipeline([('imputer', SimpleImputer(strategy='median'))])
cat_pipe_sel = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', CountEncoder(normalize=True, handle_unknown=0, handle_missing=0)),
])
prep_sel = ColumnTransformer([
    ('num', num_pipe_sel, num_sel),
    ('cat', cat_pipe_sel, cat_sel),
], remainder='drop')

# NOTE(review): in LightGBM, subsample only takes effect when subsample_freq > 0
# (default 0) — confirm whether row bagging was intended, here and in section 7.
lgbm_sel = LGBMClassifier(
    objective='binary', boosting_type='gbdt',
    learning_rate=0.05, n_estimators=300, max_depth=7,
    colsample_bytree=0.8, subsample=0.8,
    random_state=42, n_jobs=-1, verbosity=-1,
)
pipe_sel = Pipeline([('prep', prep_sel), ('model', lgbm_sel)])
pipe_sel.fit(X_train_sel, y_train)

# Positive-class probabilities from the reduced model.
scores_train_sel = pipe_sel.predict_proba(X_train_sel)[:, 1]
scores_oot_sel = pipe_sel.predict_proba(X_oot_sel)[:, 1]

auc_train_sel = roc_auc_score(y_train, scores_train_sel)
ks_train_sel = ks_stat(y_train, scores_train_sel)
auc_oot_sel = roc_auc_score(y_oot, scores_oot_sel)
ks_oot_sel = ks_stat(y_oot, scores_oot_sel)

# Comparison table: baseline (all features) vs. SHAP-selected, with deltas.
# Gini is derived as 2*AUC - 1.
n_all = len(num_features) + len(cat_features)
n_sel = len(selected_in_X)

print(f'\n{"="*70}')
print(f' COMPARACAO: TODAS FEATURES vs SHAP SELECIONADAS')
print(f'{"="*70}')
print(f'{"":20s} {"Todas (" + str(n_all) + ")":>18s} {"SHAP (" + str(n_sel) + ")":>18s} {"Delta":>10s}')
print(f'{"-"*70}')
print(f'{"AUC Train":20s} {auc_train:18.4f} {auc_train_sel:18.4f} {auc_train_sel - auc_train:+10.4f}')
print(f'{"KS Train":20s} {ks_train:18.4f} {ks_train_sel:18.4f} {ks_train_sel - ks_train:+10.4f}')
print(f'{"AUC OOT":20s} {auc_oot:18.4f} {auc_oot_sel:18.4f} {auc_oot_sel - auc_oot:+10.4f}')
print(f'{"KS OOT":20s} {ks_oot:18.4f} {ks_oot_sel:18.4f} {ks_oot_sel - ks_oot:+10.4f}')
print(f'{"Gini OOT":20s} {(2*auc_oot-1):18.4f} {(2*auc_oot_sel-1):18.4f} {(2*auc_oot_sel-1)-(2*auc_oot-1):+10.4f}')
print(f'{"="*70}')
print(f'Reducao de features: {n_all} -> {n_sel} ({(1-n_sel/n_all)*100:.0f}% menos)')

logger.info('Validacao concluida')

In [None]:
# =============================================================================
# 18. EXPORT — PICKLE + CSV + MLFLOW
# =============================================================================

# 18.1 Save the selected feature list as a pickle (a plain list of strings)
pkl_path = f'{ARTIFACTS_DIR}/selected_features_shap.pkl'
with open(pkl_path, 'wb') as f:
    pickle.dump(selected_features_shap, f)
print(f'Features pickle salvo: {pkl_path}')

# 18.2 Save the full ranking as CSV
csv_path = f'{ARTIFACTS_DIR}/shap_feature_ranking.csv'
df_shap_ranking.to_csv(csv_path, index=False)
print(f'Ranking CSV salvo: {csv_path}')

# 18.3 Save the selected features as CSV
csv_sel_path = f'{ARTIFACTS_DIR}/selected_features_list.csv'
df_selected[['rank', 'feature', 'book', 'mean_abs_shap', 'pct_importance', 'cumulative_pct']].to_csv(csv_sel_path, index=False)
print(f'Selected CSV salvo: {csv_sel_path}')

# 18.4 Log everything to MLflow under a single run
with mlflow.start_run(run_name='SHAP_Feature_Selection') as run:
    mlflow.set_tag('task', 'feature_selection')
    mlflow.set_tag('method', 'SHAP_TreeExplainer')
    mlflow.log_param('n_features_total', n_all)
    mlflow.log_param('n_features_selected', n_sel)
    mlflow.log_param('cumulative_threshold', CUMULATIVE_THRESHOLD)
    mlflow.log_param('reduction_pct', f'{(1-n_sel/n_all)*100:.0f}%')

    # Baseline vs. selected metrics (OOT only) and their deltas
    mlflow.log_metric('baseline_auc_oot', auc_oot)
    mlflow.log_metric('baseline_ks_oot', ks_oot)
    mlflow.log_metric('selected_auc_oot', auc_oot_sel)
    mlflow.log_metric('selected_ks_oot', ks_oot_sel)
    mlflow.log_metric('delta_auc_oot', auc_oot_sel - auc_oot)
    mlflow.log_metric('delta_ks_oot', ks_oot_sel - ks_oot)

    # Contribution per book, logged as a fraction (0-1). Spaces, parentheses
    # and '+' are stripped from the label to build the metric name.
    for book_name in book_names:
        total_book = df_shap_ranking[df_shap_ranking['book'] == book_name]['mean_abs_shap'].sum()
        pct = total_book / total_shap
        safe_name = book_name.replace(' ', '_').replace('(', '').replace(')', '').replace('+', '')
        mlflow.log_metric(f'book_pct_{safe_name}', round(pct, 4))

    # Artifacts: every entry found in ARTIFACTS_DIR is logged — PNGs under
    # 'shap_plots', everything else under 'feature_selection'. The directory is
    # never emptied by this notebook, so files left by earlier runs are logged too.
    for fname in os.listdir(ARTIFACTS_DIR):
        fpath = f'{ARTIFACTS_DIR}/{fname}'
        if fname.endswith('.png'):
            mlflow.log_artifact(fpath, 'shap_plots')
        else:
            mlflow.log_artifact(fpath, 'feature_selection')

    # The pipeline retrained on the selected features (section 17).
    mlflow.sklearn.log_model(pipe_sel, 'model_lgbm_shap_selected')

    run_id = run.info.run_id

print(f'\nMLflow Run ID: {run_id}')
print(f'Artefatos logados: {len(os.listdir(ARTIFACTS_DIR))} arquivos')

## Conclusoes

### Resultados

- SHAP TreeExplainer identificou as features com maior contribuicao real para predizer FPD
- A selecao por cumulativo 90% reduz significativamente o numero de features
  sem perda relevante de performance (Delta KS e AUC devem ser proximos de zero)

### Proximos Passos

1. Importar `selected_features_shap.pkl` no `modelo_baseline_v3.ipynb`
2. Re-treinar modelo final com features SHAP-selecionadas
3. Comparar KS/AUC vs selecao IV+L1+corr atual
4. Usar graficos por book na apresentacao (slide de Feature Importance)

### Graficos Gerados

| Arquivo | Descricao |
|---------|----------|
| `shap_summary_beeswarm.png` | Beeswarm plot global — direcao e magnitude |
| `shap_top30_by_book.png` | Top 30 features coloridas por book |
| `shap_top15_per_book.png` | Top 15 de cada book (4 subplots) |
| `shap_book_contribution.png` | Pie + stacked bar de contribuicao por book |
| `shap_dependence_top3.png` | Dependence plots das 3 features mais importantes |
| `shap_pareto_cumulative.png` | Curva de Pareto com ponto de corte 90% |