
# Modelo Baseline de Risco – Telecom

Este notebook implementa um **modelo baseline de risco de inadimplência**, considerando o contexto de uma grande empresa de telecomunicacao.

Abaixo, estao descritas as principais atividades implementadas para o desenvolvimento do modelo de risco de inadimplência.

### Principais etapas aplicadas na modelagem:
- Amostragem de **25% por safra e FPD**, de forma estratificada 
- Separação em **Treino (2024-10 a 2024-12) e Validacao (2025-01) (Samples) X OOS (2024-10 a 2025-01) X OOT (2025-02 e 2025-03)**
- Modelos testados inicialmente: Logistic Regression e LightGBM
- Ajuste de hiperparâmetros apenas no conjunto de validacao
- Treinos finais dos modelos considerando bases de treino e validacao (Safras 2024-10 a 2025-01)
- Metricas utilizadas na analise: AUC, KS, Precisao, Recall, F1-Score
- Avaliação por safra (AUC e KS) -> Safras OOT1 (2025-02) e OOT2 (2025-03) e OOT Consolidada
- Avaliação por safra tambem nas bases de treino e OOS, para fins de validacao do processo
- Analise da evolucao do KS com a inclusao incremental das variaveis de cada book/fonte
- Uso do benchmark de 33.1% de KS como comparativo para performance dos modelos nas safras OOT
- Análise **Swap-in / Swap-out** e quantificacao de ganhos esperados com o modelo


In [None]:
# Instalacao de pacote para uso de Target/CountEncoder
%pip install scikit-learn==1.3.2
%pip install category-encoders==2.6.3

In [None]:
# Importacao de bibliotecas
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
from datetime import date
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import (
    GridSearchCV, 
    StratifiedKFold
)
from pyspark.sql import Window, Row
from pyspark.sql.types import DateType, TimestampType
from pyspark.sql import functions as F
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, FunctionTransformer
from category_encoders import CountEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix

# Config centralizado do pipeline
import sys; sys.path.insert(0, "/lakehouse/default/Files/projeto-final")
from config.pipeline_config import EXPERIMENT_NAME, SAFRAS

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a global "ignore" filter, so warnings raised by any
# library through the `warnings` module are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
# MLflow experiment tracking setup.
# EXPERIMENT_NAME is imported from config.pipeline_config.
mlflow.set_experiment(EXPERIMENT_NAME)

# Turn on MLflow autologging: models and their signatures are logged,
# input examples are not, and autolog's own messages are silenced.
mlflow.autolog(
    log_models=True,
    log_input_examples=False,
    log_model_signatures=True,
    silent=True
)

# Echo the active configuration so the notebook output records it.
print(f"MLflow experiment configurado: {EXPERIMENT_NAME}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

## Processo completo - (Spark)

In [None]:
# Drop rows whose key columns (SAFRA and NUM_CPF) are empty
def clean_empty_keys_data(df):
    """Keep only the rows where both SAFRA and NUM_CPF are non-null.

    Args:
        df: Spark DataFrame containing the SAFRA and NUM_CPF columns.

    Returns:
        The filtered Spark DataFrame.
    """
    has_safra = F.col("SAFRA").isNotNull()
    has_cpf = F.col("NUM_CPF").isNotNull()
    return df.filter(has_safra & has_cpf)

In [None]:
# Drop columns with a high share of missing values
def remove_high_missing_values(df, percent_missing=0.75):
    """Drop every column whose null ratio is at least ``percent_missing``.

    The ratio of each column is the average of its "is null" flag, computed for
    all columns in a single aggregation (no separate ``count()`` job).

    Args:
        df: Spark DataFrame to clean.
        percent_missing: Minimum null ratio (0-1) for a column to be dropped.

    Returns:
        Tuple ``(df, null_ratio)`` with the cleaned DataFrame and a dict
        mapping every original column to its null ratio. On an empty
        DataFrame the ratios are ``None`` and no column is dropped.
    """
    if not df.columns:
        return df, {}

    null_ratio = (
        df
        .select([
            F.avg(F.col(c).isNull().cast("int")).alias(c)
            for c in df.columns
        ])
        .collect()[0]
        .asDict()
    )

    # avg() over zero rows yields None; such columns are kept instead of
    # failing on a `None >= float` comparison.
    cols_drop_nulls = [
        c for c, v in null_ratio.items()
        if v is not None and v >= percent_missing
    ]
    df = df.drop(*cols_drop_nulls)
    return df, null_ratio

In [None]:
# Drop columns that hold a single distinct value
def remove_low_cardinality_values(df):
    """Drop the columns whose distinct-row count is exactly one.

    Each column is checked with its own ``select(...).distinct().count()``.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        Tuple ``(df, low_card_cols)`` with the cleaned DataFrame and the list
        of dropped column names.
    """
    low_card_cols = []
    for name in df.columns:
        n_distinct = df.select(name).distinct().count()
        if n_distinct == 1:
            low_card_cols.append(name)

    return df.drop(*low_card_cols), low_card_cols

In [None]:
# Stratified sampling (by SAFRA and FPD) of a Spark DataFrame; "percent" is the sampled share
def split_stratified_data(df, percent=0.25):
    """Split ``df`` into a stratified sample and its out-of-sample complement.

    Rows are ranked in seeded random order inside each (SAFRA, FPD) group; the
    first ``int(group_count * percent)`` rows of each group form the sample and
    the rows ranked above that cutoff form the out-of-sample set.

    Args:
        df: Spark DataFrame with SAFRA and FPD columns.
        percent: Share of each (SAFRA, FPD) group that goes to the sample.

    Returns:
        Tuple ``(df_sample, df_oos)`` without the auxiliary columns.
    """
    shuffled_within_group = (
        Window.partitionBy("SAFRA", "FPD").orderBy(F.rand(seed=42))
    )
    ranked = df.withColumn("rn", F.row_number().over(shuffled_within_group))

    # Per-group size and the number of rows each group contributes to the sample
    group_cutoffs = (
        ranked
        .groupBy("SAFRA", "FPD")
        .count()
        .withColumn("cutoff", (F.col("count") * percent).cast("int"))
    )

    with_cutoff = ranked.join(group_cutoffs, ["SAFRA", "FPD"], "left")

    # Auxiliary columns created above; removed from both outputs
    helper_cols = ("rn", "count", "cutoff")

    df_sample = (
        with_cutoff
        .filter(F.col("rn") <= F.col("cutoff"))
        .drop(*helper_cols)
    )
    df_oos = (
        with_cutoff
        .filter(F.col("rn") > F.col("cutoff"))
        .drop(*helper_cols)
    )

    return df_sample, df_oos

In [None]:
# Return the distinct safras in ascending order
def sort_periods(df):
    """Collect the distinct SAFRA values of ``df`` as a sorted Python list.

    Args:
        df: Spark DataFrame with a SAFRA column.

    Returns:
        List of distinct SAFRA values ordered ascending.
    """
    ordered_safras = df.select("SAFRA").distinct().orderBy("SAFRA")
    return [row[0] for row in ordered_safras.collect()]

In [None]:
# Drop highly correlated numeric columns (absolute correlation >= thresh)
def _select_correlated_columns(corr_matrix, thresh):
    """Greedily choose columns to drop until no remaining pair reaches ``thresh``.

    Args:
        corr_matrix: Square pandas DataFrame of absolute correlations.
        thresh: Correlation level from which a column is dropped.

    Returns:
        List of column names to drop, in the order they were selected.
    """
    # Keep only the strict upper triangle so each pair is looked at once
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )

    to_drop = []
    while True:
        col_max = upper.max()
        max_corr = col_max.max()
        # `not (a >= b)` is also True when max_corr is NaN (empty matrix or only
        # undefined correlations left); `max_corr < thresh` would keep looping
        # in that case, drop every column and finally fail on `.index[0]`.
        if not (max_corr >= thresh):
            break

        col_to_drop = col_max.sort_values(ascending=False).index[0]
        to_drop.append(col_to_drop)
        upper = upper.drop(index=col_to_drop, columns=col_to_drop)

    return to_drop


def remove_high_correlation_data(df, thresh=0.8):
    """Drop numeric columns that are highly correlated with another column.

    Correlations are computed on a stratified sample of the first four safras
    only, so the later safras do not influence the selection.

    Args:
        df: Spark DataFrame with SAFRA and FPD columns.
        thresh: Absolute correlation from which one column of a pair is dropped.

    Returns:
        Tuple ``(df, to_drop)`` with the cleaned DataFrame and the dropped names.
    """
    # First four safras only
    safras_ord = sort_periods(df)[:4]
    df_corr_base = df.filter(F.col("SAFRA").isin(safras_ord))

    # Target, key and segmentation columns are read by name in later cells,
    # so they must never be candidates for removal.
    protected_cols = {"FPD", "SAFRA", "NUM_CPF", "FLAG_INSTALACAO"}

    num_cols_corr = [
        c for c, t in df_corr_base.dtypes
        if t in ("int", "bigint", "double", "float")
        and c not in protected_cols
    ]

    df_corr_sample, _ = split_stratified_data(df_corr_base)
    pdf_corr = df_corr_sample.select(num_cols_corr).toPandas()
    corr_matrix = pdf_corr.corr().abs()

    to_drop = _select_correlated_columns(corr_matrix, thresh)

    df = df.drop(*to_drop)
    return df, to_drop

In [None]:
# Create numeric day-difference columns from the date columns, then drop the date columns
def adjust_and_drop_date_cols(df):
    """Replace date/timestamp columns by day differences to the safra reference date.

    Steps: parse ``var_12`` ("dd/MM/yyyy") into a timestamp, list every
    date/timestamp column, build ``DATA_REF_SAFRA`` from SAFRA ("yyyyMM"),
    add ``DIAS_VAR_12`` and ``PAG_DIAS_DESDE_PRIMEIRA_FATURA`` via ``datediff``
    and finally drop all listed date columns plus ``DATA_REF_SAFRA``.

    Args:
        df: Spark DataFrame with SAFRA, var_12 and PAG_DT_PRIMEIRA_FATURA.

    Returns:
        Tuple ``(df, date_cols)`` with the transformed DataFrame and the names
        of the dropped columns (including ``DATA_REF_SAFRA``).
    """
    parsed = df.withColumn("var_12", F.to_timestamp("var_12", "dd/MM/yyyy"))

    # Date-like columns are detected after var_12 has been converted
    date_cols = [
        field.name
        for field in parsed.schema.fields
        if isinstance(field.dataType, (DateType, TimestampType))
    ]

    ref_date = F.col("DATA_REF_SAFRA")
    enriched = (
        parsed
        .withColumn(
            "DATA_REF_SAFRA",
            F.to_date(F.col("SAFRA").cast("string"), "yyyyMM")
        )
        .withColumn("DIAS_VAR_12", F.datediff(ref_date, F.col("var_12")))
        .withColumn(
            "PAG_DIAS_DESDE_PRIMEIRA_FATURA",
            F.datediff(ref_date, F.col("PAG_DT_PRIMEIRA_FATURA"))
        )
    )

    date_cols.append("DATA_REF_SAFRA")
    return enriched.drop(*date_cols), date_cols

In [None]:
# Drop columns that must not be used for modelling
# FAT_VLR_FPD was removed from book_faturamento (leakage fix, Story 1.1)
# FAT_FLAG_MIG2_AQUISICAO was removed from book_faturamento (duplicate)
# The drop is kept defensively in case the notebook runs on an old feature store
def remove_other_misused_columns(df):
    """Drop the known non-modelling columns that are present in ``df``.

    Works for Spark DataFrames (as used by ``apply_cleanings_to_df``) and for
    pandas DataFrames. Spark's ``DataFrame.drop`` takes column names as
    positional arguments and has no ``columns=`` keyword, so the two APIs are
    called differently.

    Args:
        df: Spark or pandas DataFrame.

    Returns:
        The DataFrame without the misused columns; ``df`` itself when none of
        them is present.
    """
    misused_columns = ["PROD", "flag_mig2", "FAT_VLR_FPD", "FAT_FLAG_MIG2_AQUISICAO"]
    existing = [c for c in misused_columns if c in df.columns]
    if not existing:
        return df
    if isinstance(df, pd.DataFrame):
        return df.drop(columns=existing)
    return df.drop(*existing)

In [None]:
# Lookup table: 2-digit CEP prefix -> (UF, REGIAO).
# Every prefix appears exactly once: the table is left-joined on the prefix, so
# a repeated prefix would multiply the matching customer rows.
# Conflicting entries were resolved as 65 -> MA/NORDESTE, 78 -> MT/CENTRO-OESTE
# and 79 -> MS/CENTRO-OESTE.
# NOTE(review): the other prefixes were kept as they were; some of them
# (e.g. 61, 62, 63, 67, 68) should be checked against the official Correios ranges.
cep_uf_regiao = [
    # NORTE
    ("63", "TO", "NORTE"), ("66", "PA", "NORTE"), ("67", "RO", "NORTE"),
    ("68", "AC", "NORTE"), ("69", "AM", "NORTE"), ("77", "TO", "NORTE"),

    # NORDESTE
    ("40", "BA", "NORDESTE"), ("41", "BA", "NORDESTE"),
    ("42", "BA", "NORDESTE"), ("43", "BA", "NORDESTE"),
    ("44", "BA", "NORDESTE"), ("45", "BA", "NORDESTE"),
    ("46", "BA", "NORDESTE"), ("47", "BA", "NORDESTE"),
    ("48", "BA", "NORDESTE"),

    ("50", "PE", "NORDESTE"), ("51", "PE", "NORDESTE"),
    ("52", "PE", "NORDESTE"), ("53", "PE", "NORDESTE"),
    ("54", "PE", "NORDESTE"), ("55", "PE", "NORDESTE"),

    ("56", "AL", "NORDESTE"), ("57", "AL", "NORDESTE"),
    ("58", "PB", "NORDESTE"), ("59", "PB", "NORDESTE"),

    ("60", "CE", "NORDESTE"), ("62", "PI", "NORDESTE"),
    ("64", "PI", "NORDESTE"), ("65", "MA", "NORDESTE"),

    # SUDESTE
    ("01", "SP", "SUDESTE"), ("02", "SP", "SUDESTE"),
    ("03", "SP", "SUDESTE"), ("04", "SP", "SUDESTE"),
    ("05", "SP", "SUDESTE"), ("06", "SP", "SUDESTE"),
    ("07", "SP", "SUDESTE"), ("08", "SP", "SUDESTE"),
    ("09", "SP", "SUDESTE"),

    ("20", "RJ", "SUDESTE"), ("21", "RJ", "SUDESTE"),
    ("22", "RJ", "SUDESTE"), ("23", "RJ", "SUDESTE"),
    ("24", "RJ", "SUDESTE"),

    ("30", "MG", "SUDESTE"), ("31", "MG", "SUDESTE"),
    ("32", "MG", "SUDESTE"), ("33", "MG", "SUDESTE"),
    ("34", "MG", "SUDESTE"), ("35", "MG", "SUDESTE"),
    ("36", "MG", "SUDESTE"), ("37", "MG", "SUDESTE"),
    ("38", "MG", "SUDESTE"), ("39", "MG", "SUDESTE"),

    # SUL
    ("80", "PR", "SUL"), ("81", "PR", "SUL"),
    ("82", "PR", "SUL"), ("83", "PR", "SUL"),
    ("84", "PR", "SUL"), ("85", "PR", "SUL"),
    ("86", "PR", "SUL"), ("87", "PR", "SUL"),
    ("88", "SC", "SUL"), ("89", "SC", "SUL"),
    ("90", "RS", "SUL"), ("91", "RS", "SUL"),
    ("92", "RS", "SUL"), ("93", "RS", "SUL"),
    ("94", "RS", "SUL"), ("95", "RS", "SUL"),
    ("96", "RS", "SUL"), ("97", "RS", "SUL"),
    ("98", "RS", "SUL"), ("99", "RS", "SUL"),

    # CENTRO-OESTE
    ("61", "DF", "CENTRO-OESTE"),
    ("70", "DF", "CENTRO-OESTE"), ("71", "DF", "CENTRO-OESTE"),
    ("72", "DF", "CENTRO-OESTE"), ("73", "DF", "CENTRO-OESTE"),
    ("74", "DF", "CENTRO-OESTE"), ("75", "GO", "CENTRO-OESTE"),
    ("76", "GO", "CENTRO-OESTE"),
    ("78", "MT", "CENTRO-OESTE"),
    ("79", "MS", "CENTRO-OESTE"),
]

In [None]:
# Convert "CEP_3_digitos" into UF and REGIAO
def convert_cep3_uf_regiao(df):
    """Add UF and REGIAO columns derived from the first two CEP digits.

    The first two characters of ``CEP_3_digitos`` are looked up in
    ``cep_uf_regiao``; prefixes without a match get "OUTROS" in both columns.
    The helper prefix column and ``CEP_3_digitos`` are dropped afterwards.

    Args:
        df: Spark DataFrame with a ``CEP_3_digitos`` column.

    Returns:
        Spark DataFrame with UF and REGIAO and without the CEP columns.
    """
    # Collapse the lookup to one row per prefix (the last entry wins). A prefix
    # listed more than once would otherwise match several lookup rows and the
    # left join below would duplicate the corresponding customer rows.
    unique_cep_map = {
        cep: (uf, regiao) for cep, uf, regiao in cep_uf_regiao
    }

    df_cep_map = spark.createDataFrame(
        [(cep, uf, regiao) for cep, (uf, regiao) in unique_cep_map.items()],
        ["CEP_2", "UF", "REGIAO"]
    )

    df = df.withColumn(
        "CEP_2",
        F.col("CEP_3_digitos").substr(1, 2)
    )

    df = df.join(df_cep_map, on="CEP_2", how="left")

    # Unmapped prefixes are grouped under a single "OUTROS" category
    df = (
        df
        .withColumn("UF", F.coalesce(F.col("UF"), F.lit("OUTROS")))
        .withColumn("REGIAO", F.coalesce(F.col("REGIAO"), F.lit("OUTROS")))
    )

    df = df.drop("CEP_3_digitos", "CEP_2")
    return df


In [None]:
# Chain every cleaning/transformation step applied to the modelling/evaluation bases
def apply_cleanings_to_df(df):
    """Run all cleaning steps, in order, on a Spark DataFrame.

    Order: empty keys, CEP to UF/REGIAO, date columns, high-missing columns,
    single-value columns, highly correlated columns, misused columns. The
    lists of removed columns returned by the individual steps are discarded.

    Args:
        df: Raw Spark DataFrame.

    Returns:
        The cleaned Spark DataFrame.
    """
    cleaned = clean_empty_keys_data(df)
    cleaned = convert_cep3_uf_regiao(cleaned)
    cleaned, _date_cols = adjust_and_drop_date_cols(cleaned)
    cleaned, _high_missing = remove_high_missing_values(cleaned)
    cleaned, _low_card = remove_low_cardinality_values(cleaned)
    cleaned, _high_corr = remove_high_correlation_data(cleaned)
    return remove_other_misused_columns(cleaned)

In [None]:
# Load the consolidated feature store
df_spark = spark.sql("SELECT * FROM Gold.feature_store.clientes_consolidado")

# Apply the full cleaning chain to the complete DataFrame.
# NOTE(review): the cleaning runs before the OOT split, so the missing-ratio and
# cardinality checks see every safra, OOT included — confirm this is intended.
df_spark_clean = apply_cleanings_to_df(df_spark)

# Segment customers by FLAG_INSTALACAO after the transformations:
# 1 = customers who contracted the postpaid plan, 0 = rejected customers
df_spark_clean_clientes_pos = df_spark_clean.filter(F.col("FLAG_INSTALACAO") == 1)
df_spark_clean_clientes_reprovados = df_spark_clean.filter(F.col("FLAG_INSTALACAO") == 0)

# FLAG_INSTALACAO only segments the customers; the models do not use it, so it is dropped
df_spark_clean_clientes_pos = df_spark_clean_clientes_pos.drop("FLAG_INSTALACAO")
df_spark_clean_clientes_reprovados = df_spark_clean_clientes_reprovados.drop("FLAG_INSTALACAO")

# Split the data into Sample (train + validation later on), test (OOS) and out-of-time (OOT)
list_safras = sort_periods(df_spark_clean_clientes_pos)

# First four safras feed train/OOS; the remaining ones are OOT
safras_train_oos = list_safras[:4]
safras_oot = list_safras[4:]

# All cleaned rows of the first four safras
df_4_safras = df_spark_clean_clientes_pos.filter(F.col("SAFRA").isin(safras_train_oos))

# Stratified split into the sample (train + val) and the out-of-sample test base
df_sample_spark, df_oos_spark = split_stratified_data(df_4_safras, percent=0.25)

# Out-of-time base
df_oot_spark = df_spark_clean_clientes_pos.filter(F.col("SAFRA").isin(safras_oot))

# Convert the Spark DataFrames to pandas
df_sample = df_sample_spark.toPandas()
df_oos = df_oos_spark.toPandas()
df_oot = df_oot_spark.toPandas()

print("Shapes finais:")
print("Sample:", df_sample.shape)
print("OOS:", df_oos.shape)
print("OOT:", df_oot.shape)

In [None]:
# Check row counts and column types of each base.
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None".
print("Info df Sample: ")
df_sample.info()
print("\n")
print("Info df OOS: ")
df_oos.info()
print("\n")
print("Info df OOT: ")
df_oot.info()

In [None]:
# Remove fully duplicated rows from each base.
# NOTE(review): the index is not reset afterwards, so it keeps the gaps left by removed rows.
df_sample = df_sample.drop_duplicates()
df_oos = df_oos.drop_duplicates()
df_oot = df_oot.drop_duplicates()

In [None]:
# Row counts per safra and target (FPD) - Sample (train + val)
df_sample[["SAFRA","FPD"]].value_counts().to_frame().sort_values(by='SAFRA')

In [None]:
# Row counts per safra and target (FPD) - test base (OOS)
df_oos[["SAFRA","FPD"]].value_counts().to_frame().sort_values(by='SAFRA')

In [None]:
# Row counts per safra and target (FPD) - OOT
df_oot[["SAFRA","FPD"]].value_counts().to_frame().sort_values(by='SAFRA')

## 3. Separação da base amostrada em Treino x Validacao X OOT

#### Verificar necessidade de blocos de codigo abaixo

In [None]:
# Safras used for training and for validation (hyper-parameter selection).
# NOTE(review): these are hard-coded although SAFRAS is imported from
# config.pipeline_config — confirm which one should be the source of truth.
safras_train = [202410, 202411, 202412]
safras_val = [202501]

# Split the sampled base by safra
df_train = df_sample[df_sample["SAFRA"].isin(safras_train)]
df_val = df_sample[df_sample["SAFRA"].isin(safras_val)]

# Separate the target (FPD) from the features for each base
X_train = df_train.drop(columns=["FPD"])
y_train = df_train["FPD"]

X_val = df_val.drop(columns=["FPD"])
y_val = df_val["FPD"]

# Train + val concatenated, used for the final model fits
X_train_final = pd.concat([X_train, X_val], axis=0)
y_train_final = pd.concat([y_train, y_val], axis=0)

# OOS base
X_oos_agg = df_oos.drop(columns=["FPD"])
y_oos_agg = df_oos["FPD"]

# OOT base
X_oot_agg = df_oot.drop(columns=["FPD"])
y_oot_agg = df_oot["FPD"]


## 4. Separação de Variáveis por tipos

In [None]:
# Numeric features: int/float columns except SAFRA (kept only to filter rows by period)
num_features = [n for n in X_train.select_dtypes(include=["int32", "int64", "float32", "float64"]).columns if n != "SAFRA"]
# Categorical features: object/category columns except the NUM_CPF key
cat_features = [c for c in X_train.select_dtypes(include=["object", "category"]).columns if c != "NUM_CPF"]

## 5. Montagem dos Pipelines sklearn

#### Pipeline Regressao Logistica

In [None]:
# Numeric branch: median imputation followed by standardisation
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: most-frequent imputation followed by normalised count encoding
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", CountEncoder(normalize=True))
])

# Route each feature list to its branch; columns not listed are not passed on
preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_features),
    ("cat", categorical_pipe, cat_features)
])

# L1-penalised logistic regression with balanced class weights.
# The preprocessing step is named "prep" in this pipeline.
pipeline_RL = Pipeline([
    ("prep", preprocess),
    ("model", LogisticRegression(
        solver="liblinear",
        penalty="l1",
        max_iter=2000,
        tol=1e-3,
        class_weight="balanced",
        random_state=42
    ))
])


#### Pipeline LightGBM

In [None]:
# Numeric branch: median imputation only (no scaling for the tree model).
# Note: this rebinds the names numeric_pipe/categorical_pipe used by the previous cell.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical branch: most-frequent imputation and normalised count encoding,
# with unknown and missing categories encoded as 0
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", CountEncoder(
        normalize=True,
        handle_unknown=0,
        handle_missing=0
    ))
])

# Columns outside the two feature lists are dropped explicitly
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_features),
        ("cat", categorical_pipe, cat_features),
    ],
    remainder="drop"
)

# Base LightGBM classifier; n_estimators and max_depth are tuned later by grid search
lgbm = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=0.05,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

# The preprocessing step is named "preprocess" here (the logistic pipeline uses "prep")
pipeline_LGBM = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", lgbm)
])

## 6. Ajuste de Hiperparâmetros

#### Grid Search e Treino com HPs escolhidos (Treino + Val) - Regressao Logistica

In [None]:
# Candidate values for the regularisation strength C
param_grid_RL = {
    "model__C": [0.01, 0.05, 0.1, 0.5],
}

# 4-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=42
)

# Grid search scored by ROC AUC
grid_RL = GridSearchCV(
    pipeline_RL,
    param_grid=param_grid_RL,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=3
)

# Fit on the validation set only (hyper-parameter selection)
grid_RL.fit(X_val, y_val)


In [None]:
# Show the best hyper-parameters and cross-validated score found on the validation base (safra 2025-01)
print("Melhores hiper-parâmetros RL:", grid_RL.best_params_)
print("Melhor score (AUC) RL: ", grid_RL.best_score_)

In [None]:
# Apply the best hyper-parameters found by the grid search to the pipeline
pipeline_RL.set_params(**grid_RL.best_params_)

# Refit on train + val concatenated (safras 2024-10 to 2025-01)
pipeline_RL.fit(X_train_final, y_train_final)

#### Grid Search e Treino com HPs escolhidos (Treino + Val) - LightGBM 

In [None]:
# Candidate values for the number of trees and the maximum tree depth
param_grid_LGBM = {
    "model__n_estimators": [250, 500],
    "model__max_depth": [4, 7]
}

# 4-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=42
)

# Grid search scored by ROC AUC; a failing fit raises instead of being scored
grid_LGBM = GridSearchCV(
    pipeline_LGBM,
    param_grid=param_grid_LGBM,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=3,
    error_score="raise"
)

# Fit on the validation set only (hyper-parameter selection)
grid_LGBM.fit(X_val, y_val)

In [None]:
# Show the best hyper-parameters and cross-validated score found on the validation base (safra 2025-01)
print("Melhores hiper-parâmetros LGBM:", grid_LGBM.best_params_)
print("Melhor score (AUC) LGBM: ", grid_LGBM.best_score_)

In [None]:
# Apply the best hyper-parameters found by the grid search to the pipeline
pipeline_LGBM.set_params(**grid_LGBM.best_params_)

# Refit on train + val concatenated (safras 2024-10 to 2025-01)
pipeline_LGBM.fit(X_train_final, y_train_final)

## 7. Avaliação por Safra (AUC e KS)

#### Processo realizado nas bases de treino, val e OOT (individuais e consolidadas)

In [None]:
# Select the rows of X and y that belong to the requested safras
def filter_xy_by_safra(X, y, list_safras):
    """Filter features and target down to the given safras.

    Args:
        X: Feature DataFrame containing a SAFRA column.
        y: Target Series sharing the index of ``X``.
        list_safras: Safra values to keep.

    Returns:
        Tuple ``(X_f, y_f)`` with the filtered features and the target rows
        that have the same index labels.
    """
    in_requested_safras = X["SAFRA"].isin(list_safras)
    X_f = X.loc[in_requested_safras]
    return X_f, y.loc[X_f.index]

In [None]:
# KS (Kolmogorov-Smirnov) statistic between the score distributions of goods and bads
def ks_stat(y_true, y_score):
    """Return the maximum gap between the cumulative score distributions.

    The cumulative shares of bads (``y == 1``) and goods (``y == 0``) are
    compared only at the end of each run of equal scores. Rows that share a
    score cannot be separated by any threshold, so comparing inside such a run
    would make the result depend on the arbitrary order of the tied rows.

    Args:
        y_true: Binary target values (0 = good, 1 = bad).
        y_score: Scores aligned by position with ``y_true``.

    Returns:
        The KS statistic as a float in [0, 1], or NaN when the input is empty
        or contains a single class.
    """
    y = np.asarray(y_true, dtype=float)
    p = np.asarray(y_score, dtype=float)

    n_bad = y.sum()
    n_good = (1 - y).sum()
    if y.size == 0 or n_bad == 0 or n_good == 0:
        return float("nan")

    order = np.argsort(p, kind="mergesort")
    y_sorted = y[order]
    p_sorted = p[order]

    cum_bad = np.cumsum(y_sorted) / n_bad
    cum_good = np.cumsum(1 - y_sorted) / n_good

    # Position of the last row of every distinct score value
    last_of_score = np.append(np.nonzero(np.diff(p_sorted))[0], p_sorted.size - 1)

    return float(np.max(np.abs(cum_bad[last_of_score] - cum_good[last_of_score])))

# Compute AUC and KS of a fitted pipeline on a given base
def evaluation_auc_ks(X, y, pipe, name_evaluation="", verbose=True):
    """Score ``X`` with ``pipe`` and return the rounded AUC and KS.

    Args:
        X: Features to score.
        y: True binary target.
        pipe: Fitted estimator exposing ``predict_proba``.
        name_evaluation: Label printed in the report header.
        verbose: When True, print the metrics.

    Returns:
        Tuple ``(auc, ks)``, both rounded to five decimal places.
    """
    scores = pipe.predict_proba(X)[:, 1]
    auc = round(roc_auc_score(y, scores), 5)
    ks = round(ks_stat(y, scores), 5)

    if not verbose:
        return auc, ks

    print(f"AVALIACAO NA(S) SAFRA(S) {name_evaluation}: ")
    print("AUC:", auc)
    print("KS :", ks)
    print("-" * 30)
    return auc, ks

# Log the per-safra evaluation metrics to MLflow
def log_safra_metrics_mlflow(model_name, dict_safras, generate_map_func, 
                              X_train, y_train, X_oos, y_oos, X_oot, y_oot, pipeline):
    """Log AUC and KS per safra to MLflow for a given model/pipeline.

    Args:
        model_name: Prefix used in the metric names.
        dict_safras: Mapping ``evaluation label -> list of safras``.
        generate_map_func: Callable that receives the six X/y bases and returns
            a mapping ``evaluation label -> {"X": ..., "Y": ...}``.
        X_train, y_train: Training base.
        X_oos, y_oos: Out-of-sample base.
        X_oot, y_oot: Out-of-time base.
        pipeline: Fitted pipeline to evaluate.

    Returns:
        Dict ``evaluation label -> {"AUC": auc, "KS": ks}``.
    """
    import re

    # The label -> base mapping does not change between iterations, so it is built once
    map_step_data = generate_map_func(X_train, y_train, X_oos, y_oos, X_oot, y_oot)

    results = {}
    for key, safras in dict_safras.items():
        X = map_step_data[key]["X"]
        y = map_step_data[key]["Y"]
        X_f, y_f = filter_xy_by_safra(X, y, safras)
        auc, ks = evaluation_auc_ks(X_f, y_f, pipeline, key)

        # Sanitize the label for MLflow. The replacements below keep the metric
        # names produced for labels that were already valid; the regex then
        # maps any other unsupported character (e.g. the backslash in
        # "TREINO \ VAL - 202501") to "_", since MLflow rejects such names.
        safe_key = key.replace(" ", "_").replace("/", "_").replace("(", "").replace(")", "")
        safe_key = re.sub(r"[^0-9A-Za-z_.\-]", "_", safe_key)
        mlflow.log_metric(f"{model_name}_AUC_{safe_key}", auc)
        mlflow.log_metric(f"{model_name}_KS_{safe_key}", ks)
        results[key] = {"AUC": auc, "KS": ks}

    return results

# Evaluation label -> safras used to filter the matching X/y base.
# The backslash in "TREINO \ VAL" is written as "\\": the key is the same string
# at runtime, without relying on an invalid escape sequence.
dict_safras = {
    "TREINO - 202410" : [202410],
    "TREINO - 202411" : [202411],
    "TREINO - 202412" : [202412],
    "TREINO \\ VAL - 202501" : [202501],
    "TREINO (CONS)" : [202410, 202411, 202412, 202501],
    "OOS - 202410" : [202410],
    "OOS - 202411" : [202411],
    "OOS - 202412" : [202412],
    "OOS - 202501" : [202501],
    "OOS (CONS)" : [202410, 202411, 202412, 202501],
    "OOT - 202502" : [202502],
    "OOT - 202503" : [202503],
    "OOT GERAL (CONS)" : [202502, 202503]
}

def generate_map_step_data(X_train, y_train, X_oos, y_oos, X_oot, y_oot):
    """Map every evaluation label to the X/y base it must be computed on.

    The labels are the keys of ``dict_safras``: "TREINO ..." labels use the
    training base, "OOS ..." the out-of-sample base and "OOT ..." the
    out-of-time base.

    Args:
        X_train, y_train: Training features and target.
        X_oos, y_oos: Out-of-sample features and target.
        X_oot, y_oot: Out-of-time features and target.

    Returns:
        Dict ``label -> {"X": features, "Y": target}``.
    """
    # "\\" yields the same runtime key as before without an invalid escape sequence
    labels_by_base = (
        (
            (
                "TREINO - 202410",
                "TREINO - 202411",
                "TREINO - 202412",
                "TREINO \\ VAL - 202501",
                "TREINO (CONS)",
            ),
            X_train,
            y_train,
        ),
        (
            (
                "OOS - 202410",
                "OOS - 202411",
                "OOS - 202412",
                "OOS - 202501",
                "OOS (CONS)",
            ),
            X_oos,
            y_oos,
        ),
        (
            (
                "OOT - 202502",
                "OOT - 202503",
                "OOT GERAL (CONS)",
            ),
            X_oot,
            y_oot,
        ),
    )

    map_step_data = {
        label: {"X": X, "Y": y}
        for labels, X, y in labels_by_base
        for label in labels
    }
    return map_step_data

### Avaliacao por safra inclui: Treino, OOS, OOT (individuais e consolidadas) + MLflow logging

In [None]:
### Evaluation on the train, val and OOT safras - Logistic Regression
with mlflow.start_run(run_name="LogisticRegression_Baseline") as run_rl:
    # Log the model parameters (read from the fitted estimator).
    # NOTE(review): n_features counts every column of X_train_final, including
    # NUM_CPF and SAFRA, which the preprocessing does not feed to the model.
    best_params_rl = pipeline_RL.named_steps["model"].get_params()
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("penalty", best_params_rl.get("penalty", "l1"))
    mlflow.log_param("solver", best_params_rl.get("solver", "saga"))
    mlflow.log_param("C", best_params_rl.get("C"))
    mlflow.log_param("max_iter", best_params_rl.get("max_iter"))
    mlflow.log_param("n_features", len(X_train_final.columns))
    mlflow.log_param("n_samples_train", len(X_train_final))
    
    # Log AUC/KS for every evaluation label of dict_safras
    print("Avaliacao das metricas da Regressao Logistica por base: ")
    results_rl = log_safra_metrics_mlflow(
        "RL", dict_safras, generate_map_step_data,
        X_train_final, y_train_final, X_oos_agg, y_oos_agg, X_oot_agg, y_oot_agg,
        pipeline_RL
    )
    
    # Log the fitted pipeline
    mlflow.sklearn.log_model(pipeline_RL, "model_logistic_regression")
    
    # Log the coefficients, sorted by absolute value, as a CSV artifact
    coefs = pipeline_RL.named_steps["model"].coef_[0]
    feature_names = pipeline_RL.named_steps["prep"].get_feature_names_out()
    df_coefs = pd.DataFrame({"feature": feature_names, "coef": coefs}).sort_values("coef", key=abs, ascending=False)
    df_coefs.to_csv("/tmp/lr_coefficients.csv", index=False)
    mlflow.log_artifact("/tmp/lr_coefficients.csv", "feature_analysis")
    
    print(f"\nMLflow Run ID (RL): {run_rl.info.run_id}")

In [None]:
### Evaluation on the train, val and OOT safras - LightGBM
with mlflow.start_run(run_name="LightGBM_Baseline") as run_lgbm:
    # Log the model parameters (read from the fitted estimator)
    best_params_lgbm = pipeline_LGBM.named_steps["model"].get_params()
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("n_estimators", best_params_lgbm.get("n_estimators"))
    mlflow.log_param("max_depth", best_params_lgbm.get("max_depth"))
    mlflow.log_param("learning_rate", best_params_lgbm.get("learning_rate"))
    mlflow.log_param("num_leaves", best_params_lgbm.get("num_leaves"))
    mlflow.log_param("boosting_type", best_params_lgbm.get("boosting_type", "gbdt"))
    mlflow.log_param("n_features", len(X_train_final.columns))
    mlflow.log_param("n_samples_train", len(X_train_final))
    
    # Log AUC/KS for every evaluation label of dict_safras
    print("Avaliacao das metricas do LightGBM por base: ")
    results_lgbm = log_safra_metrics_mlflow(
        "LGBM", dict_safras, generate_map_step_data,
        X_train_final, y_train_final, X_oos_agg, y_oos_agg, X_oot_agg, y_oot_agg,
        pipeline_LGBM
    )
    
    # Log the fitted pipeline
    mlflow.sklearn.log_model(pipeline_LGBM, "model_lightgbm")
    
    # Log the feature importances as a CSV artifact.
    # The preprocessing step of pipeline_LGBM is named "preprocess" (only the
    # logistic-regression pipeline calls it "prep"), so it is looked up by that name.
    lgbm_model = pipeline_LGBM.named_steps["model"]
    feature_names = pipeline_LGBM.named_steps["preprocess"].get_feature_names_out()
    df_importance = pd.DataFrame({
        "feature": feature_names,
        "importance": lgbm_model.feature_importances_
    }).sort_values("importance", ascending=False)
    df_importance.to_csv("/tmp/lgbm_feature_importance.csv", index=False)
    mlflow.log_artifact("/tmp/lgbm_feature_importance.csv", "feature_analysis")
    
    # Plot and log the top-30 feature importances
    fig, ax = plt.subplots(figsize=(10, 8))
    top_30 = df_importance.head(30)
    ax.barh(range(len(top_30)), top_30["importance"].values)
    ax.set_yticks(range(len(top_30)))
    ax.set_yticklabels(top_30["feature"].values, fontsize=8)
    ax.set_xlabel("Feature Importance")
    ax.set_title("Top 30 Features - LightGBM")
    ax.invert_yaxis()
    plt.tight_layout()
    fig.savefig("/tmp/lgbm_feature_importance.png", dpi=150)
    mlflow.log_artifact("/tmp/lgbm_feature_importance.png", "plots")
    plt.show()
    
    print(f"\nMLflow Run ID (LGBM): {run_lgbm.info.run_id}")

### Adicionar feature selection via LGBM em cada step (Considerando limiar minimo)

In [None]:
# Mensuracao do ganho de KS por fonte de dados

## Colunas por fonte - Etapas do Incremento das features
# Score 1 (Base Bureau) - Target Score 1
# Score 2 (Base Bureau) - Target Score 2
# Base Cadastros - STATUSRF, var_02 a var_25, UF, REGIAO
# Base Telco - var_26 a var_93
# Base Recargas - REC_...
# Base Pagamentos - PAG_...
# Base Faturamento - FAT_...

In [None]:
# Restrict the train and OOT feature frames to the columns of the current step
def filter_features(X_train, X_oot_agg, list_features):
    """Select ``list_features`` from both the training and the OOT features.

    Args:
        X_train: Training feature DataFrame.
        X_oot_agg: Out-of-time feature DataFrame.
        list_features: Column names to keep, in the desired order.

    Returns:
        Tuple ``(X_train_filtered, X_oot_agg_filtered)``.
    """
    return tuple(frame[list_features] for frame in (X_train, X_oot_agg))

In [None]:
# Label of the feature-aggregation step identified by step_num
def current_step(step_num):
    """Return the label of the cumulative feature set used at ``step_num``.

    Step 0 is "SC 1"; every following step appends one more source, joined
    by " + ", up to step 6.

    Args:
        step_num: Step index from 0 to 6.

    Returns:
        The label string.

    Raises:
        KeyError: If ``step_num`` is not a known step.
    """
    sources = ("SC 1", "SC 2", "CAD", "TELCO", "REC", "PAG", "FAT")
    dict_step = {
        step: " + ".join(sources[: step + 1])
        for step in range(len(sources))
    }
    return dict_step[step_num]

In [None]:
# Rebuild the model pipeline for the columns present in X
def update_pipeline(X, name_model):
    """Build an unfitted pipeline whose preprocessing matches the columns of ``X``.

    Numeric columns (except SAFRA) and object/category columns (except
    NUM_CPF) are detected from ``X``. ``name_model == "Reg Log"`` builds the
    logistic-regression pipeline using the best C found by ``grid_RL``; any
    other value builds the LightGBM pipeline using the best ``max_depth`` and
    ``n_estimators`` found by ``grid_LGBM``.

    Args:
        X: Feature DataFrame defining the available columns.
        name_model: "Reg Log" for logistic regression, anything else for LightGBM.

    Returns:
        A scikit-learn ``Pipeline`` with the steps "prep" and "model".
    """
    numeric_dtypes = ["int32", "int64", "float32", "float64"]
    num_features = [
        col for col in X.select_dtypes(include=numeric_dtypes).columns
        if col != "SAFRA"
    ]
    cat_features = [
        col for col in X.select_dtypes(include=["object", "category"]).columns
        if col != "NUM_CPF"
    ]

    numeric_steps = [("imputer", SimpleImputer(strategy="median"))]

    if name_model == "Reg Log":
        # The linear model also needs standardised numeric inputs
        numeric_steps.append(("scaler", StandardScaler()))
        encoder = CountEncoder(normalize=True)
        model = LogisticRegression(
            solver="liblinear",
            penalty="l1",
            max_iter=2000,
            C=grid_RL.best_params_["model__C"],
            tol=1e-3,
            class_weight="balanced",
            random_state=42
        )
    else:  # LightGBM
        encoder = CountEncoder(
            normalize=True,
            handle_unknown=0,
            handle_missing=0
        )
        model = LGBMClassifier(
            objective="binary",
            boosting_type="gbdt",
            learning_rate=0.05,
            max_depth=grid_LGBM.best_params_["model__max_depth"],
            n_estimators=grid_LGBM.best_params_["model__n_estimators"],
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        )

    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", encoder),
    ]

    preprocess = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), num_features),
            ("cat", Pipeline(steps=categorical_steps), cat_features),
        ],
        remainder="drop"
    )

    return Pipeline(steps=[
        ("prep", preprocess),
        ("model", model)
    ])

In [None]:
# Train and evaluate the models source by source
# Feature set of each source.
# NOTE(review): the "var_NN" split compares names as strings, which assumes
# zero-padded two-digit suffixes (var_02 ... var_93) — confirm against the schema.
feat_score_1 = ["TARGET_SCORE_01"]
feat_score_2 = ["TARGET_SCORE_02"]
feats_cadastro = [x for x in X_train_final.columns if "var_" in x and x <= "var_25"] + ["STATUSRF", "UF", "REGIAO", "DIAS_VAR_12"]
feats_telco = [x for x in X_train_final.columns if "var_" in x and x >= "var_26"]
feats_recargas = [x for x in X_train_final.columns if "REC_" in x]
feats_pagamentos = [x for x in X_train_final.columns if "PAG_" in x]
feats_faturamento = [x for x in X_train_final.columns if "FAT_" in x]

# Training loop that adds the features of one source at a time
list_sources = [feat_score_1, feat_score_2, feats_cadastro, feats_telco, feats_recargas, feats_pagamentos, feats_faturamento]
list_features = ["NUM_CPF", "SAFRA"] # Kept only to filter the data; they are not model features
list_dict_results = []
list_models = ["Reg Log", "LGBM"]

# Iterate over the sources
for idx, source in enumerate(list_sources):
    list_features.extend(source) # Cumulative list of columns used at this step
    conj_features = current_step(idx)

    # Restrict every base to the current group of features.
    # filter_features only handles train and OOT, so OOS is filtered here.
    X_train_final_filtered, X_oot_agg_filtered = filter_features(X_train_final, X_oot_agg, list_features)
    X_oos_agg_filtered = X_oos_agg[list_features]

    # generate_map_step_data requires all six bases (train, OOS and OOT); it is
    # built once per step because it does not depend on the model.
    map_step_data = generate_map_step_data(
        X_train_final_filtered, y_train_final,
        X_oos_agg_filtered, y_oos_agg,
        X_oot_agg_filtered, y_oot_agg
    )

    # Train and evaluate each model on the current feature set
    for model in list_models:

        # Rebuild the pipeline so its column lists match the current features
        pipeline_ks_inc = update_pipeline(X_train_final_filtered, name_model = model)

        # Fit on the current list of features
        pipeline_ks_inc.fit(X_train_final_filtered, y_train_final)

        for key, safras in dict_safras.items():
            X = map_step_data[key]["X"]
            y = map_step_data[key]["Y"]
            X_f , y_f = filter_xy_by_safra(X, y, safras)
            auc, ks = evaluation_auc_ks(X_f, y_f, pipeline_ks_inc, key, verbose=False)
            dict_result = {
                "MODELO" : model,
                "CONJ FEATURES" : conj_features,
                "BASE" : key,
                "AUC" : auc,
                "KS" : ks
            }
            list_dict_results.append(dict_result)

# DataFrame with the results of every evaluation
df_results_ks_inc = pd.DataFrame(list_dict_results)
df_results_ks_inc

In [None]:
# Evolution of AUC and KS on the consolidated train + val base for each feature group
df_results_ks_inc[df_results_ks_inc["BASE"] == "TREINO (CONS)"]

In [None]:
# Evolution of AUC and KS on the consolidated OOT base for each feature group
df_results_ks_inc[df_results_ks_inc["BASE"] == "OOT GERAL (CONS)"]

In [None]:
# The "Cadastro" source is left out: per the author's analysis it brought little gain / a loss of AUC and KS
list_sources_final = [feat_score_1, feat_score_2, feats_telco,feats_recargas, feats_pagamentos, feats_faturamento]

# Flatten the per-source lists into a single list of columns
list_source_final_columns = []
for x in list_sources_final:
    list_source_final_columns.extend(x)

# Training features restricted to the selected sources (NUM_CPF and SAFRA are not included)
X_train_final_filtered_sources = X_train_final[list_source_final_columns]

In [None]:
# Feature Selection - Univariada (IV)
def calculate_iv_numericals(df, feature, target, total_good, total_bad, bins=10):
    """Compute the Information Value (IV) of a numeric feature against a binary target.

    The feature is cut into quantile bins with ``pd.qcut`` (duplicate edges are
    dropped). Rows whose feature value is missing form one additional bin, so
    the per-bin counts cover the same rows that ``total_good`` / ``total_bad``
    were counted on. The IV is the sum over bins of
    ``(dist_bad - dist_good) * ln(dist_bad / dist_good)``.

    Args:
        df: DataFrame containing the ``feature`` and ``target`` columns.
        feature: Name of the numeric column to evaluate.
        target: Name of the binary target column (0 counted as good, 1 as bad).
        total_good: Number of rows with target == 0 (denominator for goods).
        total_bad: Number of rows with target == 1 (denominator for bads).
        bins: Number of quantile bins requested from ``pd.qcut``.

    Returns:
        The IV of the feature. 0.0 when the feature has fewer than two distinct
        non-missing values or when ``pd.qcut`` cannot bin it.
    """
    s = df[feature]

    if s.nunique() < 2:
        return 0.0

    try:
        binned = pd.qcut(s, q=bins, duplicates="drop")
    except ValueError:
        return 0.0

    # Categorical codes: 0..k-1 for the quantile bins and -1 for missing values.
    # Grouping on the codes keeps the missing rows as their own bin instead of
    # silently dropping them from the numerators.
    bin_codes = binned.cat.codes.to_numpy()

    grouped = (
        df[target]
        .groupby(bin_codes)
        .value_counts()
        .unstack(fill_value=0)
    )

    # 0.5 is added to every count so an empty cell never yields a zero ratio or log(0)
    good = grouped.get(0, 0) + 0.5
    bad = grouped.get(1, 0) + 0.5

    dist_good = good / total_good
    dist_bad = bad / total_bad

    iv = ((dist_bad - dist_good) * np.log(dist_bad / dist_good)).sum()
    return iv


def calculate_iv_categoricals(df, feature, target, total_good, total_bad):
    """Compute the Information Value (IV) of a categorical feature against a binary target.

    Every distinct value of the feature (as a string) is one group; all missing
    values (``None`` / ``NaN``) are pooled into a single ``"MISSING"`` group.
    The IV is the sum over groups of
    ``(dist_bad - dist_good) * ln(dist_bad / dist_good)``.

    Args:
        df: DataFrame containing the ``feature`` and ``target`` columns.
        feature: Name of the categorical column to evaluate.
        target: Name of the binary target column (0 counted as good, 1 as bad).
        total_good: Number of rows with target == 0 (denominator for goods).
        total_bad: Number of rows with target == 1 (denominator for bads).

    Returns:
        The IV of the feature.
    """
    raw = df[feature].astype("object")
    # Label missing values BEFORE the string cast: casting first turns NaN/None
    # into the literal strings "nan"/"None", which fillna can no longer detect
    # and which would split one missing group into two.
    s = raw.where(raw.notna(), "MISSING").astype("str")

    grouped = (
        df[target]
        .groupby(s.to_numpy())
        .value_counts()
        .unstack(fill_value=0)
    )

    # 0.5 is added to every count so an empty cell never yields a zero ratio or log(0)
    good = grouped.get(0, 0) + 0.5
    bad = grouped.get(1, 0) + 0.5

    dist_good = good / total_good
    dist_bad = bad / total_bad

    iv = ((dist_bad - dist_good) * np.log(dist_bad / dist_good)).sum()
    return iv


# Features and target side by side; the class totals are the denominators
# shared by every IV computation below
concated_data = pd.concat([X_train_final_filtered_sources, y_train_final], axis=1)
df_iv = concated_data.copy()
total_good = df_iv["FPD"].eq(0).sum()
total_bad = df_iv["FPD"].eq(1).sum()

# Split the source-filtered columns into numeric and categorical ones
# (SAFRA and NUM_CPF are left out of the IV computation)
num_columns = [
    name
    for name in X_train_final_filtered_sources.select_dtypes(
        include=["int32", "int64", "float32", "float64"]
    ).columns
    if name != "SAFRA"
]
cat_columns = [
    name
    for name in X_train_final_filtered_sources.select_dtypes(
        include=["object", "category"]
    ).columns
    if name != "NUM_CPF"
]

# IV per feature: numeric columns first, then categorical ones
dict_ivs = {}
for iv_columns, iv_function in (
    (num_columns, calculate_iv_numericals),
    (cat_columns, calculate_iv_categoricals),
):
    for column in iv_columns:
        dict_ivs[column] = iv_function(df_iv, column, "FPD", total_good, total_bad)

# IV cut-off: features strictly below it are flagged for removal
iv_min = 0.02

df_ivs = pd.DataFrame(
    {"feature": list(dict_ivs.keys()), "IV": list(dict_ivs.values())}
).sort_values("IV", ascending=False)

df_ivs_filtered = df_ivs.loc[df_ivs["IV"].lt(iv_min)]
features_to_drop_iv = df_ivs_filtered["feature"].unique()
df_ivs_filtered

In [None]:
# Feature Selection - L1 coefficients (Logistic Regression)
# Fit the "Reg Log" pipeline on the source-filtered features (Cadastro excluded)
# so its coefficients can be inspected.
# NOTE(review): assumes update_pipeline configures "Reg Log" with an L1 penalty - confirm
pipeline_coefs = update_pipeline(X_train_final_filtered_sources, name_model="Reg Log")
pipeline_coefs.fit(X_train_final_filtered_sources, y_train_final)

# Pair every column emitted by the "prep" step with its fitted coefficient.
# NOTE(review): these are the preprocessor's OUTPUT names; if "prep" prefixes or
# expands columns they will not equal the raw column names - verify before using
# them to filter raw columns.
feature_names_coefs = pipeline_coefs.named_steps["prep"].get_feature_names_out()
coefs = pipeline_coefs.named_steps["model"].coef_[0]

df_coefs = (
    pd.DataFrame({"feature": feature_names_coefs, "coef": coefs})
    .assign(abs_coef=lambda frame: frame["coef"].abs())
    .sort_values("abs_coef", ascending=False)
)

# Features whose coefficient is exactly zero
features_to_drop_coefs = df_coefs.loc[df_coefs["abs_coef"].eq(0), "feature"].tolist()

print(f"Features com coeficiente L1 = 0: {len(features_to_drop_coefs)}")
print(f"Features restantes: {len(feature_names_coefs) - len(features_to_drop_coefs)}")
df_coefs.head(20)

In [None]:
# Feature Selection - drop features with high pairwise correlation (> 0.95)
corr_threshold = 0.95
num_cols_for_corr = list(
    X_train_final_filtered_sources.select_dtypes(
        include=["int32", "int64", "float32", "float64"]
    ).columns
)

corr_matrix = X_train_final_filtered_sources[num_cols_for_corr].corr().abs()

# Keep only the strict upper triangle so each pair of features is looked at once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is flagged when it exceeds the threshold against any column placed before it
exceeds_threshold = (upper > corr_threshold).any(axis=0)
features_to_drop_corrs = exceeds_threshold[exceeds_threshold].index.tolist()

print(f"Features com correlacao > {corr_threshold}: {len(features_to_drop_corrs)}")
if features_to_drop_corrs:
    print(f"  Removidas: {features_to_drop_corrs[:10]}{'...' if len(features_to_drop_corrs) > 10 else ''}")

In [None]:
# Feature Selection - LGBM feature importance
pipeline_lgbm_fi = update_pipeline(X_train_final_filtered_sources, name_model="LGBM")
pipeline_lgbm_fi.fit(X_train_final_filtered_sources, y_train_final)

# Pair every column emitted by the "prep" step with the importance of the fitted model
lgbm_model_fi = pipeline_lgbm_fi.named_steps["model"]
feature_names_lgbm = pipeline_lgbm_fi.named_steps["prep"].get_feature_names_out()

df_lgbm_importance = pd.DataFrame(
    {"feature": feature_names_lgbm, "importance": lgbm_model_fi.feature_importances_}
)
df_lgbm_importance = df_lgbm_importance.sort_values("importance", ascending=False)

# Names of the TOP_N_LGBM most important features
TOP_N_LGBM = 70
top_lgbm_features = df_lgbm_importance["feature"].head(TOP_N_LGBM).tolist()

print(f"LGBM Feature Importance - Top {TOP_N_LGBM} features selecionadas")
print(f"Importance cumulativa do top {TOP_N_LGBM}: {df_lgbm_importance.head(TOP_N_LGBM)['importance'].sum() / df_lgbm_importance['importance'].sum():.1%}")

# Horizontal bar chart of the 30 most important features, most important at the top
top_30_fi = df_lgbm_importance.head(30)
bar_positions = range(len(top_30_fi))
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_30_fi["importance"].values)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_30_fi["feature"].values, fontsize=8)
ax.invert_yaxis()
ax.set_xlabel("Feature Importance (LGBM)")
ax.set_title("Top 30 Features - LGBM Feature Importance")
plt.tight_layout()
plt.show()

In [None]:
# Final feature selection - drop the features flagged by IV, zero coefficients and high correlation
total_features_to_drop = (
    set(features_to_drop_iv) | set(features_to_drop_coefs) | set(features_to_drop_corrs)
)

# Keep the original column order (deduplicated). A plain set difference yields an
# arbitrary order that can change between interpreter runs, which would make the
# column order of the training matrix non-reproducible.
final_set_features = [
    col
    for col in dict.fromkeys(list_source_final_columns)
    if col not in total_features_to_drop
]

print(f"Drop de features IV : {len(features_to_drop_iv)}")
print(f"Drop de features Coefs RegLog: {len(features_to_drop_coefs)}")
print(f"Drop de features Alta Correlacao : {len(features_to_drop_corrs)}")
print(f"Total de features a dropar : {len(total_features_to_drop)}")
print(f"Total de colunas pos remocoes iniciais (Cadastros) : {len(list_source_final_columns)}")
# Report the size of the list actually kept: the drop set may hold names that are
# not source columns, so subtracting the two lengths can under-count.
print(f"Total de features restantes : {len(final_set_features)}")

In [None]:
# Train and evaluate the final models on the selected features, logging each one to MLflow
dimensions = ["NUM_CPF", "SAFRA"]
final_set_features_with_dims = final_set_features + [d for d in dimensions if d not in final_set_features]
models = ["Reg Log", "LGBM"]
list_dict_results_feat_selection = []

# Restrict the train, OOT and OOS matrices to the selected features plus the dimension columns
X_train_final_feat_selection, X_oot_agg_feat_selection = filter_features(X_train_final, X_oot_agg, final_set_features_with_dims)
_, X_oos_agg_feat_selection = filter_features(X_train_final, X_oos_agg, final_set_features_with_dims)

# Built once: every argument is fixed before the loops start, so rebuilding the
# mapping for each (model, base) pair only repeated the same work.
# NOTE(review): assumes generate_map_step_data is a pure function of its arguments - confirm
map_step_data = generate_map_step_data(
    X_train_final_feat_selection, y_train_final,
    X_oos_agg_feat_selection, y_oos_agg,
    X_oot_agg_feat_selection, y_oot_agg
)

# Best model so far, ranked by KS on the consolidated OOT base
best_model_name = None
best_model_pipeline = None
best_ks_oot = 0

for model in models:
    run_name = f"Final_{model.replace(' ', '')}_FeatureSelection"

    with mlflow.start_run(run_name=run_name) as run:
        # Rebuild the pipeline so its expected column types match the filtered data
        pipeline_feat_selection = update_pipeline(X_train_final_feat_selection, name_model=model)

        # Fit on the filtered feature list
        pipeline_feat_selection.fit(X_train_final_feat_selection, y_train_final)

        # Log run-level parameters
        mlflow.log_param("model_type", model)
        mlflow.log_param("n_features", len(final_set_features))
        mlflow.log_param("feature_selection", "IV + L1_coefs + high_corr")

        # Log only the estimator parameters that hold plain scalar values
        model_params = pipeline_feat_selection.named_steps["model"].get_params()
        for k, v in model_params.items():
            if isinstance(v, (int, float, str, bool)):
                mlflow.log_param(f"model__{k}", v)

        # Evaluation per base listed in dict_safras
        print(f"\n{'='*60}")
        print(f"MODELO FINAL: {model} ({len(final_set_features)} features)")
        print(f"{'='*60}")

        for key, safras in dict_safras.items():
            X = map_step_data[key]["X"]
            y = map_step_data[key]["Y"]
            X_f, y_f = filter_xy_by_safra(X, y, safras)
            auc, ks = evaluation_auc_ks(X_f, y_f, pipeline_feat_selection, key, verbose=True)

            # Metric names: spaces and slashes become underscores, parentheses are removed
            safe_key = key.replace(" ", "_").replace("/", "_").replace("(", "").replace(")", "")
            mlflow.log_metric(f"AUC_{safe_key}", auc)
            mlflow.log_metric(f"KS_{safe_key}", ks)

            dict_result = {
                "MODEL": model,
                "BASE": key,
                "AUC": auc,
                "KS": ks
            }
            list_dict_results_feat_selection.append(dict_result)

            # Track the model with the highest KS on the consolidated OOT base
            if key == "OOT GERAL (CONS)" and ks > best_ks_oot:
                best_ks_oot = ks
                best_model_name = model
                best_model_pipeline = pipeline_feat_selection

        # Log the fitted pipeline as a model artifact of this run
        mlflow.sklearn.log_model(pipeline_feat_selection, f"model_final_{model.replace(' ', '_').lower()}")

        print(f"\nMLflow Run ID ({model}): {run.info.run_id}")

# DataFrame with one row per (model, base) evaluation
df_results_feat_selection = pd.DataFrame(list_dict_results_feat_selection)

print(f"\n{'='*60}")
print(f"MELHOR MODELO (por KS OOT): {best_model_name} — KS OOT = {best_ks_oot:.5f}")
print(f"Benchmark KS: 33.1% (0.331)")
print(f"{'='*60}")

df_results_feat_selection

## 8. Swap-in / Swap-out

In [None]:
# Swap-in / Swap-out Analysis
# Compara a ordenacao de clientes entre safras OOT usando o melhor modelo

def swap_analysis(df_ref, df_new, score_col="score", target_col="FPD", top_pct=0.1):
    """Compute swap-in / swap-out percentages between the top slices of two rankings.

    The cut size ``n`` is ``int(len(df_ref) * top_pct)``. The ``n`` highest-scored
    rows of each frame are selected and compared by index label: labels present
    only in the new top are swap-ins, labels present only in the reference top
    are swap-outs.

    Args:
        df_ref: Reference DataFrame; its length defines the cut size.
        df_new: DataFrame with the ranking being compared against the reference.
        score_col: Column holding the score used for ranking.
        target_col: Column holding the target; when absent from a frame, the
            corresponding default rate is reported as ``None``.
        top_pct: Fraction of ``df_ref`` rows that forms the top slice.

    Returns:
        dict with keys ``"swap_in_%"``, ``"swap_out_%"``, ``"n_top"``,
        ``"default_rate_ref"`` and ``"default_rate_new"``. All five keys are
        always present; when the cut size is zero the percentages are 0 and the
        default rates are ``None``.
    """
    n = int(len(df_ref) * top_pct)
    if n == 0:
        # Same keys as the regular result, so callers can index it unconditionally
        return {
            "swap_in_%": 0,
            "swap_out_%": 0,
            "n_top": 0,
            "default_rate_ref": None,
            "default_rate_new": None,
        }

    ref_top = df_ref.nlargest(n, score_col)
    new_top = df_new.nlargest(n, score_col)

    ref_labels = set(ref_top.index)
    new_labels = set(new_top.index)
    swap_in = len(new_labels - ref_labels)
    swap_out = len(ref_labels - new_labels)

    # Mean of the target inside each top slice (used to quantify the gain)
    default_rate_ref = ref_top[target_col].mean() if target_col in ref_top.columns else None
    default_rate_new = new_top[target_col].mean() if target_col in new_top.columns else None

    return {
        "swap_in_%": round(swap_in / n * 100, 2),
        "swap_out_%": round(swap_out / n * 100, 2),
        "n_top": n,
        "default_rate_ref": round(default_rate_ref, 4) if default_rate_ref is not None else None,
        "default_rate_new": round(default_rate_new, 4) if default_rate_new is not None else None,
    }

# Score the OOT base with the best model selected above
print(f"Modelo utilizado: {best_model_name}")
print(f"{'='*60}")

# Working copies of the OOT features and target
X_oot_feat = X_oot_agg_feat_selection.copy()
y_oot_feat = y_oot_agg.copy()

# One row per OOT record: safra, predicted probability of class 1 and observed FPD
scores_oot = best_model_pipeline.predict_proba(X_oot_feat)[:, 1]
df_swap = X_oot_feat[["SAFRA"]].assign(score=scores_oot, FPD=y_oot_feat.values)

safras_oot = sorted(df_swap["SAFRA"].unique())
print(f"Safras OOT disponíveis: {safras_oot}")

# Swap-in / swap-out between the first two OOT safras, at several top cuts
if len(safras_oot) < 2:
    print("Apenas 1 safra OOT disponivel - swap analysis nao aplicavel")
else:
    # NOTE(review): both frames get a fresh positional index, and swap_analysis
    # compares index labels - so row positions are compared across two different
    # safras, not the same customers. Confirm this is the intended comparison.
    df_ref = df_swap[df_swap["SAFRA"] == safras_oot[0]].reset_index(drop=True)
    df_new = df_swap[df_swap["SAFRA"] == safras_oot[1]].reset_index(drop=True)

    for top_pct in [0.05, 0.10, 0.20, 0.30]:
        swap = swap_analysis(df_ref, df_new, top_pct=top_pct)
        print(f"\nTop {top_pct:.0%} (n={swap['n_top']}):")
        print(f"  Swap-in:  {swap['swap_in_%']:.1f}%")
        print(f"  Swap-out: {swap['swap_out_%']:.1f}%")
        print(f"  Default Rate OOT1 ({safras_oot[0]}): {swap['default_rate_ref']}")
        print(f"  Default Rate OOT2 ({safras_oot[1]}): {swap['default_rate_new']}")

In [None]:
# Final visualizations: KS curve, score distribution, confusion matrix and KS per base

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# --- 1. KS Curve (CDF) on the consolidated OOT base ---
ax1 = axes[0, 0]
X_oot_vis = X_oot_agg_feat_selection.copy()
y_oot_vis = y_oot_agg.values
scores_oot_vis = best_model_pipeline.predict_proba(X_oot_vis)[:, 1]

# Rows ordered by ascending score; cumulative shares of goods and bads along that order
df_ks = pd.DataFrame({"y": y_oot_vis, "score": scores_oot_vis}).sort_values("score")
good_flags = 1 - df_ks["y"]
df_ks["cum_good"] = good_flags.cumsum() / good_flags.sum()
df_ks["cum_bad"] = df_ks["y"].cumsum() / df_ks["y"].sum()
df_ks["ks_diff"] = (df_ks["cum_bad"] - df_ks["cum_good"]).abs()

# Row where the two cumulative curves are furthest apart
ks_max_idx = df_ks["ks_diff"].idxmax()
ks_max_val = df_ks.loc[ks_max_idx, "ks_diff"]
ks_max_score = df_ks.loc[ks_max_idx, "score"]

# x axis: position in the score-ordered population, scaled to [0, 1]
x_axis = np.linspace(0, 1, len(df_ks))
for cdf_col, cdf_label, cdf_color in (
    ("cum_good", "Bons (FPD=0)", "blue"),
    ("cum_bad", "Maus (FPD=1)", "red"),
):
    ax1.plot(x_axis, df_ks[cdf_col].values, label=cdf_label, color=cdf_color)

# Vertical marker at the position of the maximum gap
ks_max_position = df_ks.index.get_loc(ks_max_idx)
ax1.axvline(x=x_axis[ks_max_position], color="green", linestyle="--", alpha=0.7)
ax1.set_title(f"KS Curve - OOT ({best_model_name}) | KS = {ks_max_val:.4f}")
ax1.set_xlabel("Populacao (%)")
ax1.set_ylabel("CDF")
ax1.legend()
ax1.grid(True, alpha=0.3)

# --- 2. Score distribution split by FPD ---
ax2 = axes[0, 1]
mask_good = y_oot_vis == 0
mask_bad = y_oot_vis == 1

# One density histogram per class, drawn goods first and bads on top
for class_mask, class_label, class_color in (
    (mask_good, "Bons (FPD=0)", "blue"),
    (mask_bad, "Maus (FPD=1)", "red"),
):
    ax2.hist(
        scores_oot_vis[class_mask],
        bins=50,
        alpha=0.6,
        label=class_label,
        color=class_color,
        density=True,
    )

ax2.set_title(f"Distribuicao de Scores - OOT ({best_model_name})")
ax2.set_xlabel("Score (Probabilidade de FPD)")
ax2.set_ylabel("Densidade")
ax2.legend()
ax2.grid(True, alpha=0.3)

# --- 3. Confusion Matrix (threshold = 0.5) ---
ax3 = axes[1, 0]
y_pred_oot = (scores_oot_vis >= 0.5).astype(int)
# labels=[0, 1] pins the matrix to 2x2 even when only one class shows up in the
# data, so the fixed tick labels and the cell annotations below always line up
cm = confusion_matrix(y_oot_vis, y_pred_oot, labels=[0, 1])
im = ax3.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax3.set_title(f"Confusion Matrix - OOT ({best_model_name})")
ax3.set_ylabel("Real")
ax3.set_xlabel("Predito")
ax3.set_xticks([0, 1])
ax3.set_yticks([0, 1])
ax3.set_xticklabels(["Bom (0)", "Mau (1)"])
ax3.set_yticklabels(["Bom (0)", "Mau (1)"])

# Write each count in its cell; white text on the darker cells keeps it readable
cell_color_cut = cm.max() / 2
for (i, j), count in np.ndenumerate(cm):
    ax3.text(j, i, f"{count:,}", ha="center", va="center",
             color="white" if count > cell_color_cut else "black", fontsize=12)

# --- 4. KS per base (bar plot) ---
ax4 = axes[1, 1]

# Rows of the best model whose base name mentions OOT or OOS
is_best_model = df_results_feat_selection["MODEL"] == best_model_name
is_oot_or_oos = df_results_feat_selection["BASE"].str.contains("OOT|OOS")
df_oot_results = df_results_feat_selection[is_best_model & is_oot_or_oos]

if not df_oot_results.empty:
    ks_values = df_oot_results["KS"].values
    bar_slots = range(len(df_oot_results))

    bars = ax4.bar(bar_slots, ks_values, color="steelblue")
    ax4.set_xticks(bar_slots)
    ax4.set_xticklabels(df_oot_results["BASE"].values, rotation=45, ha="right", fontsize=8)
    # Horizontal reference line at the benchmark KS
    ax4.axhline(y=0.331, color="red", linestyle="--", label="Benchmark KS = 33.1%")
    ax4.set_title(f"KS por Base - {best_model_name}")
    ax4.set_ylabel("KS")
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    # Print the KS value just above each bar
    for bar, val in zip(bars, ks_values):
        label_x = bar.get_x() + bar.get_width()/2
        ax4.text(label_x, val + 0.005, f"{val:.3f}", ha="center", fontsize=8)

plt.tight_layout()

# Save the figure to disk and log it to MLflow under the "plots" artifact folder.
# NOTE(review): no `with mlflow.start_run(...)` block is open at this point in the
# visible code - confirm which run receives this artifact.
visualizations_path = "/tmp/final_model_visualizations.png"
fig.savefig(visualizations_path, dpi=150, bbox_inches="tight")
mlflow.log_artifact(visualizations_path, "plots")
plt.show()

# Closing summary: chosen model, its OOT KS and the benchmark
print(f"\nModelo final: {best_model_name}")
print(f"KS OOT: {best_ks_oot:.5f}")
print(f"Benchmark: 0.331")