# Scoring Batch Pipeline - Credit Risk FPD

**Story**: HD-3.2 / Fase 4.3 — Deploy
**Objetivo**: Carregar modelo do MLflow Registry e aplicar scoring sobre novas SAFRAs
**Output**: `Gold.feature_store.clientes_scores` (particionado por SAFRA)

**Uso**: Parametrizar `SCORING_SAFRAS` e executar todas as celulas sequencialmente.

In [None]:
import sys
sys.path.insert(0, "/lakehouse/default/Files/projeto-final")

import logging
import json
from datetime import datetime

import numpy as np
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType, StructField

from config.pipeline_config import (
    PATH_FEATURE_STORE, GOLD_BASE, EXPERIMENT_NAME,
    SPARK_BROADCAST_THRESHOLD, SPARK_SHUFFLE_PARTITIONS, SPARK_AQE_ENABLED,
)

# Notebook-wide logging: INFO level, messages prefixed with a time-of-day stamp.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Named logger used by the cells of this scoring pipeline.
logger = logging.getLogger("scoring_batch")
logger.info("Imports OK")

In [None]:
# =============================================================================
# SCORING PARAMETERS — adjust before running
# =============================================================================

# Number of risk bands (5 = quintiles).
N_FAIXAS = 5

# SAFRAs to score, as a list of YYYYMM integers.
SCORING_SAFRAS = [202502, 202503]

# Model to use: name registered in the MLflow Model Registry and the stage to
# resolve it from.
MODEL_STAGE = "Production"  # or "Staging" for tests
MODEL_NAME = "credit-risk-fpd-lgbm_baseline"

# Output location: <GOLD_BASE>/Tables/<schema>/<table>.
SCHEMA_SCORES, TABLE_SCORES = "feature_store", "clientes_scores"
PATH_SCORES = "/".join([f"{GOLD_BASE}", "Tables", SCHEMA_SCORES, TABLE_SCORES])

# Echo the effective parameters so each run is traceable from the logs.
logger.info("Modelo: %s (%s)", MODEL_NAME, MODEL_STAGE)
logger.info("SAFRAs para scoring: %s", SCORING_SAFRAS)
logger.info("Output: %s", PATH_SCORES)

In [None]:
# =============================================================================
# SPARK CONFIG
# =============================================================================
# Session settings, applied in declaration order. Every value is handed to
# Spark as a string; the AQE flag is lower-cased ("true"/"false").
_spark_settings = {
    "spark.sql.autoBroadcastJoinThreshold": str(SPARK_BROADCAST_THRESHOLD),
    "spark.sql.adaptive.enabled": str(SPARK_AQE_ENABLED).lower(),
    "spark.sql.shuffle.partitions": str(SPARK_SHUFFLE_PARTITIONS),
    "spark.sql.sources.partitionOverwriteMode": "dynamic",
}
for _conf_key, _conf_value in _spark_settings.items():
    spark.conf.set(_conf_key, _conf_value)

logger.info("Spark config OK (AQE=%s, shuffle=%d)", SPARK_AQE_ENABLED, SPARK_SHUFFLE_PARTITIONS)

In [None]:
# =============================================================================
# LOAD MODEL FROM THE MLFLOW REGISTRY
# =============================================================================
import glob

client = MlflowClient()

# Resolve the model version currently assigned to the requested stage.
model_versions = client.get_latest_versions(MODEL_NAME, stages=[MODEL_STAGE])
if not model_versions:
    raise RuntimeError(
        f"Nenhuma versao encontrada para '{MODEL_NAME}' no stage '{MODEL_STAGE}'. "
        f"Registre o modelo primeiro via export_model.py"
    )

mv = model_versions[0]
logger.info("Modelo encontrado: %s v%s (run_id=%s)", mv.name, mv.version, mv.run_id)

# Load the model as pyfunc, pinned to the exact version resolved above.
# Loading by stage would trigger a second, independent registry lookup, so a
# stage transition between the two calls could make the loaded model differ
# from the version/run_id that is logged and written alongside the scores.
model_uri = f"models:/{MODEL_NAME}/{mv.version}"
model = mlflow.pyfunc.load_model(model_uri)
logger.info("Modelo carregado: %s", model_uri)

# Download the artifacts of the run that produced this version; the feature
# list is read from a "*metadata*.json" file expected at the artifact root.
artifacts_path = client.download_artifacts(mv.run_id, "")

# Sorted so the chosen file is deterministic when more than one matches.
metadata_files = sorted(glob.glob(f"{artifacts_path}/*metadata*.json"))
if not metadata_files:
    # There is no fallback source for the feature names: fail explicitly.
    raise RuntimeError(
        "Feature names nao disponiveis. Execute export_model.py primeiro para gerar metadata."
    )

with open(metadata_files[0], encoding="utf-8") as f:
    model_metadata = json.load(f)

FEATURE_NAMES = model_metadata.get("feature_names")
if not FEATURE_NAMES:
    raise RuntimeError(
        f"Metadata '{metadata_files[0]}' nao contem 'feature_names'. "
        "Execute export_model.py novamente para gerar metadata valido."
    )
logger.info("Features carregadas do metadata: %d features", len(FEATURE_NAMES))

print(f"\nModelo: {MODEL_NAME} v{mv.version}")
print(f"Features: {len(FEATURE_NAMES)}")
print(f"Primeiras 10: {FEATURE_NAMES[:10]}")

In [None]:
# =============================================================================
# LOAD FEATURE STORE
# =============================================================================
# An empty list would otherwise only surface as an obscure failure downstream.
if not SCORING_SAFRAS:
    raise RuntimeError("SCORING_SAFRAS esta vazio — informe ao menos uma SAFRA (YYYYMM)")

# Column-expression filter instead of a string-built "SAFRA IN (...)" clause.
df_feature_store = (
    spark.read.format("delta").load(PATH_FEATURE_STORE)
    .filter(F.col("SAFRA").isin([int(s) for s in SCORING_SAFRAS]))
)

# Row counts per SAFRA; the total is derived from them so the per-SAFRA check
# below needs no additional Spark action.
df_safra_counts = df_feature_store.groupBy("SAFRA").count().orderBy("SAFRA")
safra_counts = {int(row["SAFRA"]): row["count"] for row in df_safra_counts.collect()}

total_records = sum(safra_counts.values())
logger.info("Feature store carregada: %d registros para SAFRAs %s", total_records, SCORING_SAFRAS)

if total_records == 0:
    raise RuntimeError(f"Nenhum registro encontrado para SAFRAs {SCORING_SAFRAS} no feature store")

# Fail fast when any individual SAFRA is empty: scoring runs per SAFRA and
# cannot produce scores or risk bands from zero rows.
empty_safras = [s for s in SCORING_SAFRAS if safra_counts.get(int(s), 0) == 0]
if empty_safras:
    raise RuntimeError(f"SAFRAs sem registros no feature store: {empty_safras}")

# Check that the key columns and every model feature exist.
available_cols = set(df_feature_store.columns)
missing_keys = [c for c in ("NUM_CPF", "SAFRA") if c not in available_cols]
if missing_keys:
    raise RuntimeError(f"Colunas-chave ausentes no feature store: {missing_keys}")

missing_features = [f for f in FEATURE_NAMES if f not in available_cols]
if missing_features:
    raise RuntimeError(
        f"{len(missing_features)} features ausentes no feature store: {missing_features[:10]}"
    )
logger.info("Todas as %d features encontradas no feature store", len(FEATURE_NAMES))

# Volume per SAFRA
df_safra_counts.show()

In [None]:
# =============================================================================
# SCORING — one SAFRA at a time (limits how much data is pulled to the driver)
# =============================================================================
_KEY_COLUMNS = ["NUM_CPF", "SAFRA"]


def _fill_missing(df_features):
    """Return a copy of ``df_features`` with nulls filled.

    Numeric columns get 0 and every other column gets "MISSING".
    NOTE(review): the original comment states this mirrors the training
    preprocessing — confirm against the training code.
    """
    filled = df_features.copy()
    for name in filled.columns:
        fill_value = 0 if pd.api.types.is_numeric_dtype(filled[name]) else "MISSING"
        filled[name] = filled[name].fillna(fill_value)
    return filled


def _positive_class_scores(pyfunc_model, df_features):
    """Return a 1-D float array with the positive-class (FPD=1) score per row.

    Prefers ``predict_proba`` of the wrapped model when it exists, otherwise
    the wrapped model's ``predict``, otherwise the pyfunc ``predict``. The
    prediction is computed exactly once.
    """
    inner = getattr(pyfunc_model, "_model_impl", None)
    if inner is None:
        raw = pyfunc_model.predict(df_features)
    elif hasattr(inner, "predict_proba"):
        raw = inner.predict_proba(df_features)
    else:
        raw = inner.predict(df_features)

    scores = np.asarray(raw, dtype=float)
    if scores.ndim == 2:
        # Per-class probabilities: keep the last column (the positive class
        # for a binary model).
        scores = scores[:, -1]
    return scores


def _risk_bands(probabilities, n_bands):
    """Return quantile-based risk bands starting at 1 (1 = lowest probability).

    Integer codes are used instead of a fixed label list: with
    ``duplicates="drop"`` tied quantile edges reduce the number of bins, and a
    fixed list of ``n_bands`` labels would then make ``pd.qcut`` raise.
    """
    codes = pd.qcut(probabilities, q=n_bands, labels=False, duplicates="drop")
    # Degenerate case (all edges collapse) can yield NaN codes; map to band 1.
    return codes.fillna(0).astype(int) + 1


all_scores = []
feature_names = list(FEATURE_NAMES)
select_cols = _KEY_COLUMNS + [c for c in feature_names if c not in _KEY_COLUMNS]

for safra in SCORING_SAFRAS:
    logger.info("Scoring SAFRA %d...", safra)

    df_safra = df_feature_store.filter(f"SAFRA = {safra}")

    # Collect keys and features in ONE action. Two separate toPandas() calls
    # are independent Spark jobs with no guaranteed row order, which could
    # attach scores to the wrong NUM_CPF.
    pdf_safra = df_safra.select(select_cols).toPandas()
    if pdf_safra.empty:
        raise RuntimeError(f"Nenhum registro encontrado para SAFRA {safra} no feature store")

    df_keys = pdf_safra[_KEY_COLUMNS]
    df_X = _fill_missing(pdf_safra[feature_names])

    # Probability of the positive class (FPD=1).
    scores = _positive_class_scores(model, df_X)

    # Build the output frame.
    df_result = df_keys.copy()
    df_result["SCORE_PROB"] = scores

    # Inverted 0-1000 score: a HIGHER score means a LOWER default probability.
    df_result["SCORE"] = (1000 * (1 - df_result["SCORE_PROB"])).round(0).astype(int)

    # Risk band by quantile of the probability (1 = lowest risk).
    df_result["FAIXA_RISCO"] = _risk_bands(df_result["SCORE_PROB"], N_FAIXAS)

    df_result["MODEL_NAME"] = MODEL_NAME
    df_result["MODEL_VERSION"] = str(mv.version)
    df_result["DT_SCORING"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    all_scores.append(df_result)
    logger.info("  SAFRA %d: %d registros scored (score medio=%.0f)",
                safra, len(df_result), df_result["SCORE"].mean())

if not all_scores:
    # pd.concat([]) would raise an uninformative ValueError.
    raise RuntimeError("Nenhuma SAFRA foi processada — verifique SCORING_SAFRAS")

# Consolidate all SAFRAs into a single frame.
df_all_scores = pd.concat(all_scores, ignore_index=True)
logger.info("Total scored: %d registros", len(df_all_scores))

print("\nDistribuicao de scores:")
print(df_all_scores["SCORE"].describe())
print("\nDistribuicao por faixa de risco:")
print(df_all_scores.groupby(["SAFRA", "FAIXA_RISCO"]).size().unstack(fill_value=0))

In [None]:
# =============================================================================
# SAVE SCORES TO THE GOLD LAKEHOUSE (Delta)
# =============================================================================
schema = StructType([
    StructField("NUM_CPF", StringType(), True),
    StructField("SAFRA", IntegerType(), True),
    StructField("SCORE_PROB", DoubleType(), True),
    StructField("SCORE", IntegerType(), True),
    StructField("FAIXA_RISCO", IntegerType(), True),
    StructField("MODEL_NAME", StringType(), True),
    StructField("MODEL_VERSION", StringType(), True),
    StructField("DT_SCORING", StringType(), True),
])

# Select columns by name in schema order: the explicit schema is applied
# positionally, so this guards against any reordering of the pandas frame.
df_spark_scores = spark.createDataFrame(df_all_scores[schema.fieldNames()], schema=schema)

# Expected row count per SAFRA actually present in the scored data.
expected_counts = {int(k): int(v) for k, v in df_all_scores.groupby("SAFRA").size().items()}
scored_safras = sorted(expected_counts)
if not scored_safras:
    raise RuntimeError("Nenhum score para gravar — df_all_scores esta vazio")

# Single write with one replaceWhere predicate covering every scored SAFRA,
# so the whole batch lands in one Delta commit instead of one commit per
# SAFRA (a failure mid-loop would leave the batch partially written).
replace_predicate = "SAFRA IN ({})".format(", ".join(str(s) for s in scored_safras))
(
    df_spark_scores.write.format("delta")
    .mode("overwrite")
    .option("replaceWhere", replace_predicate)
    .partitionBy("SAFRA")
    .save(PATH_SCORES)
)
for safra in scored_safras:
    logger.info("SAFRA %d escrita em %s", safra, PATH_SCORES)

# Post-write validation: show the table contents ...
df_check = spark.read.format("delta").load(PATH_SCORES)
print("\nValidacao pos-escrita:")
df_check.groupBy("SAFRA").count().orderBy("SAFRA").show()
print(f"Total: {df_check.count()} registros, {len(df_check.columns)} colunas")

# ... and verify that each scored SAFRA holds exactly the rows just produced.
written_counts = {
    int(row["SAFRA"]): row["count"]
    for row in df_check.filter(F.col("SAFRA").isin(scored_safras))
    .groupBy("SAFRA").count().collect()
}
mismatches = {
    s: {"esperado": expected_counts[s], "gravado": written_counts.get(s, 0)}
    for s in scored_safras
    if written_counts.get(s, 0) != expected_counts[s]
}
if mismatches:
    raise RuntimeError(f"Validacao pos-escrita falhou — contagens divergentes: {mismatches}")

In [None]:
# =============================================================================
# LOG THE SCORING RUN TO MLFLOW
# =============================================================================
import os
import tempfile

mlflow.set_experiment(EXPERIMENT_NAME)

run_name = f"scoring_batch_{datetime.now().strftime('%Y%m%d_%H%M')}"
with mlflow.start_run(run_name=run_name) as scoring_run:
    run_id = scoring_run.info.run_id

    # Run parameters: which model scored which SAFRAs, and where the output went.
    mlflow.log_param("model_name", MODEL_NAME)
    mlflow.log_param("model_version", mv.version)
    mlflow.log_param("model_stage", MODEL_STAGE)
    mlflow.log_param("scoring_safras", str(SCORING_SAFRAS))
    mlflow.log_param("n_features", len(FEATURE_NAMES))
    mlflow.log_param("output_path", PATH_SCORES)

    # Global metrics over the whole batch.
    mlflow.log_metric("total_records_scored", len(df_all_scores))
    mlflow.log_metric("score_mean", float(df_all_scores["SCORE"].mean()))
    mlflow.log_metric("score_std", float(df_all_scores["SCORE"].std()))
    mlflow.log_metric("score_prob_mean", float(df_all_scores["SCORE_PROB"].mean()))

    # Per-SAFRA metrics.
    for safra in SCORING_SAFRAS:
        mask = df_all_scores["SAFRA"] == safra
        n_safra = int(mask.sum())
        mlflow.log_metric(f"records_safra_{safra}", n_safra)
        if n_safra:
            # The mean of an empty selection is NaN; skip it instead of logging it.
            mlflow.log_metric(f"score_mean_safra_{safra}", float(df_all_scores.loc[mask, "SCORE"].mean()))

    # Save the distribution per SAFRA / risk band as a CSV artifact. A private
    # temporary directory replaces the fixed /tmp path, which concurrent runs
    # could overwrite; the directory is removed once the artifact is uploaded.
    # The artifact file name is unchanged.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dist_path = os.path.join(tmp_dir, "scoring_distribution.csv")
        df_all_scores.groupby(["SAFRA", "FAIXA_RISCO"]).agg(
            count=("NUM_CPF", "count"),
            score_mean=("SCORE", "mean"),
            score_prob_mean=("SCORE_PROB", "mean"),
        ).reset_index().to_csv(dist_path, index=False)
        mlflow.log_artifact(dist_path)

    logger.info("MLflow scoring run: %s", run_id)

print("\nScoring batch concluido com sucesso!")
print(f"MLflow Run ID: {run_id}")
print(f"Output: {PATH_SCORES}")

## Resumo

| Item | Valor |
|------|-------|
| Modelo | `credit-risk-fpd-lgbm_baseline` |
| Stage | Production |
| Output | `Gold.feature_store.clientes_scores` |
| Particionado por | SAFRA |
| Colunas output | NUM_CPF, SAFRA, SCORE_PROB, SCORE, FAIXA_RISCO, MODEL_NAME, MODEL_VERSION, DT_SCORING |

**Proximos passos**: Executar `validacao_deploy.py` para confirmar que metricas do scoring == metricas da avaliacao.