# Gold — Agregación y KPIs

Propósito: Generar tablas agregadas, KPIs y vistas Golden listas para consumo analítico y BI.

Rol: capa final de preparación — consolida métricas y tablas optimizadas para consumo, reporting y modelos de negocio.

In [0]:
import traceback
from pyspark.sql import functions as F
from pyspark.sql.functions import current_timestamp


In [0]:
# 1. Load the Silver table into a DataFrame.
df_silver = spark.table("workspace.credit_risk.silver_credit_risk_limpio")

In [0]:
# 2a. Keep only the rows flagged as non-outliers (es_outlier = 0) and stamp
#     each one with the time this table was processed.
df_gold1 = df_silver.where("es_outlier = 0").withColumn(
    "fecha_procesamiento_tabla", F.current_timestamp()
)


In [0]:
# 2b. Keep only the rows flagged as outliers (es_outlier = 1) and stamp each
#     one with the time this table was processed.
df_gold2 = df_silver.where("es_outlier = 1").withColumn(
    "fecha_procesamiento_tabla", F.current_timestamp()
)


In [0]:
# 3. Portfolio KPIs computed with PySpark aggregations.
#    Every selected expression is an aggregate (plus one timestamp), and no
#    outlier filter is applied here: the input is df_silver as loaded.
_is_default = F.col("estado_pago") == 1
_is_current = F.col("estado_pago") == 0
_estimated_interest = F.col("monto") * F.col("tasa_interes") / 100

_kpi_columns = [
    # Basic counts
    F.count("*").alias("total_prestamos"),
    F.count_distinct("id_cliente").alias("total_clientes"),
    # Loan amount metrics
    F.round(F.sum("monto"), 2).alias("cartera_total"),
    F.round(F.avg("monto"), 2).alias("monto_promedio"),
    F.round(F.max("monto"), 2).alias("monto_maximo"),
    F.round(F.min("monto"), 2).alias("monto_minimo"),
    # Interest rate metrics
    F.round(F.avg("tasa_interes"), 2).alias("tasa_promedio"),
    F.round(F.max("tasa_interes"), 2).alias("tasa_maxima"),
    F.round(F.min("tasa_interes"), 2).alias("tasa_minima"),
    # Risk metrics (the DataFrame equivalent of SQL CASE WHEN counters)
    F.sum(F.when(_is_default, 1).otherwise(0)).alias("total_defaults"),
    F.sum(F.when(_is_current, 1).otherwise(0)).alias("total_al_dia"),
    # Averaging 100.0 / 0.0 yields the default rate directly as a percentage.
    F.round(
        F.avg(F.when(_is_default, 100.0).otherwise(0.0)), 2
    ).alias("tasa_default_pct"),
    # Customer profile metrics
    F.round(F.avg("edad"), 1).alias("edad_promedio"),
    F.round(F.avg("ingreso_anual"), 2).alias("ingreso_promedio"),
    F.round(F.avg("anios_empleo"), 1).alias("anios_empleo_promedio"),
    # Estimated interest income: sum of monto * tasa_interes / 100
    F.round(F.sum(_estimated_interest), 2).alias("ingresos_intereses_estimado"),
    # Audit timestamp
    F.current_timestamp().alias("fecha_actualizacion"),
]

df_gold_kpis = df_silver.select(*_kpi_columns)

In [0]:

# 4. SUMMARY BY CREDIT GRADE (calificacion)
# The CTAS statement below materializes
# workspace.credit_risk.gold_resumen_calificacion. spark.sql() on a
# CREATE ... AS SELECT returns the command result, not the aggregated rows,
# so its return value is intentionally not kept.
# NOTE(review): the query reads
# workspace.credit_risk.golden_credit_risk_sin_outliers, which is not written
# anywhere in the visible part of this notebook (the save cell writes
# workspace.credit_risk.golden_sin_outliers) — confirm the source table name.
spark.sql("""
CREATE OR REPLACE TABLE workspace.credit_risk.gold_resumen_calificacion AS
SELECT
    calificacion,

    -- Volumen
    COUNT(*) as cantidad_prestamos,
    ROUND(SUM(monto), 2) as monto_total,
    ROUND(AVG(monto), 2) as monto_promedio,

    -- Participación
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as participacion_cantidad_pct,
    ROUND(SUM(monto) * 100.0 / SUM(SUM(monto)) OVER(), 2) as participacion_monto_pct,

    -- Tasas
    ROUND(AVG(tasa_interes), 2) as tasa_promedio,
    ROUND(MIN(tasa_interes), 2) as tasa_minima,
    ROUND(MAX(tasa_interes), 2) as tasa_maxima,

    -- Riesgo
    SUM(CASE WHEN estado_pago = 1 THEN 1 ELSE 0 END) as cantidad_defaults,
    ROUND(AVG(CASE WHEN estado_pago = 1 THEN 100.0 ELSE 0.0 END), 2) as tasa_default_pct,

    -- Perfil cliente
    ROUND(AVG(edad), 1) as edad_promedio,
    ROUND(AVG(ingreso_anual), 0) as ingreso_promedio,

    -- Ingresos
    ROUND(SUM(monto * tasa_interes / 100), 2) as ingresos_intereses,

    CURRENT_TIMESTAMP() as fecha_actualizacion

FROM workspace.credit_risk.golden_credit_risk_sin_outliers
GROUP BY calificacion
ORDER BY calificacion;
""")

# Load the table that was just created so this variable holds the aggregated
# rows; the save step later in the notebook calls .write on it.
df_gold_resumen_calificacion = spark.read.table(
    "workspace.credit_risk.gold_resumen_calificacion"
)

In [0]:
# 5. SUMMARY BY LOAN PURPOSE (proposito)
# The CTAS statement below materializes
# workspace.credit_risk.gold_resumen_proposito. spark.sql() on a
# CREATE ... AS SELECT returns the command result, not the aggregated rows,
# so its return value is intentionally not kept.
# NOTE(review): the query reads
# workspace.credit_risk.golden_credit_risk_sin_outliers, which is not written
# anywhere in the visible part of this notebook (the save cell writes
# workspace.credit_risk.golden_sin_outliers) — confirm the source table name.
spark.sql("""
CREATE OR REPLACE TABLE workspace.credit_risk.gold_resumen_proposito AS
SELECT
    proposito,
    -- Volumen
    COUNT(*) as cantidad_prestamos,
    ROUND(SUM(monto), 2) as monto_total,
    ROUND(AVG(monto), 2) as monto_promedio,

    -- Participación en cartera
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as participacion_pct,

    -- Pricing
    ROUND(AVG(tasa_interes), 2) as tasa_promedio,

    -- Riesgo
    SUM(CASE WHEN estado_pago = 1 THEN 1 ELSE 0 END) as cantidad_defaults,
    ROUND(AVG(CASE WHEN estado_pago = 1 THEN 100.0 ELSE 0.0 END), 2) as tasa_default_pct,

    -- Calificaciones predominantes
    COUNT(CASE WHEN calificacion IN ('A','B','C') THEN 1 END) as prestamos_calidad_alta,
    COUNT(CASE WHEN calificacion IN ('D','E') THEN 1 END) as prestamos_calidad_media,
    COUNT(CASE WHEN calificacion IN ('F','G') THEN 1 END) as prestamos_calidad_baja,

    -- Perfil
    ROUND(AVG(ingreso_anual), 0) as ingreso_promedio,
    ROUND(AVG(edad), 1) as edad_promedio,

    -- Ranking
    RANK() OVER (ORDER BY SUM(monto) DESC) as ranking_volumen,

    CURRENT_TIMESTAMP() as fecha_actualizacion

FROM workspace.credit_risk.golden_credit_risk_sin_outliers
GROUP BY proposito
ORDER BY monto_total DESC;
""")

# Load the table that was just created so this variable holds the aggregated
# rows; the save step later in the notebook calls .write on it.
df_gold_resumen_proposito = spark.read.table(
    "workspace.credit_risk.gold_resumen_proposito"
)


In [0]:
# 6. RISK ANALYSIS BY SEGMENT (calificacion x tipo_vivienda)
# The CTAS statement below materializes
# workspace.credit_risk.gold_analisis_riesgo. spark.sql() on a
# CREATE ... AS SELECT returns the command result, not the aggregated rows,
# so its return value is intentionally not kept.
# NOTE(review): the query reads
# workspace.credit_risk.golden_credit_risk_sin_outliers, which is not written
# anywhere in the visible part of this notebook (the save cell writes
# workspace.credit_risk.golden_sin_outliers) — confirm the source table name.
spark.sql("""
CREATE OR REPLACE TABLE workspace.credit_risk.gold_analisis_riesgo AS
SELECT
    calificacion,

    -- Segmentación por tipo vivienda
    tipo_vivienda,

    -- Volumen
    COUNT(*) as cantidad_prestamos,
    ROUND(SUM(monto), 2) as exposicion_total,
    ROUND(AVG(monto), 2) as monto_promedio,

    -- Tasas
    ROUND(AVG(tasa_interes), 2) as tasa_promedio,

    -- Riesgo detallado
    SUM(CASE WHEN estado_pago = 1 THEN 1 ELSE 0 END) as defaults,
    SUM(CASE WHEN estado_pago = 0 THEN 1 ELSE 0 END) as al_dia,
    ROUND(AVG(CASE WHEN estado_pago = 1 THEN 100.0 ELSE 0.0 END), 2) as tasa_default_pct,

    -- Monto en riesgo
    ROUND(SUM(CASE WHEN estado_pago = 1 THEN monto ELSE 0 END), 2) as monto_en_default,

    -- Perfil del segmento
    ROUND(AVG(ingreso_anual), 0) as ingreso_promedio,
    ROUND(AVG(edad), 1) as edad_promedio,
    ROUND(AVG(pct_ingreso * 100), 2) as pct_ingreso_comprometido,

    -- Categorización simple
    CASE
        WHEN AVG(CASE WHEN estado_pago = 1 THEN 1.0 ELSE 0.0 END) < 0.10 THEN 'RIESGO BAJO'
        WHEN AVG(CASE WHEN estado_pago = 1 THEN 1.0 ELSE 0.0 END) < 0.20 THEN 'RIESGO MEDIO'
        ELSE 'RIESGO ALTO'
    END as categoria_riesgo,

    -- Ingresos
    ROUND(SUM(monto * tasa_interes / 100), 2) as ingresos_estimados,

    CURRENT_TIMESTAMP() as fecha_actualizacion

FROM workspace.credit_risk.golden_credit_risk_sin_outliers
GROUP BY calificacion, tipo_vivienda
ORDER BY exposicion_total DESC;
""")

# Load the table that was just created so this variable holds the aggregated
# rows; the save step later in the notebook calls .write on it.
df_gold_analisis_riesgo = spark.read.table(
    "workspace.credit_risk.gold_analisis_riesgo"
)


In [0]:
# 7. Persist every Gold DataFrame as a Delta table (overwrite mode, schema
#    replacement allowed). On failure, publish a short error description as
#    the "error" task value and re-raise so the failure still propagates.
try:
    # (DataFrame, target table) pairs, written in this order. The list is
    # built inside the try so a missing DataFrame is reported like any other
    # failure.
    _gold_outputs = [
        (df_gold1, "workspace.credit_risk.golden_sin_outliers"),
        (df_gold2, "workspace.credit_risk.golden_solo_outliers"),
        (df_gold_kpis, "workspace.credit_risk.golden_kpis_basic"),
        (
            df_gold_resumen_calificacion,
            "workspace.credit_risk.golden_resumen_calificacion",
        ),
        (
            df_gold_resumen_proposito,
            "workspace.credit_risk.golden_resumen_proposito",
        ),
        (df_gold_analisis_riesgo, "workspace.credit_risk.golden_analisis_riesgo"),
    ]

    for _gold_df, _table_name in _gold_outputs:
        (
            _gold_df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(_table_name)
        )

except Exception as e:
    # Exception class name
    error_type = type(e).__name__
    # Exception message
    error_summary = str(e)
    # Full traceback text
    error_trace = traceback.format_exc()

    # Full error text. This must be a real f-string: the previous version had
    # the "f" inside the quotes, misspelled error_summary and used "/n", so
    # the task value always received the literal template.
    error_msg_full = f"{error_type}: {error_summary}\n{error_trace}"

    # Cap the published message at 500 characters plus a truncation marker.
    if len(error_msg_full) > 500:
        error_msg = error_msg_full[:500] + "\n[...]Error Truncado[...]"
    else:
        error_msg = error_msg_full

    dbutils.jobs.taskValues.set(key="error", value=error_msg)
    # Bare raise re-raises the active exception with its original traceback.
    raise
