In [10]:
from pathlib import Path


def find_project_root(start: Path, marker: str = "lakehouse") -> Path:
    """Locate the project root by searching upward for the ``marker`` folder.

    Starting at ``start`` (resolved to an absolute path), each directory from
    ``start`` up to the filesystem root is checked for a sub-directory named
    ``marker``; the first directory that contains one is returned. This keeps
    the notebook working whether the kernel runs from the project root, from
    ``/notebooks`` or from a deeper folder.

    Args:
        start: Directory where the search begins (usually the kernel's cwd).
        marker: Name of the directory that identifies the project root.

    Returns:
        The nearest directory (``start`` included) containing ``marker``. If
        none is found, the parent of ``start`` is returned, which is what this
        cell previously hard-coded.
    """
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / marker).is_dir():
            return candidate
    # Fallback: assume the notebook lives one level below the project root.
    return origin.parent


PROJECT_ROOT = find_project_root(Path())

# Bronze-layer Delta table locations.
IPCA_PATH = (PROJECT_ROOT / "lakehouse" / "bronze" / "ipca").resolve()
BOI_PATH  = (PROJECT_ROOT / "lakehouse" / "bronze" / "boi_gordo").resolve()

print("IPCA:", IPCA_PATH)
print("BOI :", BOI_PATH)


IPCA: C:\Users\fdani\project_ipca_boi\lakehouse\bronze\ipca
BOI : C:\Users\fdani\project_ipca_boi\lakehouse\bronze\boi_gordo


In [11]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import os, sys


# --------------------------------------------
# Spark Session (igual aos pipelines Bronze)
# --------------------------------------------

def get_spark(app_name: str = "validation_bronze") -> SparkSession:
    """Return a SparkSession configured for Delta Lake.

    Points both PySpark interpreter environment variables at the Python
    executable running this kernel, applies the session settings below and
    hands the builder to ``configure_spark_with_delta_pip`` before calling
    ``getOrCreate()``.

    Args:
        app_name: Application name given to the Spark builder.

    Returns:
        The session produced by ``getOrCreate()``.
    """
    # Use the kernel's own interpreter for both PySpark settings.
    for env_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
        os.environ[env_var] = sys.executable

    settings = {
        "spark.sql.session.timeZone": "UTC",
        "spark.sql.sources.partitionOverwriteMode": "dynamic",
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        # A small shuffle partition count suits local runs.
        "spark.sql.shuffle.partitions": "8",
    }

    session_builder = SparkSession.builder.appName(app_name)
    for option, value in settings.items():
        session_builder = session_builder.config(option, value)

    return configure_spark_with_delta_pip(session_builder).getOrCreate()





In [12]:
# Open the Spark session (built by get_spark) that the following cells use.

spark = get_spark()

In [13]:
# Preview five rows of the IPCA Bronze Delta table
# (described by the author as the 2025 load).
spark.read.format("delta").load(str(IPCA_PATH)).show(5)

+----------+-----+-------+---------+------------+----------+--------------------+--------------------+--------------------+--------------------+----+---+
|  data_ref|valor| source|series_id|data_inicial|data_final|         request_url|        payload_hash|       ingest_run_id|     ingested_at_utc| ano|mes|
+----------+-----+-------+---------+------------+----------+--------------------+--------------------+--------------------+--------------------+----+---+
|2025-02-01| 1.31|BCB_SGS|      433|  01/01/2025|31/12/2025|https://api.bcb.g...|edb41f451ba121bbe...|1796618f-301f-47a...|2026-02-06 01:52:...|2025|  2|
|2025-06-01| 0.24|BCB_SGS|      433|  01/01/2025|31/12/2025|https://api.bcb.g...|edb41f451ba121bbe...|1796618f-301f-47a...|2026-02-06 01:52:...|2025|  6|
|2025-05-01| 0.26|BCB_SGS|      433|  01/01/2025|31/12/2025|https://api.bcb.g...|edb41f451ba121bbe...|1796618f-301f-47a...|2026-02-06 01:52:...|2025|  5|
|2025-12-01| 0.33|BCB_SGS|      433|  01/01/2025|31/12/2025|https://api.bcb.

In [14]:
# Preview five rows of the boi_gordo Bronze Delta table
# (described by the author as the 2025 load).
spark.read.format("delta").load(str(BOI_PATH)).show(5)

+--------------------+-------+------+--------------------+--------------------+-------------------+--------------------+--------------------+---+----+
|            csv_file|mes_ano| valor|         source_file|           file_hash|   converted_at_utc|       ingest_run_id|     ingested_at_utc|mes| ano|
+--------------------+-------+------+--------------------+--------------------+-------------------+--------------------+--------------------+---+----+
|cepea-consulta-20...|03/2025|312.47|cepea-consulta-20...|3f0d5e276f235d8d2...|2026-02-05 23:20:02|5092bde9-7049-435...|2026-02-06 00:45:...|  3|2025|
|cepea-consulta-20...|07/2025|299.97|cepea-consulta-20...|3f0d5e276f235d8d2...|2026-02-05 23:20:02|5092bde9-7049-435...|2026-02-06 00:45:...|  7|2025|
|cepea-consulta-20...|10/2025|310.51|cepea-consulta-20...|3f0d5e276f235d8d2...|2026-02-05 23:20:02|5092bde9-7049-435...|2026-02-06 00:45:...| 10|2025|
|cepea-consulta-20...|04/2025|323.96|cepea-consulta-20...|3f0d5e276f235d8d2...|2026-02-05 23:2

In [None]:
# validações de estrutura de IPCA:

from pyspark.sql import functions as F
from pyspark.sql.types import (
    DateType, TimestampType, StringType, IntegerType,
    DoubleType
)

# -----------------------
# Load the IPCA Bronze Delta table into df_ipca
# -----------------------
df_ipca = spark.read.format("delta").load(str(IPCA_PATH))

# (Optional) print the schema and five untruncated sample rows
print("=== IPCA: schema ===")
df_ipca.printSchema()
print("=== IPCA: sample ===")
df_ipca.show(n=5, truncate=False)

# -----------------------
# Validação de Schema / Tipos
# -----------------------
def validate_schema(df, expected: dict, df_name: str = "df"):
    """Print a report comparing a DataFrame's schema with an expected one.

    Only the *class* of each column's data type is compared (for example
    ``StringType``), not an instance of it.

    Args:
        df: Object exposing ``schema.fields``, each field having ``name`` and
            ``dataType`` attributes.
        expected: Mapping of column name to the expected data type class.
        df_name: Label used in the report header.

    Returns:
        None. The report is written to stdout: missing columns, extra
        columns and type mismatches are listed when present. Extra columns
        alone do not prevent the final "Schema OK" line.
    """
    # Column name -> data type class actually present in the DataFrame.
    actual_types = {}
    for field in df.schema.fields:
        actual_types[field.name] = type(field.dataType)

    absent = [name for name in expected if name not in actual_types]
    unexpected = [name for name in actual_types if name not in expected]

    # (column, actual type name, expected type name) for shared columns.
    mismatched = []
    for name, wanted in expected.items():
        if name not in actual_types:
            continue
        found = actual_types[name]
        if found != wanted:
            mismatched.append((name, found.__name__, wanted.__name__))

    print(f"\n=== Validando schema: {df_name} ===")
    if absent:
        print(" Colunas faltando:", absent)
    if unexpected:
        print(" Colunas extras:", unexpected)
    if mismatched:
        print(" Tipos divergentes:")
        for name, got, exp in mismatched:
            print(f"   - {name}: atual={got} esperado={exp}")

    if not (absent or mismatched):
        print(" Schema OK (colunas e tipos conferem)")

# Expected schema of the Bronze IPCA table. It mirrors the columns and types
# reported by df_ipca.printSchema() for the Delta table at IPCA_PATH; the
# previous placeholder names (data, ipca_mensal, ingestion_ts, ...) matched
# none of the real columns except ano/mes, so the check always failed.
expected_ipca = {
    "data_ref": DateType,
    "valor": DoubleType,
    "source": StringType,
    "series_id": IntegerType,
    # Request window bounds are kept as raw strings in Bronze.
    "data_inicial": StringType,
    "data_final": StringType,
    "request_url": StringType,
    "payload_hash": StringType,
    "ingest_run_id": StringType,
    "ingested_at_utc": TimestampType,
    "ano": IntegerType,
    "mes": IntegerType,
}

validate_schema(df_ipca, expected_ipca, df_name="IPCA")


=== IPCA: schema ===
root
 |-- data_ref: date (nullable = true)
 |-- valor: double (nullable = true)
 |-- source: string (nullable = true)
 |-- series_id: integer (nullable = true)
 |-- data_inicial: string (nullable = true)
 |-- data_final: string (nullable = true)
 |-- request_url: string (nullable = true)
 |-- payload_hash: string (nullable = true)
 |-- ingest_run_id: string (nullable = true)
 |-- ingested_at_utc: timestamp (nullable = true)
 |-- ano: integer (nullable = true)
 |-- mes: integer (nullable = true)

=== IPCA: sample ===
+----------+-----+-------+---------+------------+----------+----------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+------------------------------------+--------------------------+----+---+
|data_ref  |valor|source |series_id|data_inicial|data_final|request_url                                                                          

In [None]:
# validações de estrutura de BOI_GORDO:

from pyspark.sql import functions as F
from pyspark.sql.types import (
    DateType, TimestampType, StringType, IntegerType,
    DoubleType
)

# Load the boi_gordo Bronze Delta table into df_boi
df_boi = spark.read.format("delta").load(str(BOI_PATH))

# Print the schema and five untruncated sample rows
print("=== BOI: schema ===")
df_boi.printSchema()
print("=== BOI: sample ===")
df_boi.show(n=5, truncate=False)

# Expected schema of the Bronze boi_gordo table. It mirrors the columns and
# types reported by df_boi.printSchema() for the Delta table at BOI_PATH; the
# previous placeholder names (data, preco, ingestion_ts) did not exist in the
# real table, so the check always failed.
expected_boi = {
    "csv_file": StringType,
    # Month/year label is kept as a raw string in Bronze.
    "mes_ano": StringType,
    "valor": DoubleType,
    "source_file": StringType,
    "file_hash": StringType,
    "converted_at_utc": TimestampType,
    "ingest_run_id": StringType,
    "ingested_at_utc": TimestampType,
    "mes": IntegerType,
    "ano": IntegerType,
}

validate_schema(df_boi, expected_boi, df_name="BOI")


=== BOI: schema ===
root
 |-- csv_file: string (nullable = true)
 |-- mes_ano: string (nullable = true)
 |-- valor: double (nullable = true)
 |-- source_file: string (nullable = true)
 |-- file_hash: string (nullable = true)
 |-- converted_at_utc: timestamp (nullable = true)
 |-- ingest_run_id: string (nullable = true)
 |-- ingested_at_utc: timestamp (nullable = true)
 |-- mes: integer (nullable = true)
 |-- ano: integer (nullable = true)

=== BOI: sample ===
+---------------------------------+-------+------+---------------------------------+----------------------------------------------------------------+-------------------+------------------------------------+--------------------------+---+----+
|csv_file                         |mes_ano|valor |source_file                      |file_hash                                                       |converted_at_utc   |ingest_run_id                       |ingested_at_utc           |mes|ano |
+---------------------------------+-------+------+

In [None]:
# spark.stop()  # kept disabled here: the cells below still use the session; it is stopped in the last cell

In [None]:
# Create the "bronze" database in the catalog, located at the lakehouse's
# bronze folder, then list the databases to confirm it is there.

BRONZE_DB_PATH = PROJECT_ROOT.joinpath("lakehouse", "bronze").as_posix()
spark.sql(f"CREATE DATABASE IF NOT EXISTS bronze LOCATION '{BRONZE_DB_PATH}'")
spark.sql("SHOW DATABASES").show(truncate=False)


+---------+
|namespace|
+---------+
|bronze   |
|default  |
+---------+



In [None]:
# Register the existing Bronze Delta folders (so far only addressed by path)
# as catalog tables bronze.ipca and bronze.boi_gordo.

ipca_location = IPCA_PATH.as_posix()
boi_location = BOI_PATH.as_posix()

spark.sql(
    f"""
CREATE TABLE IF NOT EXISTS bronze.ipca
USING DELTA
LOCATION '{ipca_location}'
"""
)

spark.sql(
    f"""
CREATE TABLE IF NOT EXISTS bronze.boi_gordo
USING DELTA
LOCATION '{boi_location}'
"""
)




+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|bronze   |boi_gordo|false      |
|bronze   |ipca     |false      |
+---------+---------+-----------+



In [None]:
# From here on the tables can be loaded by name: list the tables in the
# bronze database, then count the rows of bronze.ipca (the cell's result).

spark.sql("SHOW TABLES IN bronze").show(truncate=False)
spark.table("bronze.ipca").count()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|bronze   |boi_gordo|false      |
|bronze   |ipca     |false      |
+---------+---------+-----------+



12

In [19]:
# Stop the Spark session now that the validation cells have run.
spark.stop()