In [0]:
# Ad-hoc preview of the raw CDC parquet files for the `clientes` table.
df_full = spark.read.format("parquet").load("/Volumes/raw/upcell/cdc/clientes/")
# `display` has to be called; a bare `df_full.display` only evaluates the
# attribute and renders nothing.
display(df_full)


## 1. Importações e Setup

In [0]:
import sys
import delta


sys.path.insert(0, "../lib/")

import utils

In [0]:
# Target location of the bronze table: <catalog>.<schema>.<tablename>.
catalog = "bronze"
schema = "upcell"

# Example widget values for an interactive run:
#   tablename = "produtos"
#   id_field  = "IdProduto"
#   timefield = "DtAtualizacao"

# Run parameters, read from the notebook widgets.
tablename = dbutils.widgets.get("tablename")
id_field = dbutils.widgets.get("id_field")
# This value must be bound to the name `timefield`: upsert() interpolates
# `{timefield}` into its deduplication query, and storing it only under
# another name makes every micro-batch fail with NameError.
timefield = dbutils.widgets.get("timefield")
# Previous name of the same value, kept as an alias for any code still using it.
timestamp_field = timefield

# Schema for this table's parquet files, as returned by the project helper.
df_schema = utils.import_schema(tablename)

In [0]:
# Bootstrap: when the bronze table is missing, create it from the full-load
# parquet files; otherwise leave the existing table untouched.
table_already_exists = utils.table_exists(spark, catalog, schema, tablename)

if table_already_exists:
    print("tabela ja existe")
else:
    print("tabela nao existe")

    # Read the full-load snapshot with the explicit schema.
    full_load_reader = spark.read.format("parquet").schema(df_schema)
    df_full = full_load_reader.load(f"/Volumes/raw/upcell/full-load/{tablename}/")

    # Collapse to a single partition before writing the Delta table.
    full_load_writer = df_full.coalesce(1).write.format("delta").mode("overwrite")
    full_load_writer.saveAsTable(f"{catalog}.{schema}.{tablename}")

## 2. Configuração e Full-Load

Define variáveis de configuração e cria a tabela inicial se não existir.

In [0]:
# Handle to the bronze Delta table, used as the MERGE target by the CDC stream.
bronze = delta.DeltaTable.forName(
    spark,
    f"{catalog}.{schema}.{tablename}",
)

In [0]:
def upsert(df, deltatable):
    """Merge one batch of CDC rows into a Delta table.

    The batch is registered as the global temp view ``view_{tablename}`` and
    reduced to a single row per ``id_field``: the one ranked first when
    ordering by ``timefield`` descending. The ``OP`` column of that row then
    selects the merge action:

    * ``'D'`` and the key exists in the target: the target row is deleted.
    * ``'U'`` and the key exists in the target: all columns are updated.
    * ``'I'`` or ``'U'`` and the key is absent from the target: the row is
      inserted.

    Any other combination matches no clause and leaves the target unchanged.

    Relies on the notebook-level names ``spark``, ``tablename``, ``id_field``
    and ``timefield``, which must be defined before this function runs.

    Args:
        df: DataFrame with the CDC rows of the current batch.
        deltatable: Merge target; used through ``alias(...).merge(...)``.
    """
    df.createOrReplaceGlobalTempView(f"view_{tablename}")

    # Keep only the top-ranked row per key so the merge sees one row per id.
    dedup_sql = f"""
    SELECT *
    FROM global_temp.view_{tablename}
    QUALIFY ROW_NUMBER() OVER (PARTITION BY {id_field} ORDER BY {timefield} DESC) = 1
    """
    latest_changes = spark.sql(dedup_sql)

    merge_builder = deltatable.alias("b").merge(
        latest_changes.alias("d"),
        f"b.{id_field} = d.{id_field}",
    )
    # Clause order matters: the delete clause is declared before the update.
    merge_builder = merge_builder.whenMatchedDelete(condition="d.OP = 'D'")
    merge_builder = merge_builder.whenMatchedUpdateAll(condition="d.OP = 'U'")
    merge_builder = merge_builder.whenNotMatchedInsertAll(
        condition="d.OP = 'I' OR d.OP = 'U'"
    )
    merge_builder.execute()

# Source: stream the table's CDC parquet files through the `cloudFiles`
# reader, using the explicit schema.
df_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .schema(df_schema)
    .load(f"/Volumes/raw/upcell/cdc/{tablename}/")
)

# Sink: hand every micro-batch to upsert() against the bronze table. This only
# builds the writer; nothing runs until `stream.start()` is called.
stream = (
    df_stream.writeStream
    .option("checkpointLocation", f"/Volumes/raw/upcell/cdc/{tablename}_checkpoint/")
    .foreachBatch(lambda df, batchID: upsert(df, bronze))
    .trigger(availableNow=True)
)


In [0]:
# Start the query and block until it terminates. Waiting here means the cell
# only completes once the run has stopped, and an exception raised while
# processing a batch is re-raised in this cell instead of being lost in the
# background.
streaming_query = stream.start()
streaming_query.awaitTermination()