## 1. Importações e Setup

In [0]:
import sys
import delta

sys.path.insert(0, "../lib/")
import utils


In [0]:
# The target table is addressed as <catalog>.<schemaname>.<tablename>.
catalog, schemaname = "bronze", "upcell"

# Table-specific settings: the table name, its key column, and the column
# used to order several changes that arrive for the same key.
tablename, id_field, timefield = "produtos", "IdProduto", "DtAtualizacao"

# To take the table settings from notebook widgets instead, read them here,
# at the top level (outside of any asynchronous context):
#tablename = dbutils.widgets.get("tablename")
#id_field = dbutils.widgets.get("id_field")
#timefield = dbutils.widgets.get("timefield")

# Checkpoint directory of the CDC stream for this table.
# NOTE: the variable name is misspelled ("ckeckpoint"); it is kept as-is
# because the full-load cell further down refers to it by this name.
ckeckpoint_location = f"/Volumes/raw/{schemaname}/cdc/{tablename}_checkpoint/"

In [0]:
class Ingestor:
    """Batch ingestor: read files from a path and overwrite a Delta table.

    The target table is ``<catalog>.<schemaname>.<tablename>``. Relies on the
    notebook-provided ``spark`` session and on the project ``utils`` module.

    Args:
        catalog: Catalog part of the target table name.
        schemaname: Schema part of the target table name.
        tablename: Table part of the target table name; also the key passed
            to ``utils.import_schema``.
        data_format: Source file format handed to the Spark reader
            (e.g. ``"parquet"``).
    """

    def __init__(self, catalog, schemaname, tablename, data_format):
        self.catalog, self.schemaname = catalog, schemaname
        self.tablename, self.data_format = tablename, data_format
        # Must run after ``tablename`` is set: the schema lookup reads it.
        self.set_schema()

    def set_schema(self):
        """Store in ``data_schema`` what ``utils.import_schema`` returns for the table."""
        self.data_schema = utils.import_schema(self.tablename)

    def load(self, path):
        """Return a batch DataFrame read from *path* with the stored schema."""
        reader = spark.read.format(self.data_format)
        reader = reader.schema(self.data_schema)
        return reader.load(path)

    def save(self, df):
        """Write *df* to the target table in Delta format, overwriting it."""
        target = f"{self.catalog}.{self.schemaname}.{self.tablename}"
        writer = df.write.format("delta").mode("overwrite")
        writer.saveAsTable(target)

    def execute(self, path):
        """Load *path* and save the result; returns whatever ``save`` returns."""
        return self.save(self.load(path))

In [0]:
class IngestorCDC(Ingestor):
    """Streaming ingestor that merges change records into an existing Delta table.

    Reuses the base-class constructor and ``execute``; overrides ``load`` and
    ``save`` so that ``execute(path)`` starts a streaming query and returns the
    object produced by ``start()``. The incoming data is expected to carry an
    ``OP`` column with the values ``'I'``, ``'U'`` or ``'D'`` (these are the
    values the merge conditions test for).

    Args:
        catalog: Catalog part of the target table name.
        schemaname: Schema part of the target table name; also part of the
            checkpoint path.
        tablename: Table part of the target table name; also part of the
            checkpoint path and of the temp view name.
        data_format: File format passed to the ``cloudFiles.format`` option.
        id_field: Column used as merge key and as de-duplication partition.
        timefield: Column used to pick the most recent record per key.
    """

    def __init__(self, catalog, schemaname, tablename, data_format, id_field, timefield):
        super().__init__(catalog, schemaname, tablename, data_format)
        self.id_field, self.timefield = id_field, timefield
        # The target table is looked up by name here, at construction time.
        self.set_deltatable()

    def set_deltatable(self):
        """Bind ``deltatable`` to the target table looked up by its full name."""
        full_name = f"{self.catalog}.{self.schemaname}.{self.tablename}"
        self.deltatable = delta.DeltaTable.forName(spark, full_name)

    def upsert(self, df):
        """Merge one batch of change records into the target table.

        The batch is exposed as a global temp view, reduced to a single row
        per ``id_field`` (the one with the greatest ``timefield``), and then
        merged into the table according to that row's ``OP`` value.
        """
        df.createOrReplaceGlobalTempView(f"view_{self.tablename}")

        query = f"""
            SELECT *
            FROM global_temp.view_{self.tablename}
            QUALIFY ROW_NUMBER() OVER (PARTITION BY {self.id_field} ORDER BY {self.timefield} DESC) = 1
        """

        latest_changes = spark.sql(query)

        on_key = f"b.{self.id_field} = d.{self.id_field}"
        merge = self.deltatable.alias("b").merge(latest_changes.alias("d"), on_key)
        # Existing key: a 'D' record deletes the row, a 'U' record updates it.
        merge = merge.whenMatchedDelete(condition="d.OP = 'D'")
        merge = merge.whenMatchedUpdateAll(condition="d.OP = 'U'")
        # Unknown key: 'I' and 'U' records are inserted.
        merge = merge.whenNotMatchedInsertAll(condition="d.OP = 'I' OR d.OP = 'U'")
        merge.execute()

    def load(self, path):
        """Return a streaming DataFrame reading *path* through ``cloudFiles``."""
        reader = spark.readStream.format("cloudFiles")
        reader = reader.option("cloudFiles.format", self.data_format)
        reader = reader.schema(self.data_schema)
        return reader.load(path)

    def save(self, df):
        """Start the streaming write and return the started query.

        Each micro-batch is handed to ``upsert``; a batch is triggered every
        30 seconds and progress is tracked in the table's checkpoint directory.
        """
        checkpoint = f"/Volumes/raw/{self.schemaname}/cdc/{self.tablename}_checkpoint/"

        def _merge_batch(df_batch, batch_id):
            # The batch id is not needed by the merge.
            self.upsert(df_batch)

        writer = df.writeStream.option("checkpointLocation", checkpoint)
        writer = writer.foreachBatch(_merge_batch)
        writer = writer.trigger(processingTime="30 seconds")
        return writer.start()

In [0]:
# Bootstrap: run the one-off full load only when the target table is missing.
if utils.table_exists(spark, catalog, schemaname, tablename):
    print("tabela ja existe, ignorando full-load")

else:
    print("tabela nao existe, criando...")

    # Remove the CDC checkpoint directory (True = recursive) before creating
    # the table -- presumably so the stream does not resume from a checkpoint
    # that belongs to a previous incarnation of the table.
    dbutils.fs.rm(ckeckpoint_location, True)

    ingest_full_load = Ingestor(
        catalog=catalog,
        schemaname=schemaname,
        tablename=tablename,
        data_format="parquet",
    )
    ingest_full_load.execute(f"/Volumes/raw/{schemaname}/full-load/{tablename}/")

    print("tabela criada com sucesso")

In [0]:
# Build the CDC ingestor for the configured table; its constructor looks the
# target table up by name.
ingest_cdc = IngestorCDC(
    catalog=catalog,
    schemaname=schemaname,
    tablename=tablename,
    data_format="parquet",
    id_field=id_field,
    timefield=timefield,
)

# Start the streaming merge over the table's CDC folder and keep the handle
# returned by execute() in `stream`.
stream = ingest_cdc.execute(f"/Volumes/raw/{schemaname}/cdc/{tablename}/")