In [None]:
from pyspark.sql.functions import col

In [None]:
# Job parameters are supplied through Databricks notebook widgets.
_widget = dbutils.widgets.get

TABLE_NAME = _widget("TABLE_NAME")   # suffix of the temp view and name of the Delta table
string_schema = _widget("QUERY")     # SQL template; its {} placeholder is filled with TABLE_NAME
ENABLED = _widget("ENABLED")         # the load only runs when this equals 'TRUE'
SCHEMA = _widget("SCHEMA")           # schema under which the Delta table is registered

In [None]:
# The commented-out block below shows sample parameter values for a table named TABLE1_.
# string_schema (the QUERY widget) is the SQL used to query the JSON object parsed into a dataframe; see the step-by-step movement of the dataframe for more information.
# The query needs to be partitioned by the primary key so that only the latest row per key is inserted.
# ENABLED is a job flag for Databricks.
# SCHEMA is where the table will be created; by default the table is stored on a mounted folder under mnt/databricks/DELTA/{schema}/{tablename}.
# Make sure a control table exists for tracking the last processed timestamp.


# TABLE_NAME= "TABLE1_"
# string_schema = """
# SELECT 
#         after.SomeID AS SomeID
#        ,after.Comments         AS Comments
#        ,after.LoginID          AS LoginID
#        ,after.OpenedDT         AS OpenedDT
#        ,after.SavedDT          AS SavedDT
#        ,after.OriginalLoginID  AS OriginalLoginID
#        ,after.EditedMSID       AS EditedMSID
#        ,after.ApprovalLoginID  AS ApprovalLoginID
#        ,ts_ms as kafka_ts,
#         row_number() over (partition by after.SomeID order by ts_ms desc) rn
#     FROM tmp_{}
#     where op != 'd'
#     order by ts_ms asc
# """
# ENABLED = 'TRUE'
# SCHEMA = 'some_schema'

: 

In [None]:
# Batch-read the whole Kafka topic into a DataFrame.
# NOTE: the Kafka source options are spelled `startingOffsets` / `endingOffsets`
# (plural). The singular spellings are not recognised option names, so the
# requested offset range was never actually applied.
raw = (spark.read
  .format("kafka")
  .option("kafka.bootstrap.servers", "kafkabrokeraddress:9092")
  .option("subscribe", "topic")
  .option("startingOffsets", "earliest")
  .option("endingOffsets", "latest")
  .option("kafka.group.id", "dbs_consumer1")
  .load()
)

In [None]:
from pyspark.sql.functions import max as sql_max
# DDL schema of the JSON document carried in the Kafka `value` column.
# Declared once so the three extracted fields cannot drift apart.
PAYLOAD_SCHEMA = (
    "STRUCT<payload: STRUCT<"
    "after: MAP<STRING, STRING>, "
    "op: STRING, "
    "ts_ms: DOUBLE"
    ">>"
)

# Parse the JSON a single time (the schema literal is single-quoted so it is
# always read as a SQL string) and keep the Kafka record timestamp,
# converted from seconds to milliseconds, as sync_ts.
parsed = raw.selectExpr(
    f"from_json(cast(value as string), '{PAYLOAD_SCHEMA}').payload as payload",
    "cast(timestamp as double) * 1000 as sync_ts",
)

# Flatten to the columns the downstream query expects: after, ts_ms, op, sync_ts.
df = parsed.selectExpr(
    "payload.after as after",
    "payload.ts_ms as ts_ms",
    "payload.op as op",
    "sync_ts",
)



In [None]:
# Largest sync_ts in this batch; it is written to the control table further down.
max_row = df.agg(sql_max("sync_ts").alias("max_value")).collect()[0]
max_value = max_row["max_value"]


1698243742948.0


In [None]:

# Full reload: rebuild the Delta files from the Kafka snapshot, then
# (re)register the table on top of them. Skipped unless ENABLED == 'TRUE'.
if ENABLED == 'TRUE':
    # `spark` is the session already used to read from Kafka, so no new
    # SparkSession is built here.

    # Single definition of the storage path and table identifier so the write
    # and the CREATE TABLE statement cannot point at different places.
    delta_location = f'/mnt/databricks/DELTA/{SCHEMA}/{TABLE_NAME}'
    final_table_name = f"{SCHEMA}.{TABLE_NAME}"

    # Expose the parsed events as tmp_<TABLE_NAME>, the view the QUERY template reads.
    df.createOrReplaceTempView('tmp_' + TABLE_NAME)

    # Run the QUERY template, keep only rows where rn == 1, then drop the helper column.
    extracted_df = (
        spark.sql(string_schema.format(TABLE_NAME))
        .filter(col("rn") == 1)
        .drop("rn")
    )
    display(extracted_df)

    # overwriteSchema lets a changed QUERY column list replace the old schema
    # instead of failing the overwrite with a schema-mismatch error.
    (extracted_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(delta_location))

    # The schema must exist before any statement references a table inside it.
    spark.sql(f"""
        CREATE SCHEMA IF NOT EXISTS {SCHEMA}
    """)

    spark.sql(f"""
        DROP TABLE IF EXISTS {final_table_name}
    """)

    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {final_table_name}
        USING DELTA
        LOCATION '{delta_location}'
    """)

    # if CDC == "TRUE":
    #     spark.sql(f"""
    #         ALTER TABLE {final_table_name} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
    #     """)

In [None]:
# Replace this table's row in default.control_table with the batch's max_value.
# The delete and insert live in one cell and are guarded together:
#   * nothing is recorded when the load was skipped (ENABLED != 'TRUE'), so the
#     stored timestamp never moves past data that was not actually written;
#   * an empty batch (max_value is None) fails *before* the delete, so the
#     existing control row is not lost.
if ENABLED == 'TRUE':
    if max_value is None:
        raise ValueError(
            f"No Kafka records were read for {SCHEMA}.{TABLE_NAME}; "
            "the existing control_table entry was left untouched."
        )

    spark.sql(f"""
          delete from default.control_table 
          where schema_name = '{SCHEMA}' and table_name = '{TABLE_NAME}'
          """)

    spark.sql(f"""
          insert into  default.control_table values ('{SCHEMA}', '{TABLE_NAME}', {max_value})
          """)

Out[29]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]