In [0]:
# Declare the notebook's three text widgets, each defaulting to an empty string.
for widget_name in ('table_name', 'cdc_col', 'key_col'):
    dbutils.widgets.text(widget_name, '')

In [0]:
# Read the current widget values, in order: table name, change-tracking
# column name, and join-key column name.
table_name, cdc_col, key_col = [
    dbutils.widgets.get(widget_name)
    for widget_name in ('table_name', 'cdc_col', 'key_col')
]


In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col, monotonically_increasing_id, current_timestamp

# Load the silver table `table_name` into its gold Delta table at `delta_path`.
#
# * Gold table exists  -> incremental load: take silver rows whose `cdc_col`
#   is newer than the newest value already in gold, left-join them to gold on
#   `key_col`, assign fresh surrogate keys to unmatched rows, and MERGE the
#   result into gold on `surrogate_key`.
# * Gold table missing -> initial load: copy every silver row, adding
#   `surrogate_key`, `created_date` and `updated_date`.
delta_path = f'abfss://gold@storagetemp.dfs.core.windows.net/{table_name}/'
silver_path = f'abfss://silver@storagetemp.dfs.core.windows.net/{table_name}/data/'

if DeltaTable.isDeltaTable(spark, delta_path):

    df = spark.read.format('delta').load(delta_path)

    # Both aggregates are None when the gold table has no rows (max_date is
    # also None when every `cdc_col` value is NULL).
    max_date = df.agg({cdc_col: 'max'}).collect()[0][0]
    max_surr_key = df.agg({'surrogate_key': 'max'}).collect()[0][0]

    # Without this fallback the key expression below would evaluate to NULL
    # for every new row.
    if max_surr_key is None:
        max_surr_key = 0

    incoming_data = spark.read.format('delta').load(silver_path)
    # Comparing against None yields NULL for every row and would silently
    # drop all incoming data, so only filter when a high-water mark exists.
    if max_date is not None:
        incoming_data = incoming_data.filter(col(cdc_col) > max_date)

    # Keep every incoming column and pick up the existing surrogate key and
    # audit timestamps from gold; these are NULL for keys gold has not seen.
    df_joined = incoming_data.alias("incoming_data").join(df.alias("sink"), on=key_col, how='left') \
        .select(
            *[col(f"incoming_data.{c}") for c in incoming_data.columns],
            col("sink.surrogate_key"),
            col("sink.created_date"),
            col("sink.updated_date")
        )

    # Rows with no match in gold are brand-new: stamp both audit columns.
    new_record = df_joined.filter(col("surrogate_key").isNull()) \
        .withColumn('created_date', current_timestamp()) \
        .withColumn('updated_date', current_timestamp())

    # Rows that matched keep their surrogate key and created_date; only
    # updated_date is refreshed.
    old_record = df_joined.filter(col("surrogate_key").isNotNull()) \
        .withColumn('updated_date', current_timestamp())

    # NOTE: monotonically_increasing_id() yields unique but not consecutive
    # values, so new keys are strictly greater than max_surr_key but may
    # contain gaps.
    new_surr_key_record = new_record.withColumn('surrogate_key', monotonically_increasing_id() + max_surr_key + 1)
    final_df = new_surr_key_record.unionByName(old_record)

    trg = DeltaTable.forPath(spark, delta_path)
    (
        trg.alias('trg')
        .merge(
            final_df.alias('final_df'),
            'trg.surrogate_key = final_df.surrogate_key'
        )
        # Only overwrite a gold row when the incoming version is newer.
        .whenMatchedUpdateAll(condition='trg.updated_date < final_df.updated_date')
        .whenNotMatchedInsertAll()
        .execute()
    )
else:

    # Initial load: surrogate keys start at 1.
    df = spark.read.format('delta').load(silver_path)
    df = df.withColumn('surrogate_key', monotonically_increasing_id() + 1) \
        .withColumn('created_date', current_timestamp()) \
        .withColumn('updated_date', current_timestamp())
    df.write.format('delta').mode('overwrite').save(delta_path)