In [0]:
%run "../bronze_to_silver/config"

In [0]:
# Read every parquet part file of the silver-layer loan_payments dataset.
# NOTE(review): silver_folder_path is presumably defined by the config notebook run above — confirm.
silver_loan_payments_glob = f"{silver_folder_path}/loan_payments/*.parquet"
df_loan_payments_silver = spark.read.parquet(silver_loan_payments_glob)

In [0]:
from pyspark.sql.functions import coalesce, col, concat, crc32, current_timestamp, lit

In [0]:
# Add a change-detection hash (src_hashkey) computed over the payment columns.
#
# Each column is cast to string and NULL is replaced by "" before concatenation.
# Spark's concat() returns NULL as soon as any input is NULL, which would produce a
# NULL hash; a NULL hash never compares equal to the hash stored in the target, so
# such rows would be treated as "changed" and re-merged on every run.
# For rows without NULLs the concatenated string (and therefore the hash) is unchanged.
# Caveat: a NULL value and an empty string now hash identically.
hash_input_columns = ["payment_id", "loan_id", "payment_date", "payment_amount"]
df_src_hash = df_loan_payments_silver.withColumn(
    "src_hashkey",
    crc32(
        concat(
            *[coalesce(col(name).cast("string"), lit("")) for name in hash_input_columns]
        )
    ),
)

In [0]:
from delta.tables import DeltaTable

In [0]:
# Open the gold-layer loan_payments Delta table by path. The DeltaTable handle is kept
# for the merge, and its DataFrame view is used to compare against the source rows.
gold_loan_payments_path = f"{gold_folder_path}/loan_payments/"
dbtable = DeltaTable.forPath(spark, gold_loan_payments_path)
df_tgt = dbtable.toDF()

In [0]:
# Keep only the source rows that have NO target row with both the same payment_id and
# the same hash, i.e. payments that are new or whose hashed columns differ.
src_rows = df_src_hash.alias("src")
tgt_rows = df_tgt.alias("tgt")
same_payment = col("src.payment_id") == col("tgt.payment_id")
same_hash = col("src.src_hashkey") == col("tgt.hashkey")
df_final = src_rows.join(tgt_rows, on=same_payment & same_hash, how="anti")

In [0]:
# Upsert the filtered source rows into the gold Delta table, matching on payment_id.

# Columns copied straight from the source row in both the update and the insert branch.
payment_column_map = {
    "tgt.payment_id": "src.payment_id",
    "tgt.loan_id": "src.loan_id",
    "tgt.payment_date": "src.payment_date",
    "tgt.payment_amount": "src.payment_amount",
    "tgt.hashkey": "src.src_hashkey",
}

# Existing payment_id: overwrite the data columns and stamp the "updated" audit fields.
update_assignments = {
    **payment_column_map,
    "tgt.updateddate": current_timestamp(),
    "tgt.updatedby": lit("databricks-updated"),
}

# Unknown payment_id: insert the row and stamp both "created" and "updated" audit fields.
insert_assignments = {
    **payment_column_map,
    "tgt.createddate": current_timestamp(),
    "tgt.createdby": lit("databricks"),
    "tgt.updateddate": current_timestamp(),
    "tgt.updatedby": lit("databricks"),
}

(
    dbtable.alias("tgt")
    .merge(df_final.alias("src"), "src.payment_id = tgt.payment_id")
    .whenMatchedUpdate(set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)

In [0]:
%sql
-- Display all rows of the loan_payments table ordered by payment_id, to inspect the result.
-- NOTE(review): presumably loan_payments is the table registered over the gold Delta path merged above; confirm.
SELECT * FROM loan_payments ORDER BY payment_id;