In [65]:
# import libraries

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, lit, count, when, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, BooleanType
from datetime import datetime
from delta.tables import DeltaTable
import os


StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 67, Finished, Available, Finished)

In [66]:
# Obtain the Spark session for this notebook (getOrCreate returns the active one if present)
spark = (
    SparkSession.builder
    .appName("run_materialization")
    .getOrCreate()
)

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 68, Finished, Available, Finished)

#### Input parameters

In [67]:
# Fabric workspace name; used to build the OneLake abfss:// table paths
workspace = 'BUNN_Foundation_NONPROD'
# Lakehouse that holds both the source and the target table; also written to every log row
lakehouse = 'silver_sapecc_lakehouse'
# Schema and name of the source table that is read
inputsourceschema = 'materialized_etl'
inputsource = 'orders_etl'
# Schema and name of the target table that is loaded
outputtargetschema = 'materialized_t'
outputtarget = 'orders'
# 1 -> UPDATE control_start_timestamp on the matching materialization_control row;
# any other value -> INSERT a control row only when no matching row exists yet
update_control_table = 0
# Load strategy; the load cells compare it against "TR", "LDUI", "DLDUI" and "CLDUI"
load_option = 'CLDUI' 
# Captured once per run and passed as job_run_timestamp on the log rows written by this notebook
run_start = datetime.now()

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 69, Finished, Available, Finished)

In [68]:
# Begin materialization process: write the "begin materialization" row (run_id 10)
# NOTE(review): this statement targets the table name materialization_log, while insert_log
# appends to a Delta path under utilities_lakehouse -- confirm both resolve to the same table.

begin_log_sql = f"""
INSERT INTO materialization_log (schema_name, table_name, job_run_timestamp, run_id, run_step, run_timestamp, statement_text, lakehouse_name)
VALUES ('{inputsourceschema}', '{inputsource}', '{run_start}', 10, 'begin materialization', current_timestamp(), null, '{lakehouse}')
"""
spark.sql(begin_log_sql)

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 70, Finished, Available, Finished)

DataFrame[]

In [69]:
# Maintain the materialization_control row for this source view / load option / lakehouse

if update_control_table != 1:
    # No refresh requested: seed a control row, but only when no matching row exists yet
    query = f"""
    INSERT INTO materialization_control (etl_database, etl_view, status_code, load_option, extract_start_timestamp, control_start_timestamp, lakehouse_name)
    SELECT '{inputsourceschema}', '{inputsource}', 'active', '{load_option}', '{run_start}', null, '{lakehouse}'
    WHERE NOT EXISTS (
        SELECT 1
        FROM materialization_control
        WHERE etl_database = '{inputsourceschema}'
        AND etl_view = '{inputsource}'
        AND load_option = '{load_option}'
        AND lakehouse_name = '{lakehouse}'
    )
    """
    spark.sql(query)
else:
    # Refresh requested (update_control_table == 1): stamp the matching row with the current time
    query = f"""
    UPDATE materialization_control
    SET control_start_timestamp = current_timestamp()
    WHERE etl_database = '{inputsourceschema}'
    AND etl_view = '{inputsource}'
    AND lakehouse_name = '{lakehouse}'
    AND load_option = '{load_option}'
    """
    spark.sql(query)

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 71, Finished, Available, Finished)

In [70]:
# Stamp last_execution on the materialization_load row that describes this source/target/load option

load_update_sql = f"""
update materialization_load
    set last_execution = current_timestamp
    WHERE 1=1
    and lakehouse_name = '{lakehouse}'
    and input_source_schema = '{inputsourceschema}'
    and input_source = '{inputsource}'
    and output_target_schema = '{outputtargetschema}'
    and output_target = '{outputtarget}'
    and load_option = '{load_option}'
"""
spark.sql(load_update_sql)

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 72, Finished, Available, Finished)

DataFrame[num_affected_rows: bigint]

#### Metadata checks

In [71]:
# Build the OneLake (abfss) locations of the source table, the target table and the log table

onelake_root = f"abfss://{workspace}@onelake.dfs.fabric.microsoft.com"
lakehouse_tables_root = f"{onelake_root}/{lakehouse}.Lakehouse/Tables"

source_table_path = f"{lakehouse_tables_root}/{inputsourceschema}/{inputsource}"
target_table_path = f"{lakehouse_tables_root}/{outputtargetschema}/{outputtarget}"
# The log table sits in a fixed lakehouse (utilities_lakehouse) of the same workspace
log_table_path = f"{onelake_root}/utilities_lakehouse.Lakehouse/Tables/materialization_log"

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 73, Finished, Available, Finished)

In [72]:
# define log schema explicitly
# (field order matters: insert_log builds its Row in exactly this order)

log_schema = StructType([
    StructField("lakehouse_name", StringType(), True),
    StructField("schema_name", StringType(), True),
    StructField("table_name", StringType(), True),
    StructField("job_run_timestamp", TimestampType(), True),
    StructField("run_id", IntegerType(), True), 
    StructField("run_step", StringType(), True),
    StructField("run_timestamp", TimestampType(), True),
    StructField("record_count", IntegerType(), True),
    StructField("step_fail", StringType(), True),
    StructField("statement_text", StringType(), True)
])

# insert log function

def insert_log(lakehouse_name, schema_name, table_name, job_run_timestamp,
             run_id, run_step, run_timestamp, record_count, step_fail, statement_text):
    """Append a single row to the Delta log table stored at ``log_table_path``.

    Args:
        lakehouse_name: Lakehouse the logged step belongs to.
        schema_name: Schema name recorded for the step.
        table_name: Table name recorded for the step.
        job_run_timestamp: Timestamp identifying the whole notebook run.
        run_id: Integer step code.
        run_step: Human-readable step description.
        run_timestamp: Timestamp of this individual step.
        record_count: Integer count associated with the step, or None.
        step_fail: Failure marker, or None. The column is a string; non-string
            values (callers pass both ``True`` and ``"True"``) are converted with
            ``str()`` so the same marker is always stored with the same spelling.
        statement_text: Free-text detail (statement or error text), or None.

    Raises:
        Whatever Spark raises when the row does not fit ``log_schema`` or the
        append to ``log_table_path`` fails.
    """
    # Normalise in Python rather than letting Spark stringify a raw bool, which is
    # not guaranteed to produce the same spelling as the "True" literal used elsewhere.
    if step_fail is not None and not isinstance(step_fail, str):
        step_fail = str(step_fail)

    log_data = [
        Row(
            lakehouse_name=lakehouse_name,
            schema_name=schema_name,
            table_name=table_name,
            job_run_timestamp=job_run_timestamp,
            run_id=run_id,
            run_step=run_step,
            run_timestamp=run_timestamp,
            record_count=record_count,
            step_fail=step_fail,
            statement_text=statement_text
        )
    ]
    log_df = spark.createDataFrame(log_data, schema=log_schema)
    log_df.write.format("delta").mode("append").save(log_table_path)

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 74, Finished, Available, Finished)

In [73]:
#1 Check if the source table exists
# Logs run_id 20 on success and run_id 21 on failure, then aborts the run on failure.
source_missing = False  # becomes True only when the existence check itself reports "not found"
try:
    source_missing = not mssparkutils.fs.exists(source_table_path)
    if source_missing:
        raise Exception(f"Source table {inputsource} does not exist.")
    # Log success
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=20,
        run_step='Success: Source object validation',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check table {source_table_path}"
    )
except Exception as e:
    # Log failure (step_fail is a string, matching the log schema and the load cells)
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=21,
        run_step='Failure: Source object validation',
        run_timestamp=datetime.now(),
        record_count=1,
        step_fail="True",
        statement_text=f"Check table {source_table_path}"
    )
    if source_missing:
        # Genuine "table not found": re-raise the accurate message with its traceback
        raise
    # Any other error (storage, permissions, logging) must not be reported as a missing
    # table; keep the real cause attached.
    raise Exception(f"Source object validation failed for {inputsource}: {e}") from e


StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 75, Finished, Available, Finished)

In [74]:
#2 Check if the target table exists
# Logs run_id 30 on success and run_id 31 on failure, then aborts the run on failure.
# Log rows are keyed by the source schema/table, as in the other validation steps.
target_missing = False  # becomes True only when the existence check itself reports "not found"
try:
    target_missing = not mssparkutils.fs.exists(target_table_path)
    if target_missing:
        raise Exception(f"Target table {outputtarget} does not exist.")
    # Log success
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=30,
        run_step='Success: Target object validation',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check table {target_table_path}"
    )
except Exception as e:
    # Log failure (step_fail is a string, matching the log schema and the load cells)
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=31,
        run_step='Failure: Target object validation',
        run_timestamp=datetime.now(),
        record_count=1,
        step_fail="True",
        statement_text=f"Check table {target_table_path}"
    )
    if target_missing:
        # Genuine "table not found": re-raise the accurate message with its traceback
        raise
    # Any other error (storage, permissions, logging) must not be reported as a missing
    # table; keep the real cause attached.
    raise Exception(f"Target object validation failed for {outputtarget}: {e}") from e


StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 76, Finished, Available, Finished)

In [75]:
#3 Column counts mismatch
# Loads source_df / target_df (reused by the following validation cells) and compares
# the number of columns. Logs run_id 40 on success and run_id 41 on failure.
column_count_mismatch = False  # becomes True only when the counts were read and differ
try:
    source_df = spark.read.format("delta").load(source_table_path)
    target_df = spark.read.format("delta").load(target_table_path)

    source_column_count = len(source_df.columns)
    target_column_count = len(target_df.columns)

    column_count_mismatch = source_column_count != target_column_count
    if column_count_mismatch:
        raise Exception(
            f"Column count mismatch for {inputsource}: "
            f"source has {source_column_count} column(s), target has {target_column_count}"
        )
    # Log success
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=40,
        run_step='Success: Column counts match',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check table {inputsource}"
    )
except Exception as e:
    # Log failure (step_fail is a string, matching the log schema and the load cells)
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=41,
        run_step='Failure: Missing column(s) in source/target object',
        run_timestamp=datetime.now(),
        record_count=1,
        step_fail="True",
        statement_text=f"Check table {inputsource}"
    )
    if column_count_mismatch:
        # Real mismatch: re-raise the message that carries both counts
        raise
    # Read or logging errors are not a column mismatch; surface the real cause instead
    raise Exception(f"Column count check failed for {inputsource}: {e}") from e

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 77, Finished, Available, Finished)

In [76]:
#4 Column name mismatch
# Compares the column-name sets of source_df and target_df (loaded by the column-count cell).
# Logs run_id 50 on success and run_id 51 on failure.
try:
    source_columns = set(source_df.columns)
    target_columns = set(target_df.columns)

    # names present on exactly one side
    column_name_mismatches = source_columns.symmetric_difference(target_columns)

    if column_name_mismatches:
        # sorted so the logged / raised text is stable between runs (set order is arbitrary)
        mismatch_details = ", ".join(sorted(column_name_mismatches))

        # log failure (step_fail is a string, matching the log schema and the load cells)
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=51,
            run_step='Failure: Column names mismatch',
            run_timestamp=datetime.now(),
            record_count=len(column_name_mismatches),
            step_fail="True",
            statement_text=f"Column name mismatches found {mismatch_details}"
        )
        # Raise exception with the mismatch details
        raise Exception(f"Column name mismatches found {mismatch_details}")

    # Log success if no mismatches are found
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=50,
        run_step='Success: Column names match',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check column names for {inputsource}"
    )
except Exception as e:
    # Re-label every error from this step (including the mismatch raised above) while
    # keeping the original exception chained for its traceback
    raise Exception(f"Error during column name check: {str(e)}") from e

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 78, Finished, Available, Finished)

In [77]:
#5 Column datatype mismatch
# For every column present in both source_df and target_df, compares the string form of
# the Spark data types. Logs run_id 60 on success and run_id 61 on failure.
try:
    data_type_mismatches = []
    target_column_names = set(target_df.columns)  # set lookup instead of a list scan per column

    for column in source_df.columns:
        if column not in target_column_names:
            continue
        source_type = str(source_df.schema[column].dataType)
        target_type = str(target_df.schema[column].dataType)
        if source_type != target_type:
            data_type_mismatches.append((column, source_type, target_type))

    if data_type_mismatches:
        # one line per mismatched column
        mismatch_details = "\n".join(
            f"Column: {column_name}, Source Data Type: {src_type}, Target Data Type: {tgt_type}"
            for column_name, src_type, tgt_type in data_type_mismatches
        )

        # log failure (step_fail is a string, matching the log schema and the load cells)
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=61,
            run_step='Failure: Column data type mismatch',
            run_timestamp=datetime.now(),
            record_count=len(data_type_mismatches),
            step_fail="True",
            statement_text=f"Data type mismatches found in the following columns:\n{mismatch_details}"
        )
        raise Exception(f"Data type mismatches found in the following columns:\n{mismatch_details}")

    # log success
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=60,
        run_step='Success: Column data types match',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check column data types for {inputsource}"
    )
except Exception as e:
    # Re-label every error from this step (including the mismatch raised above) while
    # keeping the original exception chained for its traceback
    raise Exception(f"Error during data type check {str(e)}") from e

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 79, Finished, Available, Finished)

In [None]:
# Show the source schema (re-reads the Delta table at source_table_path)
source_df = spark.read.load(source_table_path, format="delta")
source_df.printSchema()

# Show the target schema (re-reads the Delta table at target_table_path)
target_df = spark.read.load(target_table_path, format="delta")
target_df.printSchema()

#### Data loads

#### 1. Truncate and Reload

In [78]:
# Truncate and Reload (TR)
# Step 1 deletes every row of the target Delta table; step 2 appends the full source table.
# Each step logs its own success (101 / 102) or failure (191 / 192) row; a failure aborts the run.
# NOTE(review): delete + append are two separate Delta commits, so a failed load leaves the
# target empty -- consider a single overwrite if that window is not acceptable.
if load_option == "TR":
    # ---- Step 1: truncate -------------------------------------------------
    try:
        # Load delta table
        delta_table = DeltaTable.forPath(spark, target_table_path)

        # Delete all rows from the delta table
        delta_table.delete("1 = 1")
    except Exception as truncate_error:
        # Log failure for truncate
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=191,
            run_step="Execution Error: Truncate target object",
            run_timestamp=datetime.now(),
            record_count=1,
            step_fail="True",
            statement_text=f"Error - {str(truncate_error)}"
        )
        # printed code now matches the logged run_id
        print(f"FAIL (191) Truncate failed. Error - {str(truncate_error)}")
        raise  # bare raise keeps the original traceback
    else:
        # Success logging lives outside the try so that a logging problem is not
        # recorded as a truncate failure.
        print("Truncate operation completed successfully")
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=101,
            run_step="Success: Truncate target object",
            run_timestamp=datetime.now(),
            record_count=None,  # No record count for truncate
            step_fail=None,
            statement_text=None
        )

    # ---- Step 2: reload ---------------------------------------------------
    try:
        # Load source data and append to target table
        input_df = spark.read.format("delta").load(source_table_path)
        input_df.write.format("delta").mode("append").save(target_table_path)
    except Exception as load_error:
        # Log failure for load
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=192,
            run_step="Execution Error: Load target object",
            run_timestamp=datetime.now(),
            record_count=1,
            step_fail="True",
            statement_text=f"Error - {str(load_error)}"
        )
        # printed code now matches the logged run_id
        print(f"FAIL (192) Load target failed. Error - {str(load_error)}")
        raise
    else:
        # Log success for load; record_count is the source row count at logging time
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=102,
            run_step="Success: Load target object",
            run_timestamp=datetime.now(),
            record_count=input_df.count(),
            step_fail=None,
            statement_text=None
        )

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 80, Finished, Available, Finished)

#### 2. LDUI (Logical Delete, Update & Insert)

In [79]:
# Logical delete, update and insert
# Merges the full source table into the target on the key columns registered in bunn_meta:
#   * matched rows whose non-key columns differ are updated and flagged action_type = 'U'
#   * unmatched source rows are inserted as-is
# Logs 201 / 202 on success and 291 / 292 on failure; any failure aborts the run.
# NOTE(review): action_type coming from the source is never applied to matched rows, so a
# source-side 'D' is not propagated here -- confirm that is intended.
if load_option == "LDUI":
    try:
        # Query metadata table for the key-column mapping of this source/target pair
        meta_query = f"""
        SELECT source_key, target_key
        FROM bunn_meta
        WHERE source_schema = '{inputsourceschema}'
        AND source_table = '{inputsource}'
        AND target_schema = '{outputtargetschema}'
        AND target_table = '{outputtarget}'
        """
        meta_df = spark.sql(meta_query)

        # Collect once and reuse: avoids re-running the query for every count()/collect()
        primary_key_metadata = meta_df.collect()

        # Check PK's exist for target
        if not primary_key_metadata:
            missing_keys_message = (
                f"No primary key metadata found in bunn_meta table for "
                f"source {inputsourceschema}.{inputsource} and "
                f"target {outputtargetschema}.{outputtarget}."
            )
            # Log failure for primary key check
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=291,
                run_step="Execution Error: Primary key metadata not found",
                run_timestamp=datetime.now(),
                record_count=1,
                step_fail="True",
                statement_text=missing_keys_message
            )
            raise ValueError(missing_keys_message)

        # Log success for primary key check
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=201,
            run_step="Success: Primary key metadata found",
            run_timestamp=datetime.now(),
            record_count=len(primary_key_metadata),
            step_fail=None,
            statement_text=f"Primary keys: {primary_key_metadata}"
        )

        # Source and target key columns, pairwise aligned
        source_keys = [row["source_key"] for row in primary_key_metadata]
        target_keys = [row["target_key"] for row in primary_key_metadata]

        # Construct the merge condition dynamically
        merge_conditions = [
            f"target.{target_key} = source.{source_key}"
            for source_key, target_key in zip(source_keys, target_keys)
        ]
        merge_condition = " AND ".join(merge_conditions)

        # Load source (DataFrame) and target (DeltaTable)
        source_df = spark.read.format("delta").load(source_table_path)
        target_df = DeltaTable.forPath(spark, target_table_path)

        # Get a list of columns from source and target
        source_columns = source_df.columns
        target_columns = target_df.toDF().columns

        # Columns to compare / copy: everything except the keys and the audit columns
        audit_columns = ["action_type", "row_insert_timestamp", "row_update_timestamp"]
        non_key_columns = [
            column_name for column_name in source_columns
            if column_name not in source_keys and column_name not in audit_columns
        ]

        # Row changed if ANY non-key column differs. "<=>" is NULL-safe equality: with a
        # plain "<>" a change from/to NULL evaluates to NULL and the row was silently skipped.
        update_conditions = [
            f"NOT (source.{column_name} <=> target.{column_name})"
            for column_name in non_key_columns
        ]
        update_condition = " OR ".join(update_conditions)

        set_clause = {
            column_name: f"source.{column_name}" for column_name in non_key_columns
        }
        set_clause.update({
            "action_type": "'U'",  # Set action_type to 'U' for updates
            "row_update_timestamp": "current_timestamp()"  # Update row_update_timestamp
        })

        # Perform merge operation
        merge_builder = target_df.alias("target").merge(
            source_df.alias("source"),
            merge_condition
        )
        if non_key_columns:
            # With no comparable columns nothing can have changed; skipping the clause also
            # avoids handing Delta an empty condition string.
            merge_builder = merge_builder.whenMatchedUpdate(
                condition=update_condition,
                set=set_clause
            )
        merge_builder.whenNotMatchedInsertAll().execute()

    except Exception as merge_error:
        # Log for failure
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=292,
            run_step="Execution Error: LDUI load failed",
            run_timestamp=datetime.now(),
            record_count=1,
            step_fail="True",
            statement_text=f"Error - {str(merge_error)}"
        )
        print(f"FAIL (292) LDUI load failed. Error - {str(merge_error)}")
        raise  # bare raise keeps the original traceback
    else:
        print("LDUI load completed successfully")

        # Log for success; record_count is the number of source rows read, not rows changed
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=202,
            run_step="Success: LDUI load completed",
            run_timestamp=datetime.now(),
            record_count=source_df.count(),
            step_fail=None,
            statement_text=None
        )

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 81, Finished, Available, Finished)

#### 3. DLDUI (Derived Logical Delete, Update & Insert)
#### Used to refresh aggregated or synthesized objects where no action type is available from source

In [80]:
# Derived logical delete, update and insert
# Merges the full source table into the target on the key columns registered in bunn_meta:
#   * matched rows whose non-key columns differ are updated and flagged action_type = 'U'
#   * unmatched source rows are inserted as-is
#   * target rows with no source match are flagged action_type = 'D' (derived logical delete)
# Logs 301 / 302 on success and 391 / 392 on failure; any failure aborts the run.
if load_option == "DLDUI":
    try:
        # Query metadata table for the key-column mapping of this source/target pair
        meta_query = f"""
        SELECT source_key, target_key
        FROM bunn_meta
        WHERE source_schema = '{inputsourceschema}'
        AND source_table = '{inputsource}'
        AND target_schema = '{outputtargetschema}'
        AND target_table = '{outputtarget}'
        """
        meta_df = spark.sql(meta_query)

        # Collect once and reuse: avoids re-running the query for every count()/collect()
        primary_key_metadata = meta_df.collect()

        # Check PK's exist for target
        if not primary_key_metadata:
            missing_keys_message = (
                f"No primary key metadata found in bunn_meta table for "
                f"source {inputsourceschema}.{inputsource} and "
                f"target {outputtargetschema}.{outputtarget}."
            )
            # Log failure for primary key check
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=391,
                run_step="Execution Error: Primary key metadata not found",
                run_timestamp=datetime.now(),
                record_count=1,
                step_fail="True",
                statement_text=missing_keys_message
            )
            raise ValueError(missing_keys_message)

        # Log success for primary key check
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=301,
            run_step="Success: Primary key metadata found",
            run_timestamp=datetime.now(),
            record_count=len(primary_key_metadata),
            step_fail=None,
            statement_text=f"Primary keys: {primary_key_metadata}"
        )

        # Source and target key columns, pairwise aligned
        source_keys = [row["source_key"] for row in primary_key_metadata]
        target_keys = [row["target_key"] for row in primary_key_metadata]

        # Construct the merge condition dynamically
        merge_conditions = [
            f"target.{target_key} = source.{source_key}"
            for source_key, target_key in zip(source_keys, target_keys)
        ]
        merge_condition = " AND ".join(merge_conditions)

        # Load source (DataFrame) and target (DeltaTable)
        source_df = spark.read.format("delta").load(source_table_path)
        target_df = DeltaTable.forPath(spark, target_table_path)

        # Get a list of columns from source and target
        source_columns = source_df.columns
        target_columns = target_df.toDF().columns

        # Columns to compare / copy: everything except the keys and the audit columns
        audit_columns = ["action_type", "row_insert_timestamp", "row_update_timestamp"]
        non_key_columns = [
            column_name for column_name in source_columns
            if column_name not in source_keys and column_name not in audit_columns
        ]

        # Row changed if ANY non-key column differs. "<=>" is NULL-safe equality: with a
        # plain "<>" a change from/to NULL evaluates to NULL and the row was silently skipped.
        update_conditions = [
            f"NOT (source.{column_name} <=> target.{column_name})"
            for column_name in non_key_columns
        ]
        update_condition = " OR ".join(update_conditions)

        set_clause = {
            column_name: f"source.{column_name}" for column_name in non_key_columns
        }
        set_clause.update({
            "action_type": "'U'",  # Set action_type to 'U' for updates
            "row_update_timestamp": "current_timestamp()"  # Update row_update_timestamp
        })

        # Perform merge operation
        merge_builder = target_df.alias("target").merge(
            source_df.alias("source"),
            merge_condition
        )
        if non_key_columns:
            # With no comparable columns nothing can have changed; skipping the clause also
            # avoids handing Delta an empty condition string.
            merge_builder = merge_builder.whenMatchedUpdate(
                condition=update_condition,
                set=set_clause
            )
        merge_builder.whenNotMatchedInsertAll(
        ).whenNotMatchedBySourceUpdate(
            # NULL-safe: rows whose action_type is NULL must be flagged too
            # ("action_type != 'D'" evaluates to NULL for them and skipped the row)
            condition="NOT (target.action_type <=> 'D')",
            set={
                "action_type": "'D'",  # Set action_type to 'D' for deleted records
                "row_update_timestamp": "current_timestamp()"  # Update row_update_timestamp
            }
        ).execute()

    except Exception as merge_error:
        # Log for failure
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=392,
            run_step="Execution Error: DLDUI load failed",
            run_timestamp=datetime.now(),
            record_count=1,
            step_fail="True",
            statement_text=f"Error - {str(merge_error)}"
        )
        print(f"FAIL (392) DLDUI load failed. Error - {str(merge_error)}")
        raise  # bare raise keeps the original traceback
    else:
        print("DLDUI load completed successfully")

        # Log for success; record_count is the number of source rows read, not rows changed
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=302,
            run_step="Success: DLDUI load completed",
            run_timestamp=datetime.now(),
            record_count=source_df.count(),
            step_fail=None,
            statement_text=None
        )

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 82, Finished, Available, Finished)

#### 4. CLDUI (Controlled Logical Delete, Update & Insert)
#### Performs targeted updates controlled by timestamp from last successful execution



In [81]:
# Controlled Logical delete, update and insert

if load_option == "CLDUI":
    try:
        try:
            # Query metadata table
            meta_query = f"""
            SELECT source_key, target_key
            FROM bunn_meta
            WHERE source_schema = '{inputsourceschema}'
            AND source_table = '{inputsource}'
            AND target_schema = '{outputtargetschema}'
            AND target_table = '{outputtarget}'
            """
            meta_df = spark.sql(meta_query)

            # Check PK's exist for target
            if meta_df.count() == 0:
                # Log failure for primary key check
                insert_log(
                    lakehouse_name=lakehouse,
                    schema_name=inputsourceschema,
                    table_name=inputsource,
                    job_run_timestamp=run_start,
                    run_id=491,
                    run_step="Execution Error: Primary key metadata not found",
                    run_timestamp=datetime.now(),
                    record_count=1,
                    step_fail="True",
                    statement_text=f"No primary key metadata found for target_schema={inputsourceschema}, and target_table={inputsource} in bunn_meta table."
                )
                raise ValueError(f"No primary key metadata found for target_schema={inputsourceschema}, and target_table={inputsource} in bunn_meta table.")

            # Log success for primary key check
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=401, 
                run_step="Success: Primary key metadata found",
                run_timestamp=datetime.now(),
                record_count=meta_df.count(),
                step_fail=None,
                statement_text=f"Primary keys: {meta_df.collect()}"
            )

           # Collect source and target key columns
            primary_key_metadata = meta_df.collect()
            source_keys = [row["source_key"] for row in primary_key_metadata]
            target_keys = [row["target_key"] for row in primary_key_metadata]

            # Construct the merge condition dynamically
            merge_conditions = [
                f"target.{target_key} = source.{source_key}"
                for source_key, target_key in zip(source_keys, target_keys)
            ]
            merge_condition = " AND ".join(merge_conditions)

            # Load source and target tables
            source_df = spark.read.format("delta").load(source_table_path)
            target_df = DeltaTable.forPath(spark, target_table_path)

            # Get a list of columns from source and target
            source_columns = source_df.columns
            target_columns = target_df.toDF().columns

            # Exclude audit columns
            non_key_columns = [
                col for col in source_columns
                if col not in source_keys  # Exclude audit key columns
                and col not in ["action_type", "row_insert_timestamp", "row_update_timestamp"] 
            ]

            # Construct the update condition to check if any non-key column has changed
            update_conditions = [
                f"source.{col} <> target.{col}" 
                for col in non_key_columns
            ]
            update_condition = " OR ".join(update_conditions)

            set_clause = {
                col: f"source.{col}" for col in non_key_columns  
            }
            set_clause.update({
                "action_type": "'U'",  # Set action_type to 'U' for updates
                "row_update_timestamp": "current_timestamp()"  # Update row_update_timestamp
            })

            # Fetch control_start_timestamp from materialization_control_table
            control_table_query = f"""
                SELECT control_start_timestamp
                FROM materialization_control
                WHERE etl_database = '{inputsourceschema}'
                AND etl_view = '{inputsource}'
                and load_option ='{load_option}'
                """

            control_df = spark.sql(control_table_query)  
            control_start_timestamp = "1900-01-01 00:00:00"

            # Check if control_df has rows
            if control_df.count() > 0:
                # Get the first row
                row = control_df.collect()[0]
                # Check if the value is not None before using it
                if row["control_start_timestamp"] is not None:
                    control_start_timestamp = row["control_start_timestamp"]

            # Pull incremental data from the source table
            source_df = spark.read.format("delta").load(source_table_path) \
            .filter(f"row_update_timestamp >= '{control_start_timestamp}'")

            # Perform merge operation
            target_df.alias("target").merge(
                source_df.alias("source"),
                merge_condition
            ).whenMatchedUpdate(
                condition=update_condition, 
                set=set_clause  
            ).whenNotMatchedInsertAll(
            ).execute()

            print("CLDUI load completed successfully")

            # Log for success
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=402, 
                run_step="Success: CLDUI load completed",
                run_timestamp=datetime.now(),
                record_count=source_df.count(),
                step_fail=None,
                statement_text=None
            )

        except Exception as merge_error:
            # Log for failure
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=492,
                run_step="Execution Error: CLDUI load failed",
                run_timestamp=datetime.now(),
                record_count=1,
                step_fail="True",
                statement_text=f"Error - {str(merge_error)}"
            )
            print(f"FAIL (492) CLDUI load failed. Error - {str(merge_error)}")
            raise merge_error

    except Exception as error:
        # Raise the error for general failure
        raise error

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 83, Finished, Available, Finished)

CLDUI load completed successfully


In [None]:
# Look up the incremental-load watermark stored for this source view and load option.
# NOTE(review): the UPDATE statements against materialization_control also filter on
# lakehouse_name; this lookup does not — confirm that is intended.
control_table_query = f"""
    SELECT control_start_timestamp
    FROM materialization_control
    WHERE etl_database = '{inputsourceschema}'
    AND etl_view = '{inputsource}'
    and load_option ='{load_option}'
    """
control_df = spark.sql(control_table_query)

# Fallback watermark used when there is no control row or its timestamp is NULL.
control_start_timestamp = "1900-01-01 00:00:00"

# first() returns None for an empty result, so a single Spark job is enough here
# (no count() followed by a collect()).
control_row = control_df.first()
if control_row is not None and control_row["control_start_timestamp"] is not None:
    control_start_timestamp = control_row["control_start_timestamp"]

# Pull incremental data from the source table: keep rows updated at or after the watermark.
source_df = (
    spark.read.format("delta")
    .load(source_table_path)
    .filter(f"row_update_timestamp >= '{control_start_timestamp}'")
)


In [43]:
# Debug helper: show which watermark the incremental load would start from.
# NOTE(review): the UPDATE statements against materialization_control also filter on
# lakehouse_name; this lookup does not — confirm that is intended.
control_table_query = f"""
    SELECT control_start_timestamp
    FROM materialization_control
    WHERE etl_database = '{inputsourceschema}'
    AND etl_view = '{inputsource}'
    AND load_option = '{load_option}'
"""

# Execute the query
control_df = spark.sql(control_table_query)

# Fallback watermark used when there is no control row or its timestamp is NULL.
control_start_timestamp = "1900-01-01 00:00:00"

# first() returns None for an empty result, so a single Spark job is enough here
# (no count() followed by a collect()).
control_row = control_df.first()
if control_row is not None and control_row["control_start_timestamp"] is not None:
    control_start_timestamp = control_row["control_start_timestamp"]

# Print the control_start_timestamp
print("Control Start Timestamp:", control_start_timestamp)

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 45, Finished, Available, Finished)

Control Start Timestamp: 1900-01-01 00:00:00


In [82]:
# Advance the watermark in materialization_control to the newest
# row_update_timestamp found in source_df.
#
# A global max() always yields exactly one row; its value is None when source_df
# is empty or when every row_update_timestamp is NULL. Testing for None therefore
# covers the "nothing to process" case with one Spark job and keeps the literal
# string 'None' from being written into the MERGE as the new watermark.
max_row_update_timestamp = source_df.selectExpr("max(row_update_timestamp)").first()[0]

if max_row_update_timestamp is not None:
    # Upsert the control row keyed on (etl_database, etl_view, load_option).
    # NOTE(review): lakehouse_name is not part of the match or the insert — confirm
    # that is intended, since other statements on this table filter on it.
    control_update_query = f"""
        MERGE INTO materialization_control AS target
        USING (
            SELECT '{inputsourceschema}' AS etl_database,
                   '{inputsource}' AS etl_view,
                   '{load_option}' AS load_option,
                   to_timestamp('{max_row_update_timestamp}') AS control_start_timestamp
        ) AS source
        ON target.etl_database = source.etl_database
           AND target.etl_view = source.etl_view
           AND target.load_option = source.load_option
        WHEN MATCHED THEN
            UPDATE SET target.control_start_timestamp = source.control_start_timestamp
        WHEN NOT MATCHED THEN
            INSERT (etl_database, etl_view, load_option, control_start_timestamp)
            VALUES (source.etl_database, source.etl_view, source.load_option, source.control_start_timestamp)
    """
    print(spark.sql(control_update_query))
else:
    # No usable timestamp: leave the stored watermark untouched.
    print("No records to process. Skipping update of control_start_timestamp.")

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 84, Finished, Available, Finished)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]


#### Exception for unsupported load option

In [83]:
# Guard: fail the run when load_option is outside the supported set.
supported_load_options = ("TR", "LDUI", "DLDUI", "CLDUI")

if load_option not in supported_load_options:
    unsupported_error = ValueError(f" Unsupported load_option - {load_option}")
    failure_text = str(unsupported_error)

    # Write the failure to the log (run_id 999) before aborting.
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=999,
        run_step="Execution Error: Unsupported load_option",
        run_timestamp=datetime.now(),
        record_count=1,  # failure entries are logged with a count of 1
        step_fail="True",
        statement_text=f"Error - {failure_text}"
    )
    print(f"FAIL (999) Unsupported load_option. Error - {failure_text}")
    raise unsupported_error

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 85, Finished, Available, Finished)

### Load complete

In [84]:
# Load complete check
# run_id written to the log for each supported load option when its load finishes.
completion_run_ids = {"TR": 200, "LDUI": 300, "DLDUI": 400, "CLDUI": 500}

if load_option in completion_run_ids:
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=completion_run_ids[load_option],
        run_step="Load complete",
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=None
    )

# NOTE(review): the table updates and the success message below run for CLDUI only.
# Their WHERE clauses and the message are parameterized on load_option, so this may
# have been meant for every load option — confirm before changing.
if load_option == "CLDUI":
    # Stamp the extract window on the matching materialization_control row.
    control_update_sql = f"""
        UPDATE materialization_control
        SET 
            extract_start_timestamp = '{run_start}',
            extract_end_timestamp = '{datetime.now()}'
        WHERE 1 = 1
        AND etl_database = '{inputsourceschema}'
        AND etl_view = '{inputsource}'
        AND load_option = '{load_option}'
        AND lakehouse_name = '{lakehouse}'
    """
    spark.sql(control_update_sql)

    # Record the time of this successful run on the matching materialization_load row.
    load_update_sql = f"""
        UPDATE materialization_load
        SET
            last_successful_execution = '{datetime.now()}'
        WHERE 1 = 1
        AND lakehouse_name = '{lakehouse}'
        AND input_source_schema = '{inputsourceschema}'
        AND input_source = '{inputsource}'
        AND output_target_schema = '{outputtargetschema}'
        AND output_target = '{outputtarget}'
        AND load_option = '{load_option}'
    """
    spark.sql(load_update_sql)

    print(f"Load completed successfully for load_option: {load_option}.")

StatementMeta(, 6f1f94b2-98cf-4d7d-9b79-b816427ec58c, 86, Finished, Available, Finished)

Load completed successfully for load_option: CLDUI.


#### Validate results

In [None]:
# Queries used to eyeball what this run wrote for the current source view.
log_check_sql = f"""
SELECT * FROM materialization_log
WHERE schema_name = '{inputsourceschema}' AND table_name = '{inputsource}'
"""

control_check_sql = f"""
SELECT * FROM materialization_control
WHERE etl_database = '{inputsourceschema}' AND etl_view = '{inputsource}'
"""

# Render the log rows first, then the control rows.
for check_sql in (log_check_sql, control_check_sql):
    display(spark.sql(check_sql))

In [None]:
%%sql
-- List the materialization_load rows registered for the silver_sapecc_lakehouse lakehouse.
SELECT *
FROM materialization_load
WHERE lakehouse_name = 'silver_sapecc_lakehouse'