In [1]:
# import libraries

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, lit, count, when, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, BooleanType
from datetime import datetime
from delta.tables import DeltaTable
import os


StatementMeta(, b8fa049e-3950-4552-935d-91b906435247, 3, Finished, Available, Finished)

In [None]:
# initialize session
# Reuse the running Spark session if there is one, otherwise create it
# under the application name "run_materialization".
spark = (
    SparkSession.builder
    .appName("run_materialization")
    .getOrCreate()
)

StatementMeta(, , -1, Waiting, , Waiting)

#### Input parameters

In [None]:
# --- Location: used to build the OneLake abfss:// table paths ---
workspace = "BUNN_Foundation_NONPROD"
lakehouse = "silver_sapecc_lakehouse"

# --- Source object (schema / table that is read) ---
inputsourceschema = "materialized_etl"
inputsource = "orders_etl"

# --- Target object (schema / table that is written) ---
outputtargetschema = "materialized_t"
outputtarget = "orders"

# --- Run options ---
# 1 -> reset control_start_timestamp on the matching materialization_control row
update_control_table = 0
# "TR" = truncate and reload; the data-load cell rejects any other value
load_option = "TR"

# Start time of this run; written to the log as job_run_timestamp
run_start = datetime.now()


StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# insert into materialization_log
# Records the "begin materialization" step (run_id 10) for this run.

begin_log_sql = f"""
INSERT INTO materialization_log (schema_name, table_name, job_run_timestamp, run_id, run_step, run_timestamp, statement_text, lakehouse_name)
VALUES ('{inputsourceschema}', '{inputsource}', '{run_start}', 10, 'begin materialization', current_timestamp(), '----------', '{lakehouse}')
"""
spark.sql(begin_log_sql)

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# update control table
#
# Three mutually exclusive cases:
#   1. update_control_table == 1      -> reset control_start_timestamp to now
#   2. load_option is an incremental  -> default a NULL control_start_timestamp
#      option (currently 'CLDUI')        to 1990-01-01
#   3. anything else                  -> insert a control row if none exists

# Load options that take the "default the start timestamp" path.
# This must be a tuple: `x in ('CLDUI')` tests against a plain string and is
# therefore a substring match ('C', 'DUI' and even '' would all pass).
# NOTE(review): add further options here if more than 'CLDUI' was intended.
control_default_load_options = ('CLDUI',)

if update_control_table == 1:
    # update materialization_control if update_control_table = 1
    spark.sql(f"""
    UPDATE materialization_control
    SET control_start_timestamp = current_timestamp()
    WHERE etl_database = '{inputsourceschema}'
    AND etl_view = '{inputsource}'
    AND lakehouse_name = '{lakehouse}'
    AND load_option = '{load_option}'
    """)
elif load_option in control_default_load_options:
    # update materialization_control for specific load options
    spark.sql(f"""
    UPDATE materialization_control
    SET control_start_timestamp = CASE WHEN control_start_timestamp IS NULL THEN '1990-01-01 00:00:00' ELSE control_start_timestamp END
    WHERE etl_database = '{inputsourceschema}'
    AND etl_view = '{inputsource}'
    AND lakehouse_name = '{lakehouse}'
    AND load_option = '{load_option}'
    """)
else:
    # Insert into materialization_control if no matching record exists
    spark.sql(f"""
    INSERT INTO materialization_control (etl_database, etl_view, status_code, load_option, extract_start_timestamp, control_start_timestamp, lakehouse_name)
    SELECT '{inputsourceschema}', '{inputsource}', 'active', '{load_option}', '{run_start}', '{run_start}', '{lakehouse}'
    WHERE NOT EXISTS (
        SELECT 1
        FROM materialization_control
        WHERE etl_database = '{inputsourceschema}'
        AND etl_view = '{inputsource}'
        AND load_option = '{load_option}'
        AND lakehouse_name = '{lakehouse}'
    )
    """)

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# update materialization load table execution times
# Stamps last_execution on the materialization_load row that matches this
# lakehouse / source / target / load_option combination.

touch_last_execution_sql = f"""
update materialization_load
    set last_execution = current_timestamp
    WHERE 1=1
    and lakehouse_name = '{lakehouse}'
    and input_source_schema = '{inputsourceschema}'
    and input_source = '{inputsource}'
    and output_target_schema = '{outputtargetschema}'
    and output_target = '{outputtarget}'
    and load_option = '{load_option}'
"""
spark.sql(touch_last_execution_sql)

StatementMeta(, , -1, Waiting, , Waiting)

#### Metadata checks

In [None]:
# define input/output path

# All three tables live in the same workspace on OneLake.
onelake_root = f"abfss://{workspace}@onelake.dfs.fabric.microsoft.com"
# Source and target sit in the lakehouse selected by the input parameters.
lakehouse_tables_root = f"{onelake_root}/{lakehouse}.Lakehouse/Tables"

source_table_path = f"{lakehouse_tables_root}/{inputsourceschema}/{inputsource}"
target_table_path = f"{lakehouse_tables_root}/{outputtargetschema}/{outputtarget}"
# The log table is always read from / written to utilities_lakehouse.
log_table_path = f"{onelake_root}/utilities_lakehouse.Lakehouse/Tables/materialization_log"

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# define log schema explicitly

# (column name, Spark type) pairs in the order the log rows are built.
_log_columns = [
    ("lakehouse_name", StringType()),
    ("schema_name", StringType()),
    ("table_name", StringType()),
    ("job_run_timestamp", TimestampType()),
    ("run_id", IntegerType()),
    ("run_step", StringType()),
    ("run_timestamp", TimestampType()),
    ("record_count", IntegerType()),
    ("step_fail", StringType()),
    ("statement_text", StringType()),
]

# Every log column is nullable.
log_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _log_columns]
)

# insert log function

def insert_log(lakehouse_name, schema_name, table_name, job_run_timestamp,
             run_id, run_step, run_timestamp, record_count, step_fail, statement_text):
    """Append a single row to the materialization log Delta table.

    The row is built with the explicit ``log_schema`` and appended to the
    Delta table at ``log_table_path``.

    Args:
        lakehouse_name: Lakehouse the logged step belongs to.
        schema_name: Schema of the object being processed.
        table_name: Name of the object being processed.
        job_run_timestamp: Start timestamp of the run (same for all steps of a run).
        run_id: Integer code identifying the step / outcome.
        run_step: Human-readable step description.
        run_timestamp: Timestamp of this log entry.
        record_count: Integer count for the step, or None.
        step_fail: Failure marker, or None. Stored in a string column;
            non-string values (e.g. the bool ``True``) are converted with
            ``str()`` so every caller produces the same text.
        statement_text: Free-text detail (statement or error message), or None.
    """
    # Callers pass both the bool True and the string "True". Normalise here so
    # the string column always receives the same representation.
    if step_fail is not None and not isinstance(step_fail, str):
        step_fail = str(step_fail)

    log_data = [
        # Field order follows log_schema.
        Row(
            lakehouse_name=lakehouse_name,
            schema_name=schema_name,
            table_name=table_name,
            job_run_timestamp=job_run_timestamp,
            run_id=run_id,
            run_step=run_step,
            run_timestamp=run_timestamp,
            record_count=record_count,
            step_fail=step_fail,
            statement_text=statement_text
        )
    ]
    log_df = spark.createDataFrame(log_data, schema=log_schema)
    log_df.write.format("delta").mode("append").save(log_table_path)

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
#1 Check if the source table exists

# Only the existence check sits inside the try. The success log is written in
# the else branch, so a failure while logging is not reported as a missing
# source table.
try:
    if not mssparkutils.fs.exists(source_table_path):
        raise Exception(f"Source table {inputsource} does not exist.")
except Exception:
    # failure
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=21,
        run_step='Failure: Source object validation',
        run_timestamp=datetime.now(),
        record_count=1,
        # String, not bool: step_fail is a string column and the load cells
        # already pass "True".
        step_fail="True",
        statement_text=f"Check table {source_table_path}"
    )
    # Bare raise keeps the real cause. A missing table still surfaces as
    # "Source table ... does not exist."; any other error raised by the
    # existence check is no longer relabelled as a missing table.
    raise
else:
    # Log success
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=20,
        run_step='Success: Source object validation',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check table {source_table_path}"
    )


StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
#2 Check if the target table exists

# Only the existence check sits inside the try. The success log is written in
# the else branch, so a failure while logging is not reported as a missing
# target table.
try:
    if not mssparkutils.fs.exists(target_table_path):
        raise Exception(f"Target table {outputtarget} does not exist.")
except Exception:
    # failure
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=31,
        run_step='Failure: Target object validation',
        run_timestamp=datetime.now(),
        record_count=1,
        # String, not bool: step_fail is a string column and the load cells
        # already pass "True".
        step_fail="True",
        statement_text=f"Check table {target_table_path}"
    )
    # Bare raise keeps the real cause. A missing table still surfaces as
    # "Target table ... does not exist."; any other error raised by the
    # existence check is no longer relabelled as a missing table.
    raise
else:
    # success
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=30,
        run_step='Success: Target object validation',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check table {target_table_path}"
    )


StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
#3 source and target columns check
#
# Compares the NUMBER of columns in source and target (not their names or
# types). The success log is written in the else branch, so a failure while
# logging is not reported as a column mismatch.

try:
    source_df = spark.read.format("delta").load(source_table_path)
    target_df = spark.read.format("delta").load(target_table_path)

    source_column_count = len(source_df.columns)
    target_column_count = len(target_df.columns)

    if source_column_count != target_column_count:
        raise Exception("Missing column(s) in source/target object.")
except Exception:
    # failure
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=41,
        run_step='Failure: Missing column(s) in source/target object',
        run_timestamp=datetime.now(),
        record_count=1,
        # String, not bool: step_fail is a string column and the load cells
        # already pass "True".
        step_fail="True",
        statement_text=f"Check table {inputsource}"
    )
    # Bare raise keeps the real cause. A count mismatch still surfaces as
    # "Missing column(s) in source/target object."; a failure to read either
    # table is no longer relabelled as a column mismatch.
    raise
else:
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=40,
        run_step='Success: Column counts match',
        run_timestamp=datetime.now(),
        record_count=None,
        step_fail=None,
        statement_text=f"Check table {inputsource}"
    )

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# Source table: load the Delta table and print its schema
source_reader = spark.read.format("delta")
source_df = source_reader.load(source_table_path)
source_df.printSchema()

# Target table: load the Delta table and print its schema
target_reader = spark.read.format("delta")
target_df = target_reader.load(target_table_path)
target_df.printSchema()

#### Data loads

In [None]:
#1 Truncate and Reload (TR)
#
# Step 1 deletes every row of the target Delta table, step 2 appends the full
# source table to it. Each step writes its own success / failure log row
# (101 / 191 for truncate, 102 / 192 for load).
#
# NOTE(review): delete and append are two separate operations. If the append
# fails, the target has already been emptied.

if load_option == "TR":
    try:
        # ---- Step 1: truncate target ----
        try:
            # Check if the path of the Delta table exists
            if DeltaTable.isDeltaTable(spark, target_table_path):
                # Load the Delta table
                delta_table = DeltaTable.forPath(spark, target_table_path)

                # Delete all rows from the target
                delta_table.delete("1 = 1")
                print("Truncate operation completed successfully")

                # Log success for truncate
                insert_log(
                    lakehouse_name=lakehouse,
                    schema_name=inputsourceschema,
                    table_name=inputsource,
                    job_run_timestamp=run_start,
                    run_id=101,
                    run_step="Success: Truncate target object",
                    run_timestamp=datetime.now(),
                    record_count=None,
                    step_fail=None,
                    statement_text=None
                )
            else:
                raise Exception(f"Delta table not found at path {target_table_path}")

        except Exception as truncate_error:
            # failure for truncate
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=191,
                run_step="Execution Error: Truncate target object",
                run_timestamp=datetime.now(),
                record_count=1,
                step_fail="True",
                statement_text=f"Error - {str(truncate_error)}"
            )
            # Printed code now matches the logged run_id (191).
            print(f"FAIL (191) Truncate failed. Error - {str(truncate_error)}")
            raise

        # ---- Step 2: load target from source ----
        try:
            input_df = spark.read.format("delta").load(source_table_path)
            input_df.write.format("delta").mode("append").save(target_table_path)

            # successful load
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=102,
                run_step="Success: Load target object",
                run_timestamp=datetime.now(),
                # Counts the source after the write, i.e. a second pass over it.
                record_count=input_df.count(),
                step_fail=None,
                statement_text=None
            )

        except Exception as load_error:
            # failure for load
            insert_log(
                lakehouse_name=lakehouse,
                schema_name=inputsourceschema,
                table_name=inputsource,
                job_run_timestamp=run_start,
                run_id=192,
                run_step="Execution Error: Load target object",
                run_timestamp=datetime.now(),
                record_count=1,
                step_fail="True",
                statement_text=f"Error - {str(load_error)}"
            )
            # Printed code now matches the logged run_id (192).
            print(f"FAIL (192) TR load failed. Error - {str(load_error)}")
            raise

    except Exception as general_error:
        # general failure
        # NOTE(review): every error reaching this handler was re-raised by one
        # of the step handlers above, so a failed run writes two log rows
        # (191 or 192, then 199). Confirm that the extra 199 row is intended.
        insert_log(
            lakehouse_name=lakehouse,
            schema_name=inputsourceschema,
            table_name=inputsource,
            job_run_timestamp=run_start,
            run_id=199,
            run_step="Execution Error: General failure in TR load process",
            run_timestamp=datetime.now(),
            record_count=1,
            step_fail="True",
            statement_text=f"Error - {str(general_error)}"
        )
        print(f"FAIL (199) General failure in TR load process. Error - {str(general_error)}")
        raise
else:
    # Unsupported load_option: log it, report it, then fail the cell.
    unsupported_error = ValueError(f" Unsupported load_option - {load_option}")
    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=999,
        run_step="Execution Error: Unsupported load_option",
        run_timestamp=datetime.now(),
        record_count=1,
        step_fail="True",
        statement_text=f"Error - {str(unsupported_error)}"
    )
    print(f"FAIL (999) Unsupported load_option. Error - {str(unsupported_error)}")
    raise unsupported_error

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# Load complete check
#
# For a TR run: write the "Load complete" log row (run_id 200), then stamp the
# control and load tables. One completion timestamp is captured up front and
# reused, so the log row, extract_end_timestamp and last_successful_execution
# all carry the same value instead of three slightly different clock readings.

if load_option in ["TR"]:
    run_end = datetime.now()

    insert_log(
        lakehouse_name=lakehouse,
        schema_name=inputsourceschema,
        table_name=inputsource,
        job_run_timestamp=run_start,
        run_id=200,
        run_step="Load complete",
        run_timestamp=run_end,
        record_count=None,
        step_fail=None,
        statement_text=None
    )

    # Update materialization_control table
    spark.sql(f"""
        UPDATE materialization_control
        SET 
            extract_start_timestamp = '{run_start}',
            extract_end_timestamp = '{run_end}'
        WHERE 1 = 1
        AND etl_database = '{inputsourceschema}'
        AND etl_view = '{inputsource}'
        AND load_option = '{load_option}'
        AND lakehouse_name = '{lakehouse}'
    """)

    # update materialization_load table
    spark.sql(f"""
        UPDATE materialization_load
        SET
            last_successful_execution = '{run_end}'
        WHERE 1 = 1
        AND lakehouse_name = '{lakehouse}'
        AND input_source_schema = '{inputsourceschema}'
        AND input_source = '{inputsource}'
        AND output_target_schema = '{outputtargetschema}'
        AND output_target = '{outputtarget}'
        AND load_option = '{load_option}'
    """)

    print(f"Load completed successfully for load_option: {load_option}.")

StatementMeta(, , -1, Waiting, , Waiting)

In [None]:
# stop spark session
#
# The session is deliberately NOT stopped here. The "Validate results" cells
# that follow still call spark.sql(...) and %%sql, so stopping the session at
# this point makes them fail when the notebook is run top to bottom.
# If an explicit stop is required, call spark.stop() in a new cell placed
# after the last cell that uses the session.

StatementMeta(, , -1, Waiting, , Waiting)

#### Validate results

In [None]:
# Verify the results in materialization_log
log_check_sql = f"""
SELECT * FROM materialization_log
WHERE schema_name = '{inputsourceschema}' AND table_name = '{inputsource}'
"""
log_rows = spark.sql(log_check_sql)
display(log_rows)

# Verify the results in materialization_control
control_check_sql = f"""
SELECT * FROM materialization_control
WHERE etl_database = '{inputsourceschema}' AND etl_view = '{inputsource}'
"""
control_rows = spark.sql(control_check_sql)
display(control_rows)

In [None]:
%%sql
-- Show the materialization_load rows for the silver_sapecc_lakehouse lakehouse.
-- The lakehouse name is a literal here; it is not taken from the notebook's
-- `lakehouse` input parameter.
SELECT *
FROM materialization_load
WHERE lakehouse_name = 'silver_sapecc_lakehouse'