In [None]:
# Imports
from pyspark.sql.functions import *
from pyspark.sql import Window
from delta.tables import *
import datetime
import time


In [None]:
# Enable vorder
# Sets the session-level Spark configuration key "spark.sql.parquet.vorder.enabled"
# to the string "true"; the setting applies to cells executed after this one.
# `spark` is not defined in this notebook - presumably the SparkSession injected by
# the hosting runtime.
# NOTE(review): presumably this turns on V-Order optimised Parquet writes - confirm
# the key name against the documentation of the runtime version in use.
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")


In [None]:
# Enable zorder
# Sets the session-level Spark configuration key "spark.sql.parquet.zorder.enabled"
# to the string "true".
# NOTE(review): could not confirm that this key is recognised by Spark / Delta Lake;
# Z-ordering is normally applied per table with "OPTIMIZE ... ZORDER BY (...)".
# Verify that this setting has any effect before relying on it.
spark.conf.set("spark.sql.parquet.zorder.enabled", "true")


In [None]:
# Set global variables
# intIndentationNumSpaces: number of spaces that make up one indentation level;
# read by fnGetIndentationString when building the prefix of log lines.
# The `global` statement is redundant when this cell runs directly at module level.
# NOTE(review): it would only matter if the cell were executed with separate
# global/local namespaces - confirm how this notebook is invoked before removing it.
global intIndentationNumSpaces
intIndentationNumSpaces = 2


In [None]:
# Returns the number of defined spaces for the intIndentationLevel
def fnGetIndentationString (intIndentationLevel):
    """Build the whitespace prefix used to indent log lines.

    Parameters:
        intIndentationLevel: Nesting depth of the caller. It is multiplied by the
            notebook-level setting intIndentationNumSpaces.

    Returns:
        A string made only of spaces; empty when the computed width is zero or negative.
    """
    # The notebook-level setting is only read here and is looked up at call time.
    intTotalSpaces = intIndentationNumSpaces * intIndentationLevel

    return " " * intTotalSpaces


In [None]:
# Return the duration between the parameter (fltStartTime) and current time as a string
def fnGetDurationAsString (fltStartTime):
    """Format the time elapsed since fltStartTime as 'D.HH:MM:SS.mmmh'.

    The value is computed with integer arithmetic on the driver, so no Spark
    query is issued just to format a log message.

    Parameters:
        fltStartTime: Start timestamp in seconds, as returned by time.time().

    Returns:
        A string such as '0.00:01:05.250h': days (not padded), then zero-padded
        hours, minutes, seconds and milliseconds, followed by a literal 'h'.
        A start time that lies in the future is reported as a zero duration.
    """
    # Get elapsed time
    fltElapsedTime = time.time() - fltStartTime

    # A clock adjustment can make the difference negative; report zero instead of
    # a malformed string.
    # NOTE: builtins such as max() and round() are avoided on purpose - this
    # notebook's "from pyspark.sql.functions import *" replaces them with Spark
    # column functions.
    if fltElapsedTime < 0:
        fltElapsedTime = 0.0

    # Round half-up to whole milliseconds first, so that a fraction of .9995s or
    # more carries into the seconds instead of being cut down to '.000'.
    intTotalMilliseconds = int(fltElapsedTime * 1000 + 0.5)

    # Split into the individual units
    intTotalSeconds, intMilliseconds = divmod(intTotalMilliseconds, 1000)
    intTotalMinutes, intSeconds      = divmod(intTotalSeconds, 60)
    intTotalHours, intMinutes        = divmod(intTotalMinutes, 60)
    intDays, intHours                = divmod(intTotalHours, 24)

    strDuration = f"{intDays}.{intHours:02d}:{intMinutes:02d}:{intSeconds:02d}.{intMilliseconds:03d}h"

    return strDuration


In [None]:
# Get the max value for column in table
def fnGetMaxColumnInTable (strColumnName, strTableName, intIndentationLevel = 0):
    """Return the maximum value of a column in a table as an integer.

    Start and finish messages (with timestamp and duration) are printed to stdout.

    Parameters:
        strColumnName: Column (or SQL expression) whose maximum is requested.
        strTableName: Table to query.
        intIndentationLevel: Indentation level of the log messages (default 0).

    Returns:
        The maximum cast to a 64-bit integer, or 0 when max() yields NULL
        (for example when the table has no rows).

    Note:
        strColumnName and strTableName are interpolated into the SQL text as-is,
        so they must come from trusted code, not from user input.
    """
    fltStartTime   = time.time()
    strNumSpaces   = fnGetIndentationString(intIndentationLevel)
    strCurrentDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"{strCurrentDate}: {strNumSpaces}Executing - fnGetMaxColumnInTable('{strColumnName}', '{strTableName}', '{intIndentationLevel}').")

    # Get max value
    # Cast to bigint instead of int: a 32-bit cast wraps around (or fails in ANSI
    # mode) for values above 2,147,483,647, e.g. on bigint key columns. The value
    # reaches Python as a plain int either way.
    intMax = (
        spark
            .sql(f"select bigint(ifnull(max({strColumnName}), 0)) as intMax from {strTableName}")
            .first()[0]
    )

    strCurrentDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    strDuration    = fnGetDurationAsString(fltStartTime)
    print(f"{strCurrentDate}: {strNumSpaces}Finished  - fnGetMaxColumnInTable('{strColumnName}', '{strTableName}', '{intIndentationLevel}') in '{strDuration}'.")

    return intMax


In [None]:
# Write the dataframe (dfTableData) into the delta table (strTableName) using mode (strMode)
def fnWriteToDeltaTable (strTableName, strMode, dfTableData, intIndentationLevel = 0):
    """Write dfTableData to the Delta table strTableName and log progress to stdout.

    Behaviour by case:
      * Table not found in the catalog: the data is saved in Delta format to
        'Tables/<strTableName>' without setting a write mode (strMode is not used).
      * Table exists and strMode == "overwrite": the data is saved to the same
        location with the "overwriteSchema" option set to "true".
      * Table exists and any other strMode: the data is written with
        insertInto(strTableName) using strMode.

    Parameters:
        strTableName: Name of the target table.
        strMode: Write mode handed to the DataFrame writer.
        dfTableData: DataFrame holding the rows to write.
        intIndentationLevel: Indentation level of the log messages (default 0).

    Returns:
        True. There is no error handling here, so failures of the Spark calls
        propagate to the caller as exceptions.
    """
    def fnNowAsString ():
        # Timestamp prefix shared by every log line of this function
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    fltStartTime  = time.time()
    strIndent     = fnGetIndentationString(intIndentationLevel)
    strCallAsText = f"fnWriteToDeltaTable('{strTableName}', '{strMode}', 'dataframe', '{intIndentationLevel}')"
    print(f"{fnNowAsString()}: {strIndent}Executing - {strCallAsText}.")

    # Storage format and target location of the table
    strDeltaFormat = "delta"
    strTablePath   = f"Tables/{strTableName}"

    blnTableExists = spark.catalog.tableExists(strTableName)

    if (not blnTableExists):
        # First load: create the table by saving the data to its location
        print(f"{fnNowAsString()}: {strIndent}  Creating table '{strTableName}'.")

        objWriter = dfTableData.write
        objWriter = objWriter.format(strDeltaFormat)
        objWriter.save(strTablePath)
    elif (strMode == "overwrite"):
        # Replace both the data and the schema of the existing table
        print(f"{fnNowAsString()}: {strIndent}  Recreated and Inserting table '{strTableName}'.")

        objWriter = dfTableData.write
        objWriter = objWriter.mode(strMode)
        objWriter = objWriter.option("overwriteSchema", "true")
        objWriter = objWriter.format(strDeltaFormat)
        objWriter.save(strTablePath)
    else:
        # Add the rows to the existing table
        print(f"{fnNowAsString()}: {strIndent}  Inserting table '{strTableName}'.")

        objWriter = dfTableData.write
        objWriter = objWriter.mode(strMode)
        objWriter = objWriter.format(strDeltaFormat)
        objWriter.insertInto(strTableName)

    strFinishedAt = fnNowAsString()
    strDuration   = fnGetDurationAsString(fltStartTime)
    print(f"{strFinishedAt}: {strIndent}Finished  - {strCallAsText} in '{strDuration}'.")

    return True
