In [1]:
import findspark
findspark.init(spark_home = "/home/thanhphat/BigData/spark-3.5.0-bin-hadoop3")

In [2]:
from pyspark.sql import SparkSession

import traceback
import pyspark.sql.functions as f
import pyspark.sql.types as t


project_name = "Global_Electronics_Retailer"

In [7]:
# Config
    # Number of executor: 2
    # 2 CPU for each executor
    # 2g memory for each executor

# Create SparkSession
spark = SparkSession.builder.master("local[4]") \
    .appName("Bronze_to_Silver") \
    .config("spark.sql.warehouse.dir", f"hdfs://localhost:9000/lakehouse/warehouse/LH_{project_name}") \
    .config("spark.sql.catalogImplementation", "hive").enableHiveSupport() \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.parquet.vorder.enabled", "true") \
    .config("spark.sql.shuffle.partitions", 100) \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.instances", "2") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()

24/06/28 22:08:36 WARN Utils: Your hostname, thanhphat-inspiron-5406-2n1 resolves to a loopback address: 127.0.1.1; using 192.168.1.8 instead (on interface wlp0s20f3)
24/06/28 22:08:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/thanhphat/BigData/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/thanhphat/.ivy2/cache
The jars for the packages stored in: /home/thanhphat/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f2315066-311d-4c12-802c-a5ac09d3a605;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 333ms :: artifacts dl 11ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   | 

In [None]:
from modules.Extraction import *
from modules.HDFSUtils import *
from modules.LogUtils import *
from modules.Metadata import *
from modules.Load import *


# Instance for modules
extraction = Extraction()
hdfsUtils = HDFSUtils()
logUtils = LogUtils() 
metadata = Metadata()
loadHive = Load()

# Define base_path
lakewarehouse_db = f"LH_{project_name}"
lakehouse_table_path = f"hdfs://localhost:9000/lakehouse/LH_{project_name}/Tables"
log_path = f"hdfs://localhost:9000/lakehouse/LH_{project_name}/Files/log"

In [None]:
executionDate = str(spark.sql("SELECT CURRENT_DATE()").collect()[0][0])

# Partition Execution Date
parse_execution = executionDate.split("-")
year = parse_execution[0]
month = parse_execution[1]
day = parse_execution[2]

## Metadata Table Action 

In [None]:
# Read metadata action
metadata_action = metadata.read_metadata_action("admin", "admin", "metadata", "config_table", \
                                                "CusDB -> Bronze")


# Define for log job
batch_run = hdfsUtils.check_batch_run(project_name, executionDate) - 1
start_time = ""
end_time = ""
error = ""
status = ""
source_row_read = 0
numInserted = 0
numUpdated = 0


from delta.tables import *


for metadata in metadata_action:

    # None df
    df = None

    task_id = metadata["task_id"]
    task_name = metadata["task_name"]
    source_connection = metadata["source_connection"]
    target_database = metadata["target_database"]
    source_table = metadata["source_table"].lower()
    target_table = metadata["target_table"]
    phase = metadata["phase"]

    # Start time for check
    start_time = spark.sql(''' SELECT CURRENT_TIMESTAMP() as current_time ''') \
                        .collect()[0]["current_time"].strftime('%Y-%m-%d %H:%M:%S')

    try:
        # New df path
        new_path_version = hdfsUtils.get_new_version(executionDate, project_name, source_table)

        df = spark.read.format("parquet").load(new_path_version)
        # df.show()
        deltaTablePath = f"{lakehouse_table_path}/{target_table}"

        if DeltaTable.isDeltaTable(spark, deltaTablePath):

            # Condition for Upsert
            mergeKeyExpr = " AND ".join(f"target.{col} = source.{col}" for col in df.columns) 

            # Upsert(Update existing, Insert new) data
            deltaTable = DeltaTable.forPath(spark, deltaTablePath)
            deltaTable.alias("target").merge(
                df.alias("source"),
                mergeKeyExpr
            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

            # Get information
            history = deltaTable.history(1).select("operationMetrics")
            operationMetrics = history.collect()[0]["operationMetrics"]

            source_row_read = df.count()
            numInserted = operationMetrics["numTargetRowsInserted"]
            numUpdated = operationMetrics["numTargetRowsUpdated"]
        else:
            loadHive.writeInit(df, spark, lakehouse_table_path, lakewarehouse_db, \
                               target_table, new_path_version)
            
            source_row_read = df.count()
            numInserted = df.count()
            numUpdated = 0

        # Load to delta
    except:
        error = traceback.format_exc()
        status = "Failed"

        print("Task ", task_id, " ", status)

    else:
        error = ""
        status = "Success"
        print("Task ", task_id, " ", status)


    df_log = logUtils.log_data(batch_run, task_name, source_connection, target_database, target_table,
                 start_time, end_time, source_row_read, numInserted, numUpdated, "", 
                 "", error, status, phase, t, spark)

    df_log.write.mode("append").format("parquet").save(f"{log_path}/{executionDate}/batch_{batch_run}/")