In [1]:
import findspark
# Point findspark at the local Spark installation before pyspark is imported
# in the next cell.
# NOTE(review): absolute, machine-specific path — consider deriving it from
# the SPARK_HOME environment variable so the notebook runs on other machines.
findspark.init(spark_home = "/home/thanhphat/BigData/spark-3.5.0-bin-hadoop3")

In [2]:
from pyspark.sql import SparkSession

import traceback
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Project identifier: used to build the lakehouse database name (LH_<project>),
# the HDFS warehouse/table/log paths, and is passed to the HDFS helper calls.
project_name = "Global_Electronics_Retailer"

In [3]:
# Session configuration
#   - local master with 4 worker threads
#   - Hive-backed catalog with the Delta Lake extension and catalog
#   - 100 shuffle partitions, 2g driver memory
#   - executor settings: 2 instances, 2 cores and 2g memory each
# NOTE(review): with master "local[4]" everything presumably runs inside the
# driver JVM, so the spark.executor.* values may have no effect — confirm.
# NOTE(review): "spark.sql.parquet.vorder.enabled" looks like a Microsoft
# Fabric setting; presumably ignored by open-source Spark — confirm.

# Build (or reuse) the SparkSession
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Bronze_to_Silver")
    .config("spark.sql.warehouse.dir", f"hdfs://localhost:9000/lakehouse/warehouse/LH_{project_name}")
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.parquet.vorder.enabled", "true")
    .config("spark.sql.shuffle.partitions", 100)
    .config("spark.driver.memory", "2g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Show the effective configuration as the cell output
spark.sparkContext.getConf().getAll()

24/06/29 12:09:24 WARN Utils: Your hostname, thanhphat-inspiron-5406-2n1 resolves to a loopback address: 127.0.1.1; using 192.168.1.8 instead (on interface wlp0s20f3)
24/06/29 12:09:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/06/29 12:09:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


24/06/29 12:09:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/06/29 12:09:29 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


[('spark.app.name', 'Bronze_to_Silver'),
 ('spark.executor.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.executor.memory', '2g'),
 ('spark.master', 'local[4]'),
 ('spark.driver.host', '192.168.1.8'),
 ('spar

In [4]:
import sys
sys.path.append("/home/thanhphat/PersonalProject/Global_Electronics_Retailer/source")

from modules.Extraction import *
from modules.HDFSUtils import *
from modules.LogUtils import *
from modules.Metadata import *
from modules.Load import *


# One shared instance of each project helper class
extraction, hdfsUtils, logUtils = Extraction(), HDFSUtils(), LogUtils()
metadata, loadHive = Metadata(), Load()

# Lakehouse locations, all derived from the project name:
#   database name, Delta table root, and job-log folder on HDFS
lakewarehouse_db = f"LH_{project_name}"
lakehouse_table_path = f"hdfs://localhost:9000/lakehouse/LH_{project_name}/Tables"
log_path = f"hdfs://localhost:9000/lakehouse/LH_{project_name}/Files/log"

In [5]:
# Current date as reported by the Spark session, converted to a string
executionDate = str(spark.sql("SELECT CURRENT_DATE()").collect()[0][0])

# Split the date on "-" into its year / month / day parts (used for partitioning)
parse_execution = executionDate.split("-")
year, month, day = parse_execution[0], parse_execution[1], parse_execution[2]

[Stage 0:>                                                          (0 + 0) / 1]

[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

## Metadata Table Action 

In [6]:
# Bronze -> Silver load.
# For every task described in the Airflow Variable `metadata_action`:
#   1. read the newest parquet version of the source folder,
#   2. normalise column names / types,
#   3. write the result as a table through `loadHive.writeInit`,
#   4. append one log row (success or failure) to the batch log folder.
from airflow.models import Variable
from delta.tables import *


def _current_timestamp_str():
    """Return Spark's CURRENT_TIMESTAMP() formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return spark.sql(''' SELECT CURRENT_TIMESTAMP() as current_time ''') \
                .collect()[0]["current_time"].strftime('%Y-%m-%d %H:%M:%S')


# Column renames per source folder (removes the spaces from the raw names).
COLUMN_RENAMES = {
    # Task 6, Customers
    "customers": {
        "State Code": "StateCode",
        "Zip Code": "ZipCode",
    },
    # Task 7, Stores
    "stores": {
        "Square Meters": "SquareMeters",
        "Open Date": "OpenDate",
    },
    # Task 8, Products
    "products": {
        "Product Name": "ProductName",
        "Unit Cost USD": "Unit_Cost_USD",
        "Unit Price USD": "Unit_Price_USD",
    },
    # Task 9, Sales
    "sales": {
        "Order Number": "OrderNumber",
        "Line Item": "LineItem",
        "Order Date": "OrderDate",
        "Delivery Date": "DeliveryDate",
        "Currency Code": "CurrencyCode",
    },
}


# Task metadata comes from the Airflow Variable `metadata_action`
# (the direct read through `metadata.read_metadata_action` is disabled).
metadata_action = Variable.get(key = "metadata_action", deserialize_json = True, default_var = None)
if metadata_action is None:
    # Fail with a clear message instead of "TypeError: 'NoneType' object is not iterable"
    raise ValueError("Airflow Variable 'metadata_action' is not set; no tasks to run")


# Define for log job
batch_run = hdfsUtils.check_batch_run(project_name, executionDate) - 1
start_time = ""
end_time = ""
error = ""
status = ""
source_row_read = 0
numInserted = 0
numUpdated = 0


# NOTE: the loop variable is deliberately not called `metadata`, so the
# `Metadata()` instance created earlier in the notebook is not overwritten.
for task_meta in metadata_action:

    # Reset per-task state so a failed task never logs the previous task's values
    df = None
    end_time = ""
    error = ""
    status = ""
    source_row_read = 0
    numInserted = 0
    numUpdated = 0

    task_id = task_meta["task_id"]
    task_name = task_meta["task_name"]
    source_connection = task_meta["source_connection"]
    target_database = task_meta["target_database"]
    source_folder = task_meta["source_folder"].lower()
    target_table = task_meta["target_table"].lower()
    phase = task_meta["phase"]

    # Start time for check
    start_time = _current_timestamp_str()

    try:
        # Newest version of the source data for this execution date
        new_path_version = hdfsUtils.get_new_version(executionDate, project_name, source_folder)

        df = spark.read.format("parquet").load(new_path_version)

        # Transformation: rename columns for the known source folders
        for old_name, new_name in COLUMN_RENAMES.get(source_folder, {}).items():
            df = df.withColumnRenamed(old_name, new_name)

        # Task 10, Exchange_Rates: the rate column is cast to float
        if source_folder == "exchange_rates":
            df = df.withColumn("Exchange", f.col("Exchange").cast("Float"))

        # Only needed by the (currently disabled) upsert path, see TODO below
        deltaTablePath = f"{lakehouse_table_path}/{target_table}"

        loadHive.writeInit(df, spark, lakehouse_table_path, lakewarehouse_db, target_table)

        # Count once: every count() launches a separate Spark job
        row_count = df.count()
        source_row_read = row_count
        numInserted = row_count
        numUpdated = 0

        # TODO: incremental load is disabled. The intended logic was: when
        # DeltaTable.isDeltaTable(spark, deltaTablePath) is true, merge `df`
        # into the existing table (update matched rows, insert new ones) and
        # take numInserted / numUpdated from the table history's
        # operationMetrics; otherwise fall back to loadHive.writeInit as above.

    except Exception:
        # Record the failure and continue with the next task; KeyboardInterrupt
        # and SystemExit are intentionally not caught.
        error = traceback.format_exc()
        status = "Failed"

    else:
        error = ""
        status = "Success"

    # End time is captured for both outcomes so the log row has a duration
    end_time = _current_timestamp_str()

    print("Task ", task_id, " ", status)

    df_log = logUtils.log_data(batch_run, task_name, source_connection, target_database, f"parquet_{source_folder}",
                 target_table, start_time, end_time, source_row_read, numInserted, numUpdated, "", 
                 "", error, status, phase, t, spark)

    df_log.write.mode("append").format("parquet").save(f"{log_path}/{executionDate}/batch_{batch_run}/")

[Stage 2:>                                                          (0 + 1) / 1]

                                                                                

[Stage 3:>                                                          (0 + 1) / 1]

                                                                                

24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.metastore.wm.default.pool.size does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.llap.task.scheduler.preempt.independent does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.llap.output.format.arrow does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.tez.llap.min.reducer.per.executor does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.arrow.root.allocator.limit does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.vectorized.use.checked.expressions does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.tez.dynamic.semijoin.reduction.for.mapjoin does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.vectorized.complex.types.enabled does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.server2.wm.worker.threads does not exist
24/06/29 12:10:35 WARN HiveConf: HiveConf of name hive.repl.partitions.dump.

24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.metastore.wm.default.pool.size does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.llap.task.scheduler.preempt.independent does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.llap.output.format.arrow does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.tez.llap.min.reducer.per.executor does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.arrow.root.allocator.limit does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.vectorized.use.checked.expressions does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.tez.dynamic.semijoin.reduction.for.mapjoin does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.vectorized.complex.types.enabled does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.server2.wm.worker.threads does not exist
24/06/29 12:10:38 WARN HiveConf: HiveConf of name hive.repl.partitions.dump.

Sat Jun 29 12:10:39 ICT 2024 Thread[Thread-4,5,main] java.io.FileNotFoundException: derby.log (Permission denied)
24/06/29 12:10:40 WARN DriverDataSource: Registered driver with driverClassName=org.apache.derby.jdbc.EmbeddedDriver was not found, trying direct instantiation.


----------------------------------------------------------------
Sat Jun 29 12:10:40 ICT 2024:
Booting Derby version The Apache Software Foundation - Apache Derby - 10.14.2.0 - (1828579): instance a816c00e-0190-6267-1031-000012132550 
on database directory /home/thanhphat/BigData/apache-hive-3.1.3-bin/bin/metastore_db with class loader sun.misc.Launcher$AppClassLoader@64a294a6 
Loaded from file:/home/thanhphat/BigData/spark-3.5.0-bin-hadoop3/jars/derby-10.14.2.0.jar
java.vendor=Private Build
java.runtime.version=1.8.0_412-8u412-ga-1~22.04.1-b08
user.dir=/
os.name=Linux
os.arch=amd64
os.version=6.5.0-35-generic
derby.system.home=null


Database Class Loader started - derby.database.classpath=''


24/06/29 12:10:43 WARN DriverDataSource: Registered driver with driverClassName=org.apache.derby.jdbc.EmbeddedDriver was not found, trying direct instantiation.


24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.metastore.wm.default.pool.size does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.llap.task.scheduler.preempt.independent does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.llap.output.format.arrow does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.tez.llap.min.reducer.per.executor does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.arrow.root.allocator.limit does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.vectorized.use.checked.expressions does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.tez.dynamic.semijoin.reduction.for.mapjoin does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.vectorized.complex.types.enabled does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.server2.wm.worker.threads does not exist
24/06/29 12:10:44 WARN HiveConf: HiveConf of name hive.repl.partitions.dump.

24/06/29 12:10:53 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


24/06/29 12:10:55 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`lh_global_electronics_retailer`.`silver_products` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


24/06/29 12:10:56 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.metastore.wm.default.pool.size does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.llap.task.scheduler.preempt.independent does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.llap.output.format.arrow does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.tez.llap.min.reducer.per.executor does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.arrow.root.allocator.limit does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.vectorized.use.checked.expressions does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.tez.dynamic.semijoin.reduction.for.mapjoin does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.vectorized.complex.types.enabled does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.server2.wm.worker.threads does not exist
24/06/29 12:10:57 WARN HiveConf: HiveConf of name hive.repl.partitions.dump.

[Stage 5:>                                                          (0 + 1) / 1]

                                                                                

Task  8   Success


[Stage 11:>                                                         (0 + 4) / 4]





                                                                                

[Stage 14:>                                                         (0 + 1) / 1]

                                                                                

24/06/29 12:13:59 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`lh_global_electronics_retailer`.`silver_customer` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


Task  6   Success


[Stage 22:>                                                         (0 + 4) / 4]



                                                                                

[Stage 25:>                                                         (0 + 1) / 1]

                                                                                

24/06/29 12:15:16 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`lh_global_electronics_retailer`.`silver_stores` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


Task  7   Success


[Stage 36:>                                                         (0 + 1) / 1]

                                                                                

24/06/29 12:15:30 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`lh_global_electronics_retailer`.`silver_exchange_rates` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


Task  10   Success






                                                                                

[Stage 47:>                                                         (0 + 1) / 1]                                                                                

24/06/29 12:16:34 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`lh_global_electronics_retailer`.`silver_sales` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


Task  9   Success




                                                                                