In [None]:
import datetime
import logging
import os
from io import StringIO, BytesIO

import pyspark
from minio import Minio
from minio.error import S3Error
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, unix_timestamp, hour, to_date, row_number, trim
from pyspark.sql.window import Window

In [None]:

# Hive metastore (Thrift) endpoint backing the Iceberg catalog configured below.
HIVE_URI = 'thrift://hive-metastore:9083'

# MinIO credentials are taken from the environment when available so that secrets
# do not have to live in the notebook.
# NOTE(review): the literal fallbacks are the keys that used to be hardcoded here;
# they keep existing runs working but should be rotated and then removed.
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY', 'FrmF5fXO0bxpBepjVUSX')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY', '5RgPTIToiUPa16HAWnRv3KcsE7y21Oo3RPA3QXTb')

# host:port without a scheme; the s3a endpoint below prefixes it with http://.
MINIO_ENDPOINT = "minio:9000"
MINIO_BUCKET = "logs"


# Start the Spark session with Iceberg as the default (spark_catalog) catalog,
# backed by the Hive metastore and an S3-compatible MinIO warehouse.
# Note: getOrCreate() returns an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("IcebergAsDefaultCatalog")
    .config('spark.jars.packages',
            'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,'
            'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,'
            'software.amazon.awssdk:bundle:2.17.178,'
            'software.amazon.awssdk:url-connection-client:2.17.178')
    .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    # Reuse the constants above instead of repeating the literals.
    .config("spark.sql.catalog.spark_catalog.uri", HIVE_URI)
    .config("spark.sql.catalog.spark_catalog.warehouse", "s3a://warehouse/")
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{MINIO_ENDPOINT}")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.hive.metastore.sasl.enabled", "false")
    .getOrCreate()
)

print("Spark Running")


In [None]:
# Runtime session setting for Iceberg. Judging by its name it lets Spark/Iceberg
# work with timestamp-without-timezone columns — TODO confirm against the
# Iceberg Spark configuration docs for the runtime version in use.
spark.conf.set("spark.sql.iceberg.handle-timestamp-without-timezone", "true")

In [None]:
# In-memory buffer that accumulates formatted log records until they are
# uploaded by flush_logs_to_minio().
log_buffer = StringIO()

# Handler that writes formatted records into the buffer above.
log_handler = logging.StreamHandler(log_buffer)
log_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))

logger = logging.getLogger("SilverLoader")
logger.setLevel(logging.INFO)

# logging.getLogger() returns the same logger object on every call, so re-running
# this cell would otherwise stack another pair of handlers: every record would be
# emitted several times and still be written to the previous, orphaned buffer.
logger.handlers.clear()

logger.addHandler(log_handler)
logger.addHandler(logging.StreamHandler())  # Optional: also echo records to the console (stderr)

In [None]:
# Upload logs from buffer to MinIO
def flush_logs_to_minio(object_name):
    """Upload the buffered log text to MinIO and reset the buffer.

    Args:
        object_name: Object key to write inside ``MINIO_BUCKET``. An existing
            object with the same key is replaced by the upload.

    The bucket is created first when it does not exist. After a successful
    upload the in-memory ``log_buffer`` is emptied; when the upload fails the
    buffer is left untouched and the failure is logged instead of raised, so
    that callers invoking this from a ``finally`` block are not interrupted.
    """
    try:
        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=False  # Set to True if using HTTPS
        )
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        content = log_buffer.getvalue().encode("utf-8")
        client.put_object(
            MINIO_BUCKET,
            object_name,
            data=BytesIO(content),
            length=len(content),  # byte length of the encoded payload, not the character count
            content_type='text/plain'
        )
        # This confirmation is also written to the buffer, but the truncate below
        # discards it, so it effectively only shows up on the console handler.
        logger.info(f"Uploaded log to MinIO at {MINIO_BUCKET}/{object_name}")

        # Clear the buffer after upload
        log_buffer.truncate(0)
        log_buffer.seek(0)

    except S3Error as e:
        logger.error(f"Failed to upload log: {e}")
    except Exception as e:
        # Errors that are not S3Error (for example connection failures raised by
        # the underlying HTTP client) must not escape: log shipping is
        # best-effort and should never crash the load that called it.
        logger.error(f"Unexpected error while uploading log: {e}")
In [18]:
def load_gold_customers():
    """Build the customer dimension from the silver layer and write it to gold.

    Reads ``silver.crm_cust_info``, ``silver.erp_cust_az12`` and
    ``silver.erp_loc_a101``, exposes them as temp views, left-joins the two ERP
    tables onto the CRM customers by customer key, and writes the result to
    ``spark_catalog.gold.dim_customers`` in overwrite mode.

    Any exception raised while loading is caught and logged (with traceback)
    rather than propagated. The buffered log is uploaded to MinIO in all cases.
    """
    try:
        # Register the silver inputs under the view names used by the SQL below.
        spark.table("spark_catalog.silver.crm_cust_info") \
            .createOrReplaceTempView("silver_crm_cust_info")
        spark.table("spark_catalog.silver.erp_cust_az12") \
            .createOrReplaceTempView("silver_erp_cust_info")
        spark.table("spark_catalog.silver.erp_loc_a101") \
            .createOrReplaceTempView("silver_erp_cust_loc")

        # customer_key is a surrogate key generated by ROW_NUMBER over cst_id; the
        # window has no PARTITION BY, so Spark moves all rows to a single partition.
        # Gender prefers the CRM value and falls back to the ERP value only when
        # CRM holds 'n/a'.
        gold_customer = spark.sql("""
                                  SELECT
                                  ROW_NUMBER() OVER (ORDER BY ci.cst_id) AS customer_key,
                                  ci.cst_id AS customer_id,
                                  ci.cst_key AS customer_number,
                                  ci.cst_firstname AS first_name,
                                  ci.cst_lastname AS last_name,
                                  la.cntry AS country,
                                  ci.cst_marital_status AS marital_status,
                                  CASE 
                                    WHEN ci.cst_gndr = 'n/a' THEN COALESCE(ca.gen, 'n/a')
                                    ELSE ci.cst_gndr 
                                  END AS gender,
                                  ca.bdate AS birthdate,
                                  ci.cst_create_date AS create_date,
                                  CURRENT_TIMESTAMP() AS dwh_load_date
                                  FROM silver_crm_cust_info ci
                                  LEFT JOIN silver_erp_cust_info ca
                                    ON ci.cst_key = ca.cid
                                  LEFT JOIN silver_erp_cust_loc la
                                  ON ci.cst_key = la.cid
                                  """
                            )

        # insertInto resolves columns by position (not by name) and needs the
        # target table to exist, so the SELECT order above must match the column
        # order of gold.dim_customers.
        gold_customer.write.format("iceberg") \
            .mode("overwrite") \
            .insertInto("spark_catalog.gold.dim_customers")

        logger.info("Data written to gold layer table customers")

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback, which
        # the plain message alone would lose.
        logger.exception(f"Error in load_gold_customers: {e}")
    finally:
        flush_logs_to_minio("gold_logs/gold_customer.log")

In [None]:
# Run the customer-dimension load. Errors raised while loading are caught and
# logged inside the function, so check the log output to confirm success.
load_gold_customers()

In [23]:
def load_gold_products():
    """Build the product dimension from the silver layer and write it to gold.

    Reads ``silver.crm_prd_info`` (only rows whose ``prd_end_dt`` is null) and
    ``silver.erp_px_cat_g1v2``, exposes them as temp views, left-joins the
    category table onto the products by category id, and writes the result to
    ``spark_catalog.gold.dim_products`` in overwrite mode.

    Any exception raised while loading is caught and logged (with traceback)
    rather than propagated. The buffered log is uploaded to MinIO in all cases.
    """
    try:
        # The temp view itself only contains rows with a null prd_end_dt; the SQL
        # below repeats the same predicate, which is redundant but harmless.
        spark.table("spark_catalog.silver.crm_prd_info") \
            .filter("prd_end_dt is null") \
            .createOrReplaceTempView("silver_crm_prd_info")
        spark.table("spark_catalog.silver.erp_px_cat_g1v2") \
            .createOrReplaceTempView("silver_erp_px_cat_g1v2")

        # product_key is a surrogate key generated by ROW_NUMBER over
        # (prd_start_dt, prd_key); the window has no PARTITION BY, which is what
        # triggers Spark's "No Partition Defined for Window operation" warning.
        gold_products = spark.sql("""
                                    SELECT 
                                    ROW_NUMBER() OVER (ORDER BY pn.prd_start_dt, pn.prd_key) AS product_key,
                                  pn.prd_id AS product_id,
                                  pn.prd_key AS product_number,
                                  pn.prd_nm AS product_name,
                                  pn.cat_id AS category_id,
                                  pc.cat AS category,
                                  pc.subcat AS subcategory,
                                  pc.maintenance,
                                  pn.prd_cost AS cost,
                                  pn.prd_line AS product_line,
                                  pn.prd_start_dt AS start_date,
                                  CURRENT_TIMESTAMP() AS dwh_load_date
                                  FROM silver_crm_prd_info pn
                                  LEFT JOIN silver_erp_px_cat_g1v2 pc
                                    ON pn.cat_id = pc.id
                                  WHERE pn.prd_end_dt IS NULL                                 
                                  """
                            )

        # insertInto resolves columns by position (not by name) and needs the
        # target table to exist, so the SELECT order above must match the column
        # order of gold.dim_products.
        gold_products.write.format("iceberg") \
            .mode("overwrite") \
            .insertInto("spark_catalog.gold.dim_products")

        logger.info("Data written to gold layer table products")

    except Exception as e:
        # Name this function in the message (it used to say load_gold_customers)
        # and keep the traceback via logger.exception.
        logger.exception(f"Error in load_gold_products: {e}")
    finally:
        flush_logs_to_minio("gold_logs/gold_products.log")

In [24]:
# Run the product-dimension load. Errors raised while loading are caught and
# logged inside the function, so check the log output to confirm success.
load_gold_products()

25/07/07 03:50:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/07 03:50:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/07 03:50:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/07 03:50:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/07/07 03:50:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Data written to gold layer table products
Uploaded log to MinIO at logs/gold_logs/gold_products.log
