In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, unix_timestamp, hour, to_date, row_number, trim
import logging
from io import StringIO, BytesIO
from minio import Minio
from minio.error import S3Error
import datetime
from pyspark.sql.window import Window

In [7]:

HIVE_URI = 'thrift://hive-metastore:9083'
MINIO_ACCESS_KEY = 'FrmF5fXO0bxpBepjVUSX'
MINIO_SECRET_KEY = '5RgPTIToiUPa16HAWnRv3KcsE7y21Oo3RPA3QXTb'
MINIO_ENDPOINT = "minio:9000"
MINIO_BUCKET = "logs"


spark = SparkSession.builder \
    .appName("IcebergAsDefaultCatalog") \
    .config('spark.jars.packages', 
            'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,'
            'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,'
            'software.amazon.awssdk:bundle:2.17.178,'
            'software.amazon.awssdk:url-connection-client:2.17.178') \
    .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions') \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.spark_catalog.uri", "thrift://hive-metastore:9083") \
    .config("spark.sql.catalog.spark_catalog.warehouse", "s3a://warehouse/") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY) \
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY) \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.hive.metastore.sasl.enabled", "false") \
    .getOrCreate()

## Start Spark Session
print("Spark Running")


Spark Running


In [8]:
spark.conf.set("spark.sql.iceberg.handle-timestamp-without-timezone", "true")

In [9]:
# In-memory log buffer
log_buffer = StringIO()

# Configure logger
log_handler = logging.StreamHandler(log_buffer)
log_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))

logger = logging.getLogger("SilverLoader")
logger.setLevel(logging.INFO)
logger.addHandler(log_handler)
logger.addHandler(logging.StreamHandler())  # Optional: Also log to stdout

In [10]:
# Upload logs from buffer to MinIO
def flush_logs_to_minio(object_name):
    try:
        client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=False  # Set to True if using HTTPS
        )
        if not client.bucket_exists(MINIO_BUCKET):
            client.make_bucket(MINIO_BUCKET)

        content = log_buffer.getvalue().encode("utf-8")
        client.put_object(
            MINIO_BUCKET,
            object_name,
            data=BytesIO(content),
            length=len(content),
            content_type='text/plain'
        )
        logger.info(f"Uploaded log to MinIO at {MINIO_BUCKET}/{object_name}")
        
        # Clear the buffer after upload
        log_buffer.truncate(0)
        log_buffer.seek(0)
        
    except S3Error as e:
        logger.error(f"Failed to upload log: {e}")

In [None]:
def load_gold_customers():
    
    try:
        crm_cust_df = spark.table("spark_catalog.silver.crm_cust_info")
        crm_cust_df.createOrReplaceTempView("silver_crm_cust_info")

        erp_cust_df = spark.table("spark_catalog.silver.erp_cust_az12")
        erp_cust_df.createOrReplaceTempView("silver_erp_cust_info")

        erp_cust_loc_df = spark.table("spark_catalog.silver.erp_loc_a101")
        erp_cust_loc_df.createOrReplaceTempView("silver_erp_cust_loc")


        gold_customer = spark.sql("""
                                  SELECT
                                  ROW_NUMBER() OVER (ORDER BY ci.cst_id) AS customer_key,
                                  ci.cst_id AS customer_id,
                                  ci.cst_key AS customer_number,
                                  ci.cst_firstname AS first_name,
                                  ci.cst_lastname AS last_name,
                                  la.cntry AS country,
                                  ci.cst_marital_status AS marital_status,
                                  CASE 
                                    WHEN ci.cst_gndr = 'n/a' THEN COALESCE(ca.gen, 'n/a')
                                    ELSE ci.cst_gndr 
                                  END AS gender,
                                  ca.bdate AS birthdate,
                                  ci.cst_create_date AS create_date,
                                  FROM silver_crm_cust_info ci
                                  LEFT JOIN silver_erp_cust_info ca
                                    ON ci.cst_key = ca.cid
                                  LEFT JOIN silver_erp_cust_loc la
                                  ON ci.cst_key = la.cid
                                  """
                            )
        
        gold_customer.show()

        gold_customer.write.format("iceberg") \
            .mode("overwrite") \
            .insertInto(f"spark_catalog.gold.dim_customers")
    except Exception as e:
        logger.error(f"Error in load_gold_customers: {e}")
    finally:
        flush_logs_to_minio(f"gold_logs/gold_customer.log")

In [21]:
load_gold_customers()

Uploaded log to MinIO at logs/gold_logs/gold_customer.log
Uploaded log to MinIO at logs/gold_logs/gold_customer.log


+------+----------+-------------+------------+------------------+--------+---------------+
|cst_id|   cst_key|cst_firstname|cst_lastname|cst_marital_status|cst_gndr|cst_create_date|
+------+----------+-------------+------------+------------------+--------+---------------+
| 11000|AW00011000|          Jon|        Yang|           Married|    Male|     2025-10-06|
| 11001|AW00011001|       Eugene|       Huang|            Single|    Male|     2025-10-06|
| 11002|AW00011002|        Ruben|      Torres|           Married|    Male|     2025-10-06|
| 11003|AW00011003|      Christy|         Zhu|            Single|  Female|     2025-10-06|
| 11004|AW00011004|    Elizabeth|     Johnson|            Single|  Female|     2025-10-06|
| 11005|AW00011005|        Julio|        Ruiz|            Single|    Male|     2025-10-06|
| 11006|AW00011006|        Janet|     Alvarez|            Single|  Female|     2025-10-06|
| 11007|AW00011007|        Marco|       Mehta|           Married|    Male|     2025-10-06|