In [None]:
from pyspark.sql.functions import col, when, unix_timestamp, hour, to_date, row_number, trim
from pyspark.sql.window import Window
from spark_session_generator import create_spark_session
from app_constants import *
from logger_impl import set_logger, flush_logs_to_minio

ModuleNotFoundError: No module named 'scripts'

In [None]:
def load_crm_cust_info(spark):
    """Load the most recent record per customer from bronze into silver.

    Reads ``{ICEBERG_CATALOG}.{BRONZE_SCHEMA}.crm_cust_info``, drops rows whose
    ``cst_id`` is NULL, keeps for every ``cst_id`` the row with the latest
    ``cst_create_date``, cleans the descriptive columns and inserts the result
    into ``{ICEBERG_CATALOG}.{SILVER_SCHEMA}.crm_cust_info``.

    Cleaning rules:
      * first / last name are trimmed;
      * marital status ``S``/``M`` -> ``Single``/``Married``, anything else ``n/a``;
      * gender ``F``/``M`` -> ``Female``/``Male``, anything else ``n/a``;
      * codes are compared case-insensitively and ignoring surrounding
        whitespace;
      * ``dwh_create_date`` is set to the current timestamp.

    Args:
        spark: Active SparkSession used to read and write the tables.

    Raises:
        ValueError: If reading, transforming or writing fails. The original
            exception is attached as ``__cause__``.
    """
    TABLE_NAME = "crm_cust_info"
    # Logging is set up before the try block so that `logger` and `log_buffer`
    # are always bound when the except / finally clauses run.
    logger_details = set_logger(SILVER_SCHEMA, TABLE_NAME)
    logger = logger_details[0]
    log_buffer = logger_details[1]

    try:
        df = spark.table(f"{ICEBERG_CATALOG}.{BRONZE_SCHEMA}.{TABLE_NAME}")
        logger.info(f"Loaded table {TABLE_NAME} from {BRONZE_SCHEMA} layer")

        # Rows without a customer id are discarded first; the remaining rows are
        # ranked per customer, newest creation date first, and only rank 1 kept.
        window_spec = Window.partitionBy("cst_id").orderBy(col("cst_create_date").desc())
        ranked_cust_info_df = (
            df.filter(col("cst_id").isNotNull())
            .withColumn("row_num", row_number().over(window_spec))
            .filter(col("row_num") == 1)
            .drop("row_num")
        )
        ranked_cust_info_df.createOrReplaceTempView("ranked_crm_cust_info")
        logger.info("Filtered latest records for each customer")

        silver_cust_info_df = spark.sql("""SELECT cst_id,
                                        cst_key,
                                        TRIM(cst_firstname) AS cst_firstname,
                                        TRIM(cst_lastname) AS cst_lastname,
                                        CASE 
                                            WHEN UPPER(TRIM(cst_marital_status)) = 'S' THEN 'Single'
                                            WHEN UPPER(TRIM(cst_marital_status)) = 'M' THEN 'Married'
                                            ELSE 'n/a'
                                        END cst_marital_status,
                                        CASE 
                                            WHEN UPPER(TRIM(cst_gndr)) = 'F' THEN 'Female'
                                            WHEN UPPER(TRIM(cst_gndr)) = 'M' THEN 'Male'
                                            ELSE 'n/a'
                                        END cst_gndr,
                                        cst_create_date,
                                        CURRENT_TIMESTAMP() AS dwh_create_date
                                        FROM ranked_crm_cust_info
                                    """)
        logger.info(f"Transformed data for {SILVER_SCHEMA} layer")

        # insertInto resolves columns by position, so the SELECT order above
        # must match the column order of the silver table.
        silver_cust_info_df.write.format(TABLE_FORMAT) \
            .mode(INSERT_MODE) \
            .insertInto(f"{ICEBERG_CATALOG}.{SILVER_SCHEMA}.{TABLE_NAME}")

        logger.info(f"Data written to {SILVER_SCHEMA} layer table {TABLE_NAME}")

    except Exception as e:
        logger.error(f"Failed to load table {TABLE_NAME}: {e}")
        # Keep ValueError for existing callers, but carry the message and cause.
        raise ValueError(f"Failed to load table {TABLE_NAME}: {e}") from e

    finally:
        flush_logs_to_minio(logger, log_buffer, f"{SILVER_SCHEMA}_logs/{TABLE_NAME}.log")
        # NOTE(review): the standard logging.Logger has no shutdown() method;
        # this relies on set_logger returning an object that provides one - confirm.
        logger.shutdown()

In [None]:
def load_crm_prd_info(spark):
    """Transform bronze ``crm_prd_info`` and insert it into the silver layer.

    Derived columns:
      * ``cat_id``: first five characters of ``prd_key`` with ``-`` replaced
        by ``_``;
      * ``prd_key``: the remainder of the source ``prd_key`` from position 7;
      * ``prd_cost``: NULL replaced by 0;
      * ``prd_line``: ``M``/``R``/``S`` (trimmed, case-insensitive) mapped to
        ``Mountain``/``Road``/``Other Sales``, anything else ``n/a``;
      * ``prd_end_dt_new``: the next ``prd_start_dt`` within the same
        ``prd_key`` minus 1, NULL for the last row of each key;
      * ``dwh_create_date``: current timestamp.

    Args:
        spark: Active SparkSession used to read and write the tables.

    Raises:
        ValueError: If transforming or writing fails. The original exception
            is attached as ``__cause__``.
    """
    TABLE_NAME = "crm_prd_info"
    # Logging is set up before the try block so that `logger` and `log_buffer`
    # are always bound when the except / finally clauses run.
    logger_details = set_logger(SILVER_SCHEMA, TABLE_NAME)
    logger = logger_details[0]
    log_buffer = logger_details[1]

    try:
        logger.info(f"Loading table {TABLE_NAME} from {BRONZE_SCHEMA} layer")
        # NOTE(review): `LEAD(...) - 1` assumes prd_start_dt is a DATE so the
        # subtraction means "one day earlier" - confirm against the bronze schema.
        silver_prd_info_df = spark.sql(f"""SELECT prd_id,
                    REPLACE(SUBSTR(prd_key, 1, 5),'-', '_') AS cat_id,
                    SUBSTR(prd_key, 7, LENGTH(prd_key)) AS prd_key,
                    prd_nm,
                    COALESCE(prd_cost, 0) AS prd_cost,
                    CASE UPPER(TRIM(prd_line))
                        WHEN 'M' THEN 'Mountain'
                        WHEN 'R' THEN 'Road'
                        WHEN 'S' THEN 'Other Sales'
                        ELSE 'n/a'
                    END 
                    AS prd_line,
                    prd_start_dt,
                    LEAD(prd_start_dt) OVER (PARTITION BY prd_key ORDER BY prd_start_dt) - 1 AS prd_end_dt_new,
                    CURRENT_TIMESTAMP() AS dwh_create_date
                FROM {ICEBERG_CATALOG}.{BRONZE_SCHEMA}.{TABLE_NAME}
                """)

        logger.info(f"Transformed data for {SILVER_SCHEMA} layer")

        # insertInto resolves columns by position, so the SELECT order above
        # must match the column order of the silver table.
        silver_prd_info_df.write.format(TABLE_FORMAT) \
            .mode(INSERT_MODE) \
            .insertInto(f"{ICEBERG_CATALOG}.{SILVER_SCHEMA}.{TABLE_NAME}")

        logger.info(f"Data written to {SILVER_SCHEMA} layer table {TABLE_NAME}")

    except Exception as e:
        logger.error(f"Failed to load table {TABLE_NAME}: {e}")
        # Keep ValueError for existing callers, but carry the message and cause.
        raise ValueError(f"Failed to load table {TABLE_NAME}: {e}") from e

    finally:
        flush_logs_to_minio(logger, log_buffer, f"{SILVER_SCHEMA}_logs/{TABLE_NAME}.log")
        # NOTE(review): the standard logging.Logger has no shutdown() method;
        # this relies on set_logger returning an object that provides one - confirm.
        logger.shutdown()

In [None]:
def load_crm_sales_details(spark):
    """Transform bronze ``crm_sales_details`` and insert it into the silver layer.

    Cleaning rules:
      * ``sls_order_dt`` / ``sls_ship_dt`` / ``sls_due_dt``: a value of 0 or a
        value whose length is not 8 becomes NULL; otherwise it is parsed as a
        ``yyyyMMdd`` date;
      * ``sls_sales``: replaced by ``sls_quantity * ABS(sls_price)`` when it is
        NULL, not positive, or differs from that product;
      * ``sls_price``: when NULL or not positive, replaced by
        ``sls_sales / sls_quantity`` (NULL when the quantity is 0);
      * ``dwh_create_date``: current timestamp.

    Args:
        spark: Active SparkSession used to read and write the tables.

    Raises:
        ValueError: If transforming or writing fails. The original exception
            is attached as ``__cause__``.
    """
    TABLE_NAME = "crm_sales_details"
    # Logging is set up before the try block so that `logger` and `log_buffer`
    # are always bound when the except / finally clauses run.
    logger_details = set_logger(SILVER_SCHEMA, TABLE_NAME)
    logger = logger_details[0]
    log_buffer = logger_details[1]

    try:
        # NOTE(review): inside the sls_price CASE, `sls_sales` refers to the
        # bronze column, not the corrected value computed just above it -
        # confirm this is intended.
        # The statement carries no trailing ';' - spark.sql takes a single
        # statement and the other loaders do not terminate theirs either.
        silver_sales_details_df = spark.sql(f"""SELECT sls_ord_num,
                        sls_prd_key,
                        sls_cust_id,
                        CASE WHEN sls_order_dt = 0 OR LENGTH(sls_order_dt) != 8
                             THEN NULL
                             ELSE CAST(TO_DATE(CAST(sls_order_dt AS STRING), 'yyyyMMdd') AS DATE) 
                        END AS sls_order_dt,
                        CASE WHEN sls_ship_dt = 0 OR LENGTH(sls_ship_dt) != 8
                             THEN NULL
                             ELSE CAST(TO_DATE(CAST(sls_ship_dt AS STRING), 'yyyyMMdd') AS DATE) 
                        END AS sls_ship_dt,
                        CASE WHEN sls_due_dt = 0 OR LENGTH(sls_due_dt) != 8
                             THEN NULL
                             ELSE CAST(TO_DATE(CAST(sls_due_dt AS STRING), 'yyyyMMdd') AS DATE) 
                        END AS sls_due_dt,
                        CASE 
                            WHEN sls_sales IS NULL OR sls_sales <= 0 OR sls_sales != sls_quantity * ABS(sls_price)
                             THEN sls_quantity * ABS(sls_price)
                             ELSE sls_sales
                            END AS sls_sales,
                        sls_quantity,
                        CASE 
                            WHEN sls_price IS NULL OR sls_price <= 0 
                             THEN sls_sales / NULLIF(sls_quantity, 0)
                             ELSE sls_price
                        END AS sls_price,
                        CURRENT_TIMESTAMP() AS dwh_create_date
                    FROM {ICEBERG_CATALOG}.{BRONZE_SCHEMA}.{TABLE_NAME}
                  """)

        logger.info(f"Transformed data for {SILVER_SCHEMA} layer")

        # insertInto resolves columns by position, so the SELECT order above
        # must match the column order of the silver table.
        silver_sales_details_df.write.format(TABLE_FORMAT) \
            .mode(INSERT_MODE) \
            .insertInto(f"{ICEBERG_CATALOG}.{SILVER_SCHEMA}.{TABLE_NAME}")

        logger.info(f"Data written to {SILVER_SCHEMA} layer table {TABLE_NAME}")

    except Exception as e:
        logger.error(f"Failed to load table {TABLE_NAME}: {e}")
        # Keep ValueError for existing callers, but carry the message and cause.
        raise ValueError(f"Failed to load table {TABLE_NAME}: {e}") from e

    finally:
        flush_logs_to_minio(logger, log_buffer, f"{SILVER_SCHEMA}_logs/{TABLE_NAME}.log")
        # NOTE(review): the standard logging.Logger has no shutdown() method;
        # this relies on set_logger returning an object that provides one - confirm.
        logger.shutdown()

In [None]:
def load_erp_cust_az12(spark):
    """Transform bronze ``erp_cust_az12`` and insert it into the silver layer.

    Cleaning rules:
      * ``cst_id``: ``cid`` with a leading ``NAS`` prefix removed when present;
      * ``bdate``: dates later than the current date become NULL;
      * ``gen``: ``F``/``FEMALE`` -> ``Female`` and ``M``/``MALE`` -> ``Male``
        (trimmed, case-insensitive), anything else ``n/a``;
      * ``dwh_create_date``: current timestamp.

    Args:
        spark: Active SparkSession used to read and write the tables.

    Raises:
        ValueError: If transforming or writing fails. The original exception
            is attached as ``__cause__``.
    """
    TABLE_NAME = "erp_cust_az12"
    # Logging is set up before the try block so that `logger` and `log_buffer`
    # are always bound when the except / finally clauses run.
    logger_details = set_logger(SILVER_SCHEMA, TABLE_NAME)
    logger = logger_details[0]
    log_buffer = logger_details[1]

    try:
        silver_cust_az12_df = spark.sql(f"""SELECT 
                             CASE 
                                WHEN cid like 'NAS%' THEN SUBSTR(cid, 4,LENGTH(cid))
                                ELSE cid  
                             END AS cst_id,
                             CASE 
                                WHEN bdate > CURRENT_DATE() THEN NULL
                                ELSE bdate
                             END AS bdate,
                             CASE 
                                WHEN UPPER(TRIM(gen)) IN ('F', 'FEMALE') THEN 'Female'
                                WHEN UPPER(TRIM(gen)) IN ('M', 'MALE') THEN 'Male'
                                ELSE 'n/a'
                             END as gen,
                             CURRENT_TIMESTAMP() AS dwh_create_date
                            FROM {ICEBERG_CATALOG}.{BRONZE_SCHEMA}.{TABLE_NAME}
                  """)

        logger.info(f"Transformed data for {SILVER_SCHEMA} layer")

        # insertInto resolves columns by position, so the SELECT order above
        # must match the column order of the silver table.
        silver_cust_az12_df.write.format(TABLE_FORMAT) \
            .mode(INSERT_MODE) \
            .insertInto(f"{ICEBERG_CATALOG}.{SILVER_SCHEMA}.{TABLE_NAME}")

        logger.info(f"Data written to {SILVER_SCHEMA} layer table {TABLE_NAME}")

    except Exception as e:
        logger.error(f"Failed to load table {TABLE_NAME}: {e}")
        # Keep ValueError for existing callers, but carry the message and cause.
        raise ValueError(f"Failed to load table {TABLE_NAME}: {e}") from e

    finally:
        flush_logs_to_minio(logger, log_buffer, f"{SILVER_SCHEMA}_logs/{TABLE_NAME}.log")
        # NOTE(review): the standard logging.Logger has no shutdown() method;
        # this relies on set_logger returning an object that provides one - confirm.
        logger.shutdown()

In [None]:
def load_erp_loc_a101(spark):
    """Transform bronze ``erp_loc_a101`` and insert it into the silver layer.

    Cleaning rules:
      * ``cid``: every ``-`` is removed;
      * ``cntry`` (trimmed): ``DE`` -> ``Germany``; ``US``/``USA`` ->
        ``United States``; empty or NULL -> ``n/a``; any other value is kept
        in its trimmed form;
      * ``dwh_create_date``: current timestamp.

    Args:
        spark: Active SparkSession used to read and write the tables.

    Raises:
        ValueError: If transforming or writing fails. The original exception
            is attached as ``__cause__``.
    """
    TABLE_NAME = "erp_loc_a101"
    # Logging is set up before the try block so that `logger` and `log_buffer`
    # are always bound when the except / finally clauses run.
    logger_details = set_logger(SILVER_SCHEMA, TABLE_NAME)
    logger = logger_details[0]
    log_buffer = logger_details[1]

    try:
        silver_loc_a101_df = spark.sql(f"""SELECT 
                             REPLACE(cid, '-','') as cid,
                             CASE 
                                WHEN TRIM(cntry) = 'DE' THEN 'Germany'
                                WHEN TRIM(cntry) IN ('US', 'USA') THEN 'United States'
                                WHEN TRIM(cntry) = '' OR TRIM(cntry) IS NULL THEN 'n/a'
                                ELSE TRIM(cntry)
                             END AS cntry,
                             CURRENT_TIMESTAMP() AS dwh_create_date
                            FROM {ICEBERG_CATALOG}.{BRONZE_SCHEMA}.{TABLE_NAME}
                  """)

        logger.info(f"Transformed data for {SILVER_SCHEMA} layer")

        # insertInto resolves columns by position, so the SELECT order above
        # must match the column order of the silver table.
        silver_loc_a101_df.write.format(TABLE_FORMAT) \
            .mode(INSERT_MODE) \
            .insertInto(f"{ICEBERG_CATALOG}.{SILVER_SCHEMA}.{TABLE_NAME}")

        logger.info(f"Data written to {SILVER_SCHEMA} layer table {TABLE_NAME}")

    except Exception as e:
        logger.error(f"Failed to load table {TABLE_NAME}: {e}")
        # Keep ValueError for existing callers, but carry the message and cause.
        raise ValueError(f"Failed to load table {TABLE_NAME}: {e}") from e

    finally:
        flush_logs_to_minio(logger, log_buffer, f"{SILVER_SCHEMA}_logs/{TABLE_NAME}.log")
        # NOTE(review): the standard logging.Logger has no shutdown() method;
        # this relies on set_logger returning an object that provides one - confirm.
        logger.shutdown()

In [None]:
def load_erp_px_cat_g1v2(spark):
    """Copy bronze ``erp_px_cat_g1v2`` into the silver layer with a load timestamp.

    Selects ``id``, ``cat``, ``subcat`` and ``maintenance`` unchanged from
    ``{ICEBERG_CATALOG}.{BRONZE_SCHEMA}.erp_px_cat_g1v2``, adds
    ``dwh_create_date`` (current timestamp) and inserts the rows into
    ``{ICEBERG_CATALOG}.{SILVER_SCHEMA}.erp_px_cat_g1v2``.

    Args:
        spark: Active SparkSession used to read and write the tables.

    Raises:
        ValueError: If reading or writing fails. The original exception is
            attached as ``__cause__``.
    """
    TABLE_NAME = "erp_px_cat_g1v2"
    # Logging is set up before the try block so that `logger` and `log_buffer`
    # are always bound when the except / finally clauses run.
    logger_details = set_logger(SILVER_SCHEMA, TABLE_NAME)
    logger = logger_details[0]
    log_buffer = logger_details[1]

    try:
        # The source is fully qualified, like in the other loaders; an
        # unqualified name would be resolved against whatever catalog / schema
        # happens to be current in the session.
        silver_erp_px_cat_g1v2_df = spark.sql(f"""SELECT 
                             id,
                             cat,
                             subcat,
                             maintenance,
                             CURRENT_TIMESTAMP() AS dwh_create_date
                            FROM {ICEBERG_CATALOG}.{BRONZE_SCHEMA}.{TABLE_NAME}
                  """)

        logger.info(f"Transformed data for {SILVER_SCHEMA} layer")

        # insertInto resolves columns by position, so the SELECT order above
        # must match the column order of the silver table.
        silver_erp_px_cat_g1v2_df.write.format(TABLE_FORMAT) \
            .mode(INSERT_MODE) \
            .insertInto(f"{ICEBERG_CATALOG}.{SILVER_SCHEMA}.{TABLE_NAME}")

        logger.info(f"Data written to {SILVER_SCHEMA} layer table {TABLE_NAME}")

    except Exception as e:
        logger.error(f"Failed to load table {TABLE_NAME}: {e}")
        # Keep ValueError for existing callers, but carry the message and cause.
        raise ValueError(f"Failed to load table {TABLE_NAME}: {e}") from e

    finally:
        flush_logs_to_minio(logger, log_buffer, f"{SILVER_SCHEMA}_logs/{TABLE_NAME}.log")
        # NOTE(review): the standard logging.Logger has no shutdown() method;
        # this relies on set_logger returning an object that provides one - confirm.
        logger.shutdown()

In [None]:
def main():
    """Create a Spark session and run every silver-layer loader in order.

    The CRM tables are loaded first, followed by the ERP tables. Each loader
    receives the same session; an exception from any loader stops the run.
    """
    session = create_spark_session()

    # Order matches the sequence in which the tables are to be loaded.
    loaders = (
        load_crm_cust_info,
        load_crm_prd_info,
        load_crm_sales_details,
        load_erp_cust_az12,
        load_erp_loc_a101,
        load_erp_px_cat_g1v2,
    )
    for run_loader in loaders:
        run_loader(session)