In [0]:
pip install azure-storage-blob azure-storage-file-datalake

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, expr, concat_ws, col
from azure.storage.blob import BlobServiceClient

# ----------------------------------------------------------------------------------
# 1. SPARK CONFIGURATION & AZURE STORAGE AUTHENTICATION
# ----------------------------------------------------------------------------------

# Azure Storage account details (using SAS Token)
STORAGE_ACCOUNT_NAME = "cdmo"
SAS_TOKEN = "sv=2022-11-02&ss=bfqt&srt=sco&sp=rwdlacupyx&se=2025-02-01T13:22:40Z&st=2025-01-31T05:22:40Z&spr=https&sig=eNjMZTrl03xT4e2cf5nA2fmHglRbbQaFYgTnqWaECF4%3D"  # Replace with your SAS token
CONTAINERS = ["02-silver", "03-gold", "config"]
METADATA_CONTAINER = "config"
METADATA_FILE_NAME = "metadata_config_20250130.csv"

# Initialize Spark Session
spark = SparkSession.builder.appName("GoldLayerProcessing").getOrCreate()

# Configure Spark to use SAS Token for each relevant container
for container in CONTAINERS:
    spark.conf.set(f"fs.azure.sas.{container}.{STORAGE_ACCOUNT_NAME}.blob.core.windows.net", SAS_TOKEN)

# Initialize BlobServiceClient using SAS Token
blob_service_client = BlobServiceClient(
    account_url=f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net",
    credential=SAS_TOKEN
)

# ----------------------------------------------------------------------------------
# 2. LOAD METADATA FILE FROM BLOB STORAGE (wasbs://)
# ----------------------------------------------------------------------------------

def load_metadata():
    """Loads metadata file from Azure Blob Storage and filters for Gold layer processing."""
    metadata_path = f"wasbs://{METADATA_CONTAINER}@{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{METADATA_FILE_NAME}"
    try:
        print(f"🔄 Reading metadata file from {metadata_path}...")
        metadata_df = spark.read.format("csv").option("header", "true").load(metadata_path).fillna("")
        
        # Convert to list of dictionaries for faster lookup
        metadata_list = metadata_df.toPandas().to_dict(orient="records")

        # Filter for 'Gold' layer datasets
        metadata_list = [row for row in metadata_list if row.get("Layer", "").strip().lower() == "gold"]
        
        print(f"📌 Filtered metadata for 'Gold' layer: {len(metadata_list)} entries found.")
        return metadata_list

    except Exception as e:
        print(f"❌ Error loading metadata file: {e}")
        return []

# ----------------------------------------------------------------------------------
# 3. APPLY GOLD LAYER TRANSFORMATIONS
# ----------------------------------------------------------------------------------

TRANSFORMATION_RULES = {
    "customerfeedback": {"column": "Sentiment", "rule": "CASE WHEN Rating >= 4 THEN 'Positive' ELSE 'Negative' END"},
    "manufacturebatch": {"column": "BatchStatus", "rule": "CASE WHEN Status = 'Completed' THEN 'Closed' ELSE 'Open' END"},
    "productformula": {"column": "PrimaryIngredientList", "rule": "concat_ws(', ', PrimaryIngredients)"},
    "sales": {"column": "TotalRevenue", "rule": "Quantity * TotalAmount"},
    "supplier": {"column": "SupplierDetails", "rule": "concat_ws(' - ', SupplierName, Material)"},
}

def transform_gold_layer(df, dataset_name):
    """Applies Gold layer transformations dynamically based on dataset rules."""
    df = df.withColumn("ProcessedTimestamp", current_timestamp())

    if dataset_name in TRANSFORMATION_RULES:
        rule = TRANSFORMATION_RULES[dataset_name]
        df = df.withColumn(rule["column"], expr(rule["rule"]))

    print(f"✅ Applied Gold layer transformations for: {dataset_name}")
    return df

# ----------------------------------------------------------------------------------
# 4. PERFORM DELTA MERGE INTO GOLD LAYER
# ----------------------------------------------------------------------------------

def merge_into_gold_layer(source_df, target_path, unique_key):
    """Merges source data into Gold layer Delta table based on unique key."""
    try:
        if spark._jsparkSession.catalog().tableExists(f"delta.`{target_path}`"):
            target_df = spark.read.format("delta").load(target_path)
        else:
            print(f"🚀 Writing new table at {target_path}")
            source_df.write.format("delta").mode("overwrite").save(target_path)
            return

        # Register DataFrames as temporary views
        source_df.createOrReplaceTempView("source_temp_view")
        target_df.createOrReplaceTempView("target_temp_view")

        merge_query = f"""
        MERGE INTO delta.`{target_path}` AS target
        USING source_temp_view AS source
        ON target.{unique_key} = source.{unique_key}
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
        spark.sql(merge_query)
        print(f"✅ Data merged successfully into {target_path}")

    except Exception as e:
        print(f"❌ Error during Delta Merge: {e}")

# ----------------------------------------------------------------------------------
# 5. PROCESS DATA BASED ON METADATA
# ----------------------------------------------------------------------------------

def process_data(metadata_list):
    """Processes data based on metadata configurations, applying transformations and merging into Gold layer."""
    for metadata in metadata_list:
        try:
            source_container, source_path = metadata["SourceContainer"], metadata["SourcePath"]
            target_container, target_path = metadata["TargetContainer"], metadata["TargetPath"]
            source_format, target_format = metadata["SourceFormat"], metadata["TargetFormat"]
            unique_key, dataset_name = metadata["UniqueKey"], metadata.get("DatasetName", "")

            if not unique_key:
                print(f"⚠️ Skipping {dataset_name}: UniqueKey missing in metadata.")
                continue

            # Construct full paths
            source_full_path = f"wasbs://{source_container}@{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{source_path}"
            target_full_path = f"wasbs://{target_container}@{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{target_path}"

            print(f"🔄 Processing: {source_full_path} -> {target_full_path}")

            # Read source data
            source_df = spark.read.format(source_format).load(source_full_path)

            # Apply Gold Layer transformations
            gold_df = transform_gold_layer(source_df, dataset_name)

            # Perform Delta Merge
            merge_into_gold_layer(gold_df, target_full_path, unique_key)

        except Exception as e:
            print(f"❌ Error processing {metadata.get('SourcePath', '')}: {e}")

# ----------------------------------------------------------------------------------
# 6. MAIN EXECUTION
# ----------------------------------------------------------------------------------

if __name__ == "__main__":
    metadata_list = load_metadata()
    if metadata_list:
        process_data(metadata_list)
    else:
        print("⚠️ No metadata found for Gold layer processing.")


🔄 Reading metadata file from wasbs://config@cdmo.blob.core.windows.net/metadata_config_20250130.csv...
📌 Filtered metadata for 'Gold' layer: 5 entries found.
🔄 Processing: wasbs://02-silver@cdmo.blob.core.windows.net/customerfeedback_transformed/ -> wasbs://03-gold@cdmo.blob.core.windows.net/customer_feedback_aggregated/
✅ Applied Gold layer transformations for: 
🚀 Writing new table at wasbs://03-gold@cdmo.blob.core.windows.net/customer_feedback_aggregated/
🔄 Processing: wasbs://02-silver@cdmo.blob.core.windows.net/manufacturebatch_transformed/ -> wasbs://03-gold@cdmo.blob.core.windows.net/manufacture_batch_enriched/
✅ Applied Gold layer transformations for: 
🚀 Writing new table at wasbs://03-gold@cdmo.blob.core.windows.net/manufacture_batch_enriched/
🔄 Processing: wasbs://02-silver@cdmo.blob.core.windows.net/productformula_transformed/ -> wasbs://03-gold@cdmo.blob.core.windows.net/product_formula_enriched/
✅ Applied Gold layer transformations for: 
🚀 Writing new table at wasbs://03-go

In [0]:
from pyspark.sql import SparkSession

# Azure Storage connection details
storage_account_name = "cdmo"
storage_account_key = "bC98SSCcVIihXncqHoAabvjhZd9xyeXYucIb4zNydDd+NDxj3yxPT6BZoGbHiytAhl9IsdiHTQ3C+AStHPhiYA=="
container_name = "03-gold"

# Initialize Spark session
spark = SparkSession.builder.appName("GoldLayerValidation").getOrCreate()

# Set the Spark configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# List of tables in the gold layer
tables = [
    "customer_feedback_aggregated",
    "manufacture_batch_enriched",
    "product_formula_enriched",
    "sales_aggregated",
    "supplier_data_enriched"
]

# Validate table counts and sample data
def validate_gold_tables(container_name, tables):
    try:
        for table in tables:
            # Construct the full path for each table in the gold layer
            table_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{table}/"

            print(f"Validating table: {table} at {table_path}")
            
            # Read the table into a DataFrame
            try:
                df = spark.read.format("delta").load(table_path)

                # Get the count of records
                record_count = df.count()
                print(f"Table: {table}, Record Count: {record_count}")

                # Display a sample of the data
                print(f"Sample data for table: {table}")
                df.show(5, truncate=False)

            except Exception as e:
                print(f"Error reading table '{table}': {e}")
    except Exception as e:
        print(f"Error validating gold tables: {e}")

# Main Execution
if __name__ == "__main__":
    validate_gold_tables(container_name, tables)


Validating table: customer_feedback_aggregated at wasbs://03-gold@cdmo.blob.core.windows.net/customer_feedback_aggregated/
Table: customer_feedback_aggregated, Record Count: 100
Sample data for table: customer_feedback_aggregated
+---+------------------------------------+------------------------------------+------------------------------------+------+--------+------------+-----------------------------------------------------------------------------------------------------------------+-----------------------+-----------------------+
|id |FeedbackID                          |ProductID                           |CustomerID                          |Rating|Comments|FeedbackDate|Filename                                                                                                         |LoadTimestamp          |ProcessedTimestamp     |
+---+------------------------------------+------------------------------------+------------------------------------+------+--------+------------+---------

In [0]:
from pyspark.sql import SparkSession

# Azure Storage connection details
storage_account_name = "cdmo"
storage_account_key = "bC98SSCcVIihXncqHoAabvjhZd9xyeXYucIb4zNydDd+NDxj3yxPT6BZoGbHiytAhl9IsdiHTQ3C+AStHPhiYA=="
container_name = "03-gold"

# Initialize Spark session
spark = SparkSession.builder.appName("GoldLayerValidation").getOrCreate()

# Set the Spark configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# List of tables in the Gold Layer
tables = {
    "customer_feedback_aggregated": "FeedbackID",
    "manufacture_batch_enriched": "BatchID",
    "product_formula_enriched": "ProductID",
    "sales_aggregated": "OrderID",
    "supplier_data_enriched": "SupplierID"
}

# Validate table counts and sample data
def validate_gold_tables(container_name, tables):
    try:
        for table, primary_key in tables.items():
            # Construct the full path for each table in the Gold Layer
            table_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{table}/"

            print(f"\nValidating table: {table} at {table_path}")
            
            # Read the table into a DataFrame
            try:
                df = spark.read.format("delta").load(table_path)

                # Get the count of records
                record_count = df.count()
                print(f"Table: {table}, Record Count: {record_count}")

                # Check for duplicate primary keys
                duplicate_count = df.groupBy(primary_key).count().filter("count > 1").count()
                print(f"Table: {table}, Duplicate Primary Key Records: {duplicate_count}")

                # Display a sample of the data
                print(f"Sample data for table: {table}")
                df.display()

            except Exception as e:
                print(f"Error reading table '{table}': {e}")
    except Exception as e:
        print(f"Error validating gold tables: {e}")

# Main Execution
if __name__ == "__main__":
    validate_gold_tables(container_name, tables)
