In [0]:
pip install azure-storage-blob azure-storage-file-datalake

In [0]:
import csv
from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, trim, col, lit, concat_ws, expr
from io import StringIO

# Azure Storage connection details
# NOTE(review): the account key is embedded in source; presumably a redacted
# placeholder — TODO confirm it is injected from a secret store before running.
storage_account_name = "cdmo"
storage_account_key = "XXXXXXXXXXXXXXX"
# Container and blob name of the metadata CSV that load_metadata() downloads.
container_name = "config"
metadata_file_name = "metadata_config_20250127.csv"

# Set Spark configuration for Azure Blob Storage
# getOrCreate() returns the active session if one exists, otherwise builds one.
spark = SparkSession.builder.appName("GoldLayerProcessing").getOrCreate()
# Register the account key so Spark can access this storage account's blob endpoint.
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# Initialize BlobServiceClient
# The same account name/key pair is reused for the Azure SDK client that
# downloads the metadata file.
connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key}"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Load Metadata File from ADLS
def load_metadata():
    """Download the metadata CSV and return its 'Gold' layer rows.

    The blob named by the module-level ``metadata_file_name`` is fetched from
    ``container_name`` through ``blob_service_client`` and parsed with
    ``csv.DictReader``.

    Returns:
        list[dict]: One dict per CSV row whose ``Layer`` value equals
        ``"gold"`` after stripping whitespace and lower-casing.

    Raises:
        KeyError: If the CSV has no ``Layer`` column.
        Exception: Any download or decoding error is printed and re-raised.
    """
    try:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=metadata_file_name)

        # Decode with utf-8-sig so a leading byte-order mark is dropped
        # instead of being glued onto the first header name; input without
        # a BOM decodes exactly as with plain utf-8.
        metadata_content = blob_client.download_blob().readall().decode("utf-8-sig")
        print(f"Successfully downloaded metadata file: {metadata_file_name}")

        # Parse CSV content and filter for 'Gold' layer.
        # DictReader fills missing trailing fields of a short row with None,
        # so fall back to "" rather than calling .strip() on None. A missing
        # 'Layer' column still raises KeyError.
        csv_reader = csv.DictReader(StringIO(metadata_content))
        metadata_list = [
            row
            for row in csv_reader
            if (row["Layer"] or "").strip().lower() == "gold"
        ]

        print(f"Filtered metadata for 'Gold' layer: {len(metadata_list)} entries found.")
        return metadata_list
    except Exception as e:
        print(f"Error loading metadata file: {e}")
        raise

# Apply transformations specific to the Gold Layer
def transform_gold_layer(source_df, dataset_name):
    """Add the Gold-layer columns to ``source_df``.

    Every dataset receives a ``ProcessedTimestamp`` column. In addition, at
    most one dataset-specific derived column is appended when ``dataset_name``
    equals one of the names in the rule table below; any other name (including
    ``None``) gets the timestamp column only.

    Args:
        source_df: Spark DataFrame to enrich.
        dataset_name: Dataset identifier used to pick the derived column.

    Returns:
        The DataFrame with the added column(s).

    Raises:
        Exception: Any error raised while adding columns is printed and
        re-raised.
    """
    # (dataset name, derived column name, factory for the column expression).
    # Factories are called lazily so only the matching expression is built.
    derived_column_rules = (
        (
            "customerfeedback",
            "Sentiment",
            lambda: expr("CASE WHEN Rating >= 4 THEN 'Positive' ELSE 'Negative' END"),
        ),
        (
            "manufacturebatch",
            "BatchStatus",
            lambda: expr("CASE WHEN Status = 'Completed' THEN 'Closed' ELSE 'Open' END"),
        ),
        (
            "productformula",
            "PrimaryIngredientList",
            lambda: concat_ws(", ", col("PrimaryIngredients")),
        ),
        (
            "sales",
            "TotalRevenue",
            lambda: expr("Quantity * TotalAmount"),
        ),
        (
            "supplier",
            "SupplierDetails",
            lambda: concat_ws(" - ", col("SupplierName"), col("Material")),
        ),
    )

    try:
        # Applied to every dataset regardless of name.
        enriched_df = source_df.withColumn("ProcessedTimestamp", current_timestamp())

        # First matching rule wins; names are distinct so at most one applies.
        for rule_name, column_name, build_column in derived_column_rules:
            if dataset_name == rule_name:
                enriched_df = enriched_df.withColumn(column_name, build_column())
                break

        print(f"Gold layer transformations applied successfully for dataset: {dataset_name}")
        return enriched_df
    except Exception as error:
        print(f"Error during Gold layer transformations: {error}")
        raise

# Perform Delta Merge into Gold Layer
def merge_into_gold_layer(source_df, target_path, unique_key):
    """Upsert ``source_df`` into the Delta table stored at ``target_path``.

    If the existence check reports no table at ``target_path``, the source
    DataFrame is written there as a new Delta table and the function returns.
    Otherwise a ``MERGE INTO`` updates rows whose key matches and inserts the
    rest.

    Args:
        source_df: Spark DataFrame holding the rows to merge.
        target_path: Storage path of the Gold Delta table.
        unique_key: Column name used to match source and target rows. A
            comma-separated list (e.g. ``"OrderID, LineNo"``) is treated as a
            composite key; a single name behaves as before.

    Raises:
        ValueError: If ``unique_key`` contains no column name.
        Exception: Any Spark/Delta error is printed and re-raised.
    """
    try:
        # NOTE(review): this goes through the private JVM session handle and
        # relies on the catalog resolving a path-based `delta.` identifier —
        # confirm it behaves as intended on the target runtime.
        if not spark._jsparkSession.catalog().tableExists(f"delta.`{target_path}`"):
            print(f"Target path {target_path} does not exist. Writing as new table.")
            source_df.write.format("delta").mode("overwrite").save(target_path)
            return

        key_columns = [name.strip() for name in unique_key.split(",") if name.strip()]
        if not key_columns:
            raise ValueError("unique_key must name at least one column")

        # Only the source needs a view: the MERGE statement addresses the
        # target directly by path, so loading the target into a DataFrame and
        # registering it as a view was unused work.
        source_df.createOrReplaceTempView("source_temp_view")

        match_condition = " AND ".join(
            f"target.{name} = source.{name}" for name in key_columns
        )

        # Merge query to update existing records and insert new ones
        merge_query = f"""
        MERGE INTO delta.`{target_path}` AS target
        USING source_temp_view AS source
        ON {match_condition}
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
        spark.sql(merge_query)
        print(f"Data successfully merged into Gold Layer at {target_path}")
    except Exception as e:
        print(f"Error during Delta Merge: {e}")
        raise

# Process Data Based on Metadata
def process_data(metadata_list):
    """Run read -> transform -> merge for every metadata entry.

    Each entry is handled independently: an error in one dataset is printed
    and the loop moves on to the next entry.

    Args:
        metadata_list: Iterable of dicts with the keys ``SourceContainer``,
            ``SourcePath``, ``SourceFormat``, ``TargetContainer`` and
            ``TargetPath``; ``UniqueKey`` and ``DatasetName`` are optional.
            Entries without a non-empty ``UniqueKey`` are skipped.

    Returns:
        None.
    """
    for metadata in metadata_list:
        try:
            # Extract metadata details
            source_container = metadata["SourceContainer"]
            source_path = metadata["SourcePath"]
            # NOTE(review): SourceFormat is required but not otherwise used;
            # the source is always read as Delta below.
            source_format = metadata["SourceFormat"]
            target_container = metadata["TargetContainer"]
            target_path = metadata["TargetPath"]
            unique_key = metadata.get("UniqueKey")  # Use get() to handle missing keys
            dataset_name = metadata.get("DatasetName")

            # Validate UniqueKey
            if not unique_key:
                print(f"Skipping dataset {source_path}: 'UniqueKey' not provided in metadata.")
                continue

            # Construct full paths
            source_full_path = f"wasbs://{source_container}@{storage_account_name}.blob.core.windows.net/{source_path}"
            target_full_path = f"wasbs://{target_container}@{storage_account_name}.blob.core.windows.net/{target_path}"

            print(f"Processing: {source_full_path} -> {target_full_path}")

            # Read source data in Delta format
            source_df = spark.read.format("delta").load(source_full_path)

            # Apply Gold Layer transformations
            gold_df = transform_gold_layer(source_df, dataset_name)

            # Perform Delta Merge
            merge_into_gold_layer(gold_df, target_full_path, unique_key)

        except Exception as e:
            # Use .get() here: if the failure was a missing 'SourcePath'
            # column, indexing it again would raise inside the handler and
            # abort the remaining datasets.
            source_label = metadata.get("SourcePath", "<unknown>")
            print(f"Error processing data for Source: {source_label}: {e}")

# Main Execution
if __name__ == "__main__":
    # Load metadata filtered for 'Gold' layer
    # load_metadata() re-raises on failure, so nothing is processed when the
    # metadata file cannot be read or parsed.
    metadata_list = load_metadata()

    # Process data based on metadata
    # process_data() prints per-dataset errors and continues with the next
    # entry instead of raising.
    process_data(metadata_list)


In [0]:
from pyspark.sql import SparkSession

# Azure Storage connection details
# NOTE(review): the account key is embedded in source; presumably a redacted
# placeholder — TODO confirm it is injected from a secret store before running.
storage_account_name = "cdmo"
storage_account_key = "XXXXXXXXXXXXXXX"
# Container whose tables are validated below.
container_name = "03-gold"

# Initialize Spark session
# getOrCreate() returns the active session if one exists, otherwise builds one.
spark = SparkSession.builder.appName("GoldLayerValidation").getOrCreate()

# Set the Spark configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# List of tables in the gold layer
# Each name is used as the folder name under the container when the table
# path is built in validate_gold_tables().
tables = [
    "customer_feedback_aggregated",
    "manufacture_batch_enriched",
    "product_formula_enriched",
    "sales_aggregated",
    "supplier_data_enriched"
]

# Validate table counts and sample data
def _show_gold_table(container_name, table_name):
    """Print the row count and a five-row sample of one gold Delta table.

    Any error while reading or showing the table is printed, not raised.
    """
    location = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{table_name}/"
    print(f"Validating table: {table_name} at {location}")

    try:
        table_df = spark.read.format("delta").load(location)

        row_total = table_df.count()
        print(f"Table: {table_name}, Record Count: {row_total}")

        print(f"Sample data for table: {table_name}")
        table_df.show(5, truncate=False)
    except Exception as read_error:
        print(f"Error reading table '{table_name}': {read_error}")


def validate_gold_tables(container_name, tables):
    """Print a record count and sample rows for each gold table.

    Args:
        container_name: Storage container that holds the tables.
        tables: Iterable of table (folder) names inside the container.

    Returns:
        None. Failures are reported with ``print``; nothing is raised. An
        error outside the per-table read (for example while iterating
        ``tables``) stops the remaining tables from being checked.
    """
    try:
        for table_name in tables:
            _show_gold_table(container_name, table_name)
    except Exception as loop_error:
        print(f"Error validating gold tables: {loop_error}")

# Main Execution
if __name__ == "__main__":
    # Validate every table named in `tables` inside the `container_name` container.
    validate_gold_tables(container_name, tables)


In [0]:
from pyspark.sql import SparkSession

# Azure Storage connection details
# NOTE(review): the account key is embedded in source; presumably a redacted
# placeholder — TODO confirm it is injected from a secret store before running.
storage_account_name = "cdmo"
storage_account_key = "XXXXXXXXXXXXXXX"
# Container whose tables are validated below.
container_name = "03-gold"

# Initialize Spark session
# getOrCreate() returns the active session if one exists, otherwise builds one.
spark = SparkSession.builder.appName("GoldLayerValidation").getOrCreate()

# Set the Spark configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# List of tables in the Gold Layer
# Maps each table (folder) name to the column that validate_gold_tables()
# groups by when counting duplicate keys.
tables = {
    "customer_feedback_aggregated": "FeedbackID",
    "manufacture_batch_enriched": "BatchID",
    "product_formula_enriched": "ProductID",
    "sales_aggregated": "OrderID",
    "supplier_data_enriched": "SupplierID"
}

# Validate table counts and sample data
def _audit_gold_table(container_name, table_name, key_column):
    """Print row count, duplicate-key count and contents of one gold table.

    The duplicate figure is the number of distinct ``key_column`` values that
    occur on more than one row. Any error while reading or inspecting the
    table is printed, not raised.
    """
    location = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{table_name}/"
    print(f"\nValidating table: {table_name} at {location}")

    try:
        table_df = spark.read.format("delta").load(location)

        row_total = table_df.count()
        print(f"Table: {table_name}, Record Count: {row_total}")

        # Rows per key value, keeping only keys seen more than once.
        rows_per_key = table_df.groupBy(key_column).count()
        repeated_keys = rows_per_key.filter("count > 1").count()
        print(f"Table: {table_name}, Duplicate Primary Key Records: {repeated_keys}")

        print(f"Sample data for table: {table_name}")
        table_df.display()
    except Exception as read_error:
        print(f"Error reading table '{table_name}': {read_error}")


def validate_gold_tables(container_name, tables):
    """Print counts, duplicate-key counts and data for each gold table.

    Args:
        container_name: Storage container that holds the tables.
        tables: Mapping of table (folder) name to its primary-key column.

    Returns:
        None. Failures are reported with ``print``; nothing is raised. An
        error outside the per-table read (for example while iterating
        ``tables``) stops the remaining tables from being checked.
    """
    try:
        for table_name, key_column in tables.items():
            _audit_gold_table(container_name, table_name, key_column)
    except Exception as loop_error:
        print(f"Error validating gold tables: {loop_error}")

# Main Execution
if __name__ == "__main__":
    # Validate every table in the `tables` mapping, including the duplicate
    # check on each table's key column.
    validate_gold_tables(container_name, tables)
