In [0]:
pip install azure-storage-blob azure-storage-file-datalake

In [0]:
import csv
from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, trim, input_file_name
from io import StringIO

# Azure Storage connection details
# NOTE(review): the account key is hard-coded below; consider reading it from a
# secret store or an environment variable rather than keeping it in the source.
storage_account_name = "cdmo"
storage_account_key = "XXXXXXXXXXXX"
container_name = "config"  # container that load_metadata() reads the metadata CSV from
metadata_file_name = "metadata_config_20250127.csv"

# Set Spark configuration for Azure Blob Storage
# The account key is registered on the session so wasbs:// paths for this
# storage account can be read and written by Spark.
spark = SparkSession.builder.appName("SilverLayerProcessing").getOrCreate()
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# Initialize BlobServiceClient
# Shared module-level client used by load_metadata() and files_exist_in_path().
connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key}"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Load Metadata File from ADLS
def load_metadata():
    """Download the metadata CSV and return its 'Silver' layer rows.

    The blob ``metadata_file_name`` is read from container ``container_name``
    through the module-level ``blob_service_client``, parsed with
    ``csv.DictReader`` and filtered to rows whose ``Layer`` value equals
    "silver" (case-insensitive, surrounding whitespace ignored).

    Returns:
        list[dict]: one dict per matching CSV row, keyed by the header names.

    Raises:
        KeyError: if a data row is reached and the CSV has no ``Layer`` column.
        Exception: any download/decoding error is printed and re-raised.
    """
    try:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=metadata_file_name)

        # Download the metadata content.
        # "utf-8-sig" drops a leading byte-order mark if one is present, so the
        # first header is read as "Layer"/"SourceContainer"/... and not
        # "\ufeff<name>"; files without a BOM decode exactly as with "utf-8".
        metadata_content = blob_client.download_blob().readall().decode("utf-8-sig")
        print(f"Successfully downloaded metadata file: {metadata_file_name}")

        # Parse CSV content and filter for 'Silver' layer.
        # DictReader fills missing trailing fields with None, so a short row is
        # treated as "not silver" instead of crashing on None.strip().
        csv_reader = csv.DictReader(StringIO(metadata_content))
        metadata_list = [
            row
            for row in csv_reader
            if (row["Layer"] or "").strip().lower() == "silver"
        ]

        print(f"Filtered metadata for 'Silver' layer: {len(metadata_list)} entries found.")
        return metadata_list
    except Exception as e:
        print(f"Error loading metadata file: {e}")
        raise

# Check if files exist in the source path
def files_exist_in_path(container, source_path):
    """List the entry names found under ``source_path`` in ``container``.

    Uses the module-level ``blob_service_client`` and ``walk_blobs`` with
    ``source_path`` as the name prefix.

    Returns:
        list[str] | None: the ``name`` of every entry yielded by the listing,
        or None when nothing is listed or when the listing raises (the error
        is printed, not propagated).
    """
    try:
        container_client = blob_service_client.get_container_client(container)
        found_names = []
        for entry in container_client.walk_blobs(name_starts_with=source_path):
            found_names.append(entry.name)
    except Exception as err:
        print(f"Error checking files in path '{source_path}': {err}")
        return None

    # An empty listing is reported as None, same as a failed listing.
    return found_names or None

# Apply transformations specific to the Silver Layer
def transform_silver_layer(source_df):
    """Trim all string columns and stamp each row with the processing time.

    Args:
        source_df: Spark DataFrame to clean.

    Returns:
        A DataFrame with the same columns in the same order, every column of
        dtype ``string`` trimmed, plus a ``ProcessedTimestamp`` column set to
        ``current_timestamp()`` (replacing a column of that name if present).

    Raises:
        Exception: any error raised while building the result is printed and
        re-raised.
    """

    def _quoted(name):
        # Backtick-quote so names containing dots are resolved as a single
        # column rather than as a nested-field path.
        return "`" + name.replace("`", "``") + "`"

    try:
        string_cols = {col_name for col_name, dtype in source_df.dtypes if dtype == "string"}

        # Build every column in one select: a withColumn call per column adds
        # one projection each to the query plan, which grows with table width.
        projected = [
            trim(source_df[_quoted(col_name)]).alias(col_name)
            if col_name in string_cols
            else source_df[_quoted(col_name)]
            for col_name in source_df.columns
        ]
        source_df = source_df.select(*projected)

        # withColumn (not select) keeps the replace-if-exists behaviour.
        source_df = source_df.withColumn("ProcessedTimestamp", current_timestamp())
        print("Silver layer transformations applied successfully.")
        return source_df
    except Exception as e:
        print(f"Error during Silver layer transformations: {e}")
        raise

# Perform Delta Merge into Silver Layer
def merge_into_silver_layer(source_df, target_path, unique_key):
    """Upsert ``source_df`` into the Delta table stored at ``target_path``.

    When the existence check reports no table, ``source_df`` is written to
    ``target_path`` as a new Delta table (overwrite mode) and the function
    returns. Otherwise a ``MERGE INTO`` updates target rows whose
    ``unique_key`` equals a source row's and inserts the remaining source rows.

    Args:
        source_df: Spark DataFrame holding the rows to merge.
        target_path: storage path of the target Delta table.
        unique_key: column name used as the join key; it is interpolated
            directly into the MERGE statement.

    Raises:
        Exception: any error from the write or the merge is printed and
        re-raised.
    """
    try:
        # NOTE(review): this goes through the private JVM session handle with a
        # path-style identifier. Confirm on the target runtime that it returns
        # True for an existing path-based table; if it does not, every run
        # takes the overwrite branch below instead of merging.
        if not spark._jsparkSession.catalog().tableExists(f"delta.`{target_path}`"):
            print(f"Target path {target_path} does not exist. Writing as new table.")
            source_df.write.format("delta").mode("overwrite").save(target_path)
            return

        # The MERGE statement addresses the target by path, so only the source
        # needs to be exposed as a temporary view.
        source_df.createOrReplaceTempView("source_temp_view")

        # Merge query to update existing records and insert new ones
        merge_query = f"""
        MERGE INTO delta.`{target_path}` AS target
        USING source_temp_view AS source
        ON target.{unique_key} = source.{unique_key}
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
        spark.sql(merge_query)
        print(f"Data successfully merged into Silver Layer at {target_path}")
    except Exception as e:
        print(f"Error during Delta Merge: {e}")
        raise

# Process Data Based on Metadata
def process_data(metadata_list):
    """Run the Silver-layer load for every metadata entry.

    For each entry the source is read as Delta from
    ``SourceContainer``/``SourcePath``, passed through
    ``transform_silver_layer`` and merged into
    ``TargetContainer``/``TargetPath`` on ``UniqueKey``.

    An entry is skipped (with a printed message) when ``UniqueKey`` is missing
    or empty, or when nothing is listed under the source path. Any error while
    handling one entry is printed and processing continues with the next one.

    Args:
        metadata_list: iterable of dict-like rows, as returned by
            ``load_metadata``.
    """
    for metadata in metadata_list:
        try:
            # Extract only the fields this layer actually uses, so an absent
            # column that is irrelevant here cannot fail the dataset.
            source_container = metadata["SourceContainer"]
            source_path = metadata["SourcePath"]
            target_container = metadata["TargetContainer"]
            target_path = metadata["TargetPath"]
            unique_key = metadata.get("UniqueKey")  # Use get() to handle missing keys

            # Validate UniqueKey
            if not unique_key:
                print(f"Skipping dataset {source_path}: 'UniqueKey' not provided in metadata.")
                continue

            # Check if files exist in the source path
            files = files_exist_in_path(source_container, source_path)
            if not files:
                print(f"No files found in source path: {source_path}. Skipping dataset.")
                continue

            # Construct full paths
            source_full_path = f"wasbs://{source_container}@{storage_account_name}.blob.core.windows.net/{source_path}"
            target_full_path = f"wasbs://{target_container}@{storage_account_name}.blob.core.windows.net/{target_path}"

            print(f"Processing: {source_full_path} -> {target_full_path}")

            # Read source data in Delta format
            source_df = spark.read.format("delta").load(source_full_path)

            # Apply Silver Layer transformations
            silver_df = transform_silver_layer(source_df)

            # Perform Delta Merge
            merge_into_silver_layer(silver_df, target_full_path, unique_key)

        except Exception as e:
            # .get() here: the failure being reported may be a missing
            # 'SourcePath' itself, and indexing would raise from inside the
            # handler and abort the remaining datasets.
            print(f"Error processing data for Source: {metadata.get('SourcePath')}: {e}")

# Main Execution
# Entry point: fetch the Silver-layer metadata rows, then process each of them.
if __name__ == "__main__":
    # Load metadata filtered for 'Silver' layer
    # (load_metadata re-raises on failure, so processing is not attempted then)
    metadata_list = load_metadata()

    # Process data based on metadata
    process_data(metadata_list)


In [0]:
# Container whose tables are validated below.
# storage_account_name / storage_account_key are not defined in this cell;
# they are taken from the earlier cell that sets the connection details.
container_name = "02-silver"

# Initialize Spark session
spark = SparkSession.builder.appName("SilverLayerValidation").getOrCreate()

# Set the Spark configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# List of tables to validate in the silver container
tables = [
    "customerfeedback",
    "manufacturebatch",
    "productformula",
    "sales",
    "supplier"
]

# Validate table counts
def validate_silver_tables(container_name, tables):
    """Print the record count of each Delta table in ``container_name``.

    Every name in ``tables`` is loaded from
    ``wasbs://<container_name>@<storage_account_name>.blob.core.windows.net/<table>/``
    and its row count is printed. A table that cannot be read is reported and
    the loop moves on to the next one. Nothing is returned.

    Args:
        container_name: container holding the silver tables.
        tables: iterable of table (folder) names inside that container.
    """
    try:
        for table_name in tables:
            # Location of this table inside the silver container
            location = "wasbs://{}@{}.blob.core.windows.net/{}/".format(
                container_name, storage_account_name, table_name
            )
            print(f"Validating table: {table_name} at {location}")

            try:
                # Load the Delta table and count its rows in one go
                row_total = spark.read.format("delta").load(location).count()
                print(f"Table: {table_name}, Record Count: {row_total}")
            except Exception as read_err:
                print(f"Error reading table '{table_name}': {read_err}")
    except Exception as outer_err:
        # Anything failing outside the per-table read ends the validation run.
        print(f"Error validating silver tables: {outer_err}")

# Main Execution
# Entry point: print the record count of every table listed in `tables`.
if __name__ == "__main__":
    validate_silver_tables(container_name, tables)


In [0]:
from pyspark.sql import SparkSession

# Azure Storage connection details
# NOTE(review): the account key is hard-coded below; consider reading it from a
# secret store or an environment variable rather than keeping it in the source.
storage_account_name = "cdmo"
storage_account_key = "XXXXXXXXXXXXXXXX"
container_name = "02-silver"  # container whose tables are validated below

# Initialize Spark session
spark = SparkSession.builder.appName("SilverLayerValidation").getOrCreate()

# Set the Spark configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# List of tables to validate in the silver container
tables = [
    "customerfeedback",
    "manufacturebatch",
    "productformula",
    "sales",
    "supplier"
]

# Validate table counts and display records
def validate_silver_tables(container_name, tables):
    """Print the record count and show the first 5 rows of each silver table.

    Every name in ``tables`` is loaded as a Delta table from
    ``wasbs://<container_name>@<storage_account_name>.blob.core.windows.net/<table>/``.
    A table that cannot be read is reported and the loop continues with the
    next one. Nothing is returned.

    Args:
        container_name: container holding the silver tables.
        tables: iterable of table (folder) names inside that container.
    """
    try:
        for table in tables:
            # Construct the full path for each table in the silver container
            table_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{table}/"

            print(f"Validating table: {table} at {table_path}")

            # Read the table into a DataFrame
            try:
                df = spark.read.format("delta").load(table_path)

                # Get the count of records
                record_count = df.count()
                print(f"Table: {table}, Record Count: {record_count}")

                # Display the first 5 records for validation.
                # limit(5) makes the output match the message; without it the
                # whole DataFrame was handed to display().
                print(f"Displaying the first 5 records for table: {table}")
                df.limit(5).display()

            except Exception as e:
                print(f"Error reading table '{table}': {e}")
    except Exception as e:
        print(f"Error validating silver tables: {e}")

# Main Execution
# Entry point: print the count and a sample of every table listed in `tables`.
if __name__ == "__main__":
    validate_silver_tables(container_name, tables)
