In [0]:
pip install azure-storage-blob azure-storage-file-datalake

In [0]:
import csv
import os
from io import StringIO

from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, input_file_name, lit

# Azure Storage connection details
storage_account_name = "cdmo"
# Read the account key from the environment so a real secret never has to be
# saved with the notebook. The literal fallback is only the redacted
# placeholder and keeps the previous behaviour when the variable is unset.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "XXXXXXXXXXXXXXXX")
container_name = "config"
metadata_file_name = "metadata_config_20250127.csv"

# Set Spark configuration for Azure Blob Storage:
# register the account key so Spark can read/write wasbs:// paths on this account.
spark = SparkSession.builder.appName("MetadataDrivenLoads").getOrCreate()
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# Initialize BlobServiceClient (used by load_metadata and files_exist_in_path)
connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key}"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Load Metadata File from ADLS
def load_metadata():
    """Download the metadata CSV and return its 'Bronze' layer rows.

    The blob named by the notebook-level ``metadata_file_name`` is read from
    ``container_name`` through ``blob_service_client``.

    Returns:
        list[dict]: one dict per CSV row whose ``Layer`` value equals
        "bronze" after trimming whitespace and ignoring case.

    Raises:
        Exception: any download, decoding or parsing error (including
            ``KeyError`` when the file has no ``Layer`` column) is printed
            and then re-raised unchanged.
    """
    try:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=metadata_file_name)

        # Download the metadata content.
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
        # mark; with plain "utf-8" a BOM stays glued to the first header name,
        # so a first column called "Layer" could not be found.
        metadata_content = blob_client.download_blob().readall().decode("utf-8-sig")
        print(f"Successfully downloaded metadata file: {metadata_file_name}")

        # Parse CSV content and filter for 'Bronze' layer.
        # newline="" hands line endings to the csv module untouched, as the
        # csv documentation requires for correct handling of quoted fields.
        csv_reader = csv.DictReader(StringIO(metadata_content, newline=""))
        metadata_list = [
            row
            for row in csv_reader
            # DictReader fills the missing cells of a short row with None.
            if (row["Layer"] or "").strip().lower() == "bronze"
        ]

        print(f"Filtered metadata for 'Bronze' layer: {len(metadata_list)} entries found.")
        return metadata_list
    except Exception as e:
        print(f"Error loading metadata file: {e}")
        raise

# Check if files exist in the source path
def files_exist_in_path(container, source_path):
    """Return the names listed under ``source_path`` in ``container``, or None.

    Uses the notebook-level ``blob_service_client`` and collects the ``name``
    of every entry yielded by ``walk_blobs(name_starts_with=source_path)``.

    Returns:
        list[str] | None: the collected names, or ``None`` when nothing was
        listed or when listing failed (the error is printed, not raised).
    """
    # NOTE(review): walk_blobs may also yield virtual-directory prefixes, not
    # only blobs - confirm whether those should count as "files".
    try:
        container_client = blob_service_client.get_container_client(container)
        found = []
        for entry in container_client.walk_blobs(name_starts_with=source_path):
            found.append(entry.name)
    except Exception as e:
        print(f"Error checking files in path '{source_path}': {e}")
        return None
    # An empty listing is reported as None, never as [].
    return found or None

# Process Data Based on Metadata
def process_data(metadata_list):
    """Load every dataset described in ``metadata_list`` from source to target.

    For each entry the source path is checked with ``files_exist_in_path``;
    entries without files are skipped. Otherwise the source is read with
    Spark, a ``Filename`` column (and optionally ``LoadTimestamp``) is added,
    and the result is appended to the target path.

    Args:
        metadata_list: iterable of dicts with the keys ``SourceContainer``,
            ``SourcePath``, ``SourceFormat``, ``TargetContainer``,
            ``TargetPath``, ``TargetFormat`` and ``AddTimestamp``.

    Returns:
        None. A failure in one entry is printed and the loop moves on to the
        next entry; nothing is raised to the caller.
    """
    for metadata in metadata_list:
        try:
            # Extract metadata details
            source_container = metadata["SourceContainer"]
            source_path = metadata["SourcePath"]
            source_format = metadata["SourceFormat"]
            target_container = metadata["TargetContainer"]
            target_path = metadata["TargetPath"]
            target_format = metadata["TargetFormat"]
            # Tolerate padded values (" true ") and empty cells (None) coming
            # from the CSV; a missing key still raises KeyError as before.
            add_timestamp = str(metadata["AddTimestamp"] or "").strip().lower() == "true"

            # Check if files exist in the source path
            files = files_exist_in_path(source_container, source_path)
            if not files:
                print(f"No files found in source path: {source_path}. Skipping dataset.")
                continue

            # Construct full paths
            source_full_path = f"wasbs://{source_container}@{storage_account_name}.blob.core.windows.net/{source_path}"
            target_full_path = f"wasbs://{target_container}@{storage_account_name}.blob.core.windows.net/{target_path}"

            print(f"Processing: {source_full_path} -> {target_full_path}")

            # Read source data and include filename in each row
            source_df = (
                spark.read.format(source_format)
                .option("header", "true")
                .load(source_full_path)
                .withColumn("Filename", input_file_name())  # Add filename column
            )

            # Apply transformations if specified
            if add_timestamp:
                source_df = source_df.withColumn("LoadTimestamp", current_timestamp())

            # Save to target
            source_df.write.format(target_format).mode("append").save(target_full_path)
            print(f"Data successfully saved to {target_full_path}")

        except Exception as e:
            # Use .get(): the failure being reported may be the missing
            # 'SourcePath' key itself, and indexing it again here would raise
            # out of the handler and abort all remaining datasets.
            print(f"Error processing data for Source: {metadata.get('SourcePath', '<unknown>')}: {e}")

# Main Execution
if __name__ == "__main__":
    # Step 1: fetch the metadata rows that describe 'Bronze' layer loads
    metadata_list = load_metadata()

    # Step 2: run the source -> target load for each of those rows
    process_data(metadata_list=metadata_list)


In [0]:
from azure.storage.filedatalake import DataLakeServiceClient
import os

# Azure Storage connection details
account_name = "cdmo"
# Read the account key from the environment so a real secret never has to be
# saved with the notebook. The literal fallback is only the redacted
# placeholder and keeps the previous behaviour when the variable is unset.
account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "XXXXXXXXXXXXXX")
file_system_name = "00-landing"

# Initialize DataLakeServiceClient against the account's dfs endpoint
service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key,
)

# Function to list all files in a directory recursively
def list_files(service_client, file_system_name, directory_path):
    """Return the names of all non-directory paths under ``directory_path``.

    Args:
        service_client: client exposing ``get_file_system_client``.
        file_system_name: name of the file system to look in.
        directory_path: path passed to ``get_paths(path=...)``.

    Returns:
        list[str]: names of the entries whose ``is_directory`` flag is falsy;
        an empty list when listing fails (the error is printed, not raised).
    """
    try:
        fs_client = service_client.get_file_system_client(file_system_name)
        file_names = []
        for entry in fs_client.get_paths(path=directory_path):
            if entry.is_directory:
                continue  # directories are not files to move
            file_names.append(entry.name)
        return file_names
    except Exception as e:
        print(f"Error listing files in directory '{directory_path}': {e}")
        return []

# Function to copy a file from source to target
def copy_file(service_client, file_system_name, source_path, target_path):
    """Copy one file to another path inside the same file system.

    ``DataLakeFileClient`` does not offer ``start_copy_from_url`` (that is a
    Blob client method), so the previous implementation failed on every call
    and only printed the error. The copy is now done by downloading the
    source content and uploading it to the target.

    Args:
        service_client: client exposing ``get_file_system_client``.
        file_system_name: file system holding both paths.
        source_path: path of the file to read.
        target_path: path of the file to write (overwritten if present).

    Returns:
        bool: True when the copy completed, False when it failed (the error
        is printed, not raised) so callers can avoid deleting the source.
    """
    try:
        file_system_client = service_client.get_file_system_client(file_system_name)
        source_file_client = file_system_client.get_file_client(source_path)
        target_file_client = file_system_client.get_file_client(target_path)

        print(f"Copy initiated from '{source_path}' to '{target_path}'")

        # NOTE: readall() holds the whole file in memory; acceptable for
        # landing-zone files, revisit if very large files must be archived.
        content = source_file_client.download_file().readall()
        target_file_client.upload_data(content, overwrite=True)

        print(f"File copied successfully to '{target_path}'")
        return True
    except Exception as e:
        print(f"Error copying file from '{source_path}' to '{target_path}': {e}")
        return False

# Function to delete a file after it is copied
def delete_file(service_client, file_system_name, file_path):
    """Delete ``file_path`` from the given file system.

    Failures are printed and swallowed; the function always returns None.
    """
    try:
        # Resolve service -> file system -> file, then issue the delete.
        target = service_client.get_file_system_client(file_system_name).get_file_client(file_path)
        target.delete_file()
    except Exception as e:
        print(f"Error deleting file '{file_path}': {e}")
    else:
        print(f"Deleted file: {file_path}")

# Function to move files from incoming to archive
def move_files_to_archive(service_client, file_system_name, source_base_dir, target_base_dir):
    """Move every file under ``source_base_dir`` to ``target_base_dir``.

    Each file is copied with ``copy_file`` to the same relative path below
    ``target_base_dir`` and the original is then removed with ``delete_file``.
    The original is only deleted when the copy did not report failure and the
    target can be read back with the same size as the source, so a failed
    copy no longer destroys the only copy of the data.

    Args:
        service_client: client exposing ``get_file_system_client``.
        file_system_name: file system holding both directories.
        source_base_dir: directory whose files are moved.
        target_base_dir: directory that receives the files.

    Returns:
        None. Errors are printed, not raised.
    """
    # Data-lake paths always use "/", whatever OS the notebook runs on.
    import posixpath

    try:
        files = list_files(service_client, file_system_name, source_base_dir)

        if not files:
            print(f"No files found in the source directory: {source_base_dir}")
            return

        file_system_client = service_client.get_file_system_client(file_system_name)
        source_prefix = source_base_dir.strip("/")

        for file in files:
            # Generate source and target paths. Only the leading base
            # directory is removed: str.replace() would also strip the same
            # text from deeper folders and flatten the archived path.
            source_path = file
            if source_prefix and (file == source_prefix or file.startswith(source_prefix + "/")):
                relative_path = file[len(source_prefix):]
            else:
                relative_path = file
            relative_path = relative_path.lstrip("/")
            target_path = posixpath.join(target_base_dir, relative_path)

            # Copy the file
            copied = copy_file(service_client, file_system_name, source_path, target_path)

            # `copied is False` covers a copy_file that reports failure; the
            # size check also protects when copy_file returns nothing.
            if copied is False or not _archive_copy_verified(file_system_client, source_path, target_path):
                print(f"Skipping delete of '{source_path}': copy to '{target_path}' could not be verified")
                continue

            # Delete the original file
            delete_file(service_client, file_system_name, source_path)

    except Exception as e:
        print(f"Error moving files from '{source_base_dir}' to '{target_base_dir}': {e}")


def _archive_copy_verified(file_system_client, source_path, target_path):
    """Return True when the target exists and has the same size as the source."""
    try:
        source_size = file_system_client.get_file_client(source_path).get_file_properties().size
        target_size = file_system_client.get_file_client(target_path).get_file_properties().size
    except Exception as e:
        # A missing target (or any lookup failure) means "not verified".
        print(f"Could not verify archived copy '{target_path}': {e}")
        return False
    return source_size == target_size

# Main Execution
if __name__ == "__main__":
    # Landing-zone folders: files are moved from "incoming" to "archive".
    source_base_dir = "data/incoming"
    target_base_dir = "data/archive"

    move_files_to_archive(
        service_client=service_client,
        file_system_name=file_system_name,
        source_base_dir=source_base_dir,
        target_base_dir=target_base_dir,
    )


In [0]:
# Container that holds the bronze-layer Delta tables.
# NOTE: this rebinds the notebook-level `container_name`, which load_metadata()
# also reads; re-running load_metadata() after this cell would use this value.
container_name = "01-bronze"

# Obtain the Spark session for the validation run.
spark = SparkSession.builder.appName("BronzeLayerValidation").getOrCreate()

# Register the storage account key so wasbs:// paths on this account can be
# read; `storage_account_name` / `storage_account_key` come from an earlier cell.
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# Bronze-layer tables to validate, one folder per table.
tables = ["customerfeedback", "manufacturebatch", "productformula", "sales", "supplier"]

# Validate table counts
def validate_bronze_tables(container_name, tables):
    """Print the record count of every listed Delta table.

    Args:
        container_name: container part of the ``wasbs://`` table path.
        tables: iterable of table folder names inside that container.

    Returns:
        None. A table that cannot be read is reported with a printed error
        and the remaining tables are still processed.
    """
    try:
        for table in tables:
            # Location of this table's Delta folder in the bronze container.
            table_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{table}/"

            print(f"Validating table: {table} at {table_path}")

            try:
                # Load the Delta table and count its rows in one expression.
                record_count = spark.read.format("delta").load(table_path).count()
                print(f"Table: {table}, Record Count: {record_count}")
            except Exception as e:
                print(f"Error reading table '{table}': {e}")
    except Exception as e:
        print(f"Error validating bronze tables: {e}")

# Main Execution
if __name__ == "__main__":
    # Count the rows of every configured bronze table.
    validate_bronze_tables(container_name=container_name, tables=tables)


In [0]:
from pyspark.sql import SparkSession

import os

# Azure Storage connection details
storage_account_name = "cdmo"
# Read the account key from the environment so a real secret never has to be
# saved with the notebook. The literal fallback is only the redacted
# placeholder and keeps the previous behaviour when the variable is unset.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "XXXXXXXXXXXX")
container_name = "01-bronze"

# Initialize Spark session
spark = SparkSession.builder.appName("BronzeLayerValidation").getOrCreate()

# Set the Spark configuration for Azure Blob Storage so wasbs:// paths on
# this account can be read.
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# List of tables in the Bronze layer
tables = [
    "customerfeedback",
    "manufacturebatch",
    "productformula",
    "sales",
    "supplier"
]

# Validate table counts and display records
def validate_bronze_tables(container_name, tables):
    """Print the record count and show the first 5 rows of each Delta table.

    Args:
        container_name: container part of the ``wasbs://`` table path.
        tables: iterable of table folder names inside that container.

    Returns:
        None. A table that cannot be read or displayed is reported with a
        printed error and the remaining tables are still processed.
    """
    try:
        for table in tables:
            # Construct the full path for each table in the Bronze layer
            table_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{table}/"

            print(f"Validating table: {table} at {table_path}")

            # Read the table into a DataFrame
            try:
                df = spark.read.format("delta").load(table_path)

                # Get the count of records
                record_count = df.count()
                print(f"Table: {table}, Record Count: {record_count}")

                # Display the first 5 records for validation.
                # display() takes no row-count argument, so the sample size
                # is enforced with limit(5) before rendering.
                print(f"Displaying the first 5 records for table: {table}")
                df.limit(5).display()

            except Exception as e:
                print(f"Error reading table '{table}': {e}")
    except Exception as e:
        print(f"Error validating bronze tables: {e}")

# Main Execution
if __name__ == "__main__":
    # Count and preview every configured bronze table.
    validate_bronze_tables(container_name=container_name, tables=tables)
