In [0]:
from pyspark.sql import Row
import datetime
import json
import os

In [0]:
# utils/logging_utils

from pyspark.sql import Row
import datetime

def log_pipeline_stats(stage: str, stats: dict, table_name: str = "pipeline.pipeline_stats") -> None:
    """
    Append one row of pipeline statistics to a Delta table.

    The row is built from a copy of ``stats`` plus two common fields
    ("stage" and "timestamp"), so the caller's dictionary is left untouched
    and can safely be reused or inspected after the call.

    :param stage: The pipeline stage (e.g., "ingestion"); stored under the "stage" key.
    :param stats: A dictionary of metrics to log. Any existing "stage" or
        "timestamp" entry is overridden in the logged row (not in ``stats``).
    :param table_name: The name of the Delta table to store logs (default: "pipeline.pipeline_stats").
    :raises Exception: Any error hit while building or writing the row is
        re-raised after a diagnostic message has been printed.
    """
    try:
        # Build the row from a shallow copy: assigning the common fields
        # directly into ``stats`` would leak them into the caller's dict.
        record = {
            **stats,
            "stage": stage,
            "timestamp": datetime.datetime.now(),  # Keep as TIMESTAMP
        }

        # Convert the record to a single-row DataFrame.
        # NOTE: relies on a global ``spark`` session that is not defined in
        # this block (presumably provided by the notebook runtime).
        logs_df = spark.createDataFrame([Row(**record)])

        # Append to the Delta table; "mergeSchema" is enabled so that a record
        # whose keys differ from the table's current columns can still be written.
        writer = (
            logs_df.write.format("delta")
            .mode("append")
            .option("mergeSchema", "true")
        )
        writer.saveAsTable(table_name)

    except Exception as e:
        # Surface the failure in the notebook/driver output, then propagate it
        # so the calling stage does not silently continue without its log row.
        print(f"Failed to log pipeline stats for stage '{stage}' in table '{table_name}'. Error: {e}")
        raise