In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Sample sales data: revenue per product and month (months written as "YYYY-MM").
monthly_revenue = {
    "ProductA": [("2024-01", 100), ("2024-02", 120), ("2024-03", 900)],
    "ProductB": [("2024-01", 200), ("2024-02", 210), ("2024-03", 200)],
    "ProductC": [("2024-03", 250)],
}

# Flatten into (Product, Month, Revenue) rows; dict insertion order keeps the
# rows in the same sequence as they are listed above.
data = [
    (product, month, revenue)
    for product, entries in monthly_revenue.items()
    for month, revenue in entries
]

columns = ["Product", "Month", "Revenue"]

# `spark` is the session object provided by the notebook runtime.
df = spark.createDataFrame(data, columns)

In [0]:
# One window per product, rows ordered by Month. Month is a "YYYY-MM" string
# in the sample data, so the ordering is a string ordering.
windowSpec = Window.partitionBy("Product").orderBy("Month")

# Revenue of the previous / following row within the same product window.
prev_revenue = lag("Revenue", 1).over(windowSpec)
next_revenue = lead("Revenue", 1).over(windowSpec)

df_with_lag_lead = (
    df
    .withColumn("Prev_Revenue", prev_revenue)
    .withColumn("Next_Revenue", next_revenue)
)

In [0]:
# Difference to the previous row's revenue, expressed as a percentage of the
# CURRENT row's revenue.
# NOTE(review): a conventional period-over-period change divides by
# Prev_Revenue instead of Revenue - confirm which definition is intended.
revenue_gap_pct = ((col("Revenue") - col("Prev_Revenue")) / col("Revenue")) * lit(100)

# Rendered as text with a trailing '%'. Rows without a previous revenue have a
# null Prev_Revenue, so the concatenated value is null for them as well.
revenue_gap_text = concat(revenue_gap_pct.cast("string"), lit('%'))

df_with_lag_lead_wri = (
    df_with_lag_lead
    .withColumn("Revenue_Gap", revenue_gap_text)
    .withColumn("Flag", lit("Y"))  # constant marker column
)
df_with_lag_lead_wri.display()

In [0]:
import json
from datetime import datetime
from pyspark.dbutils import DBUtils
from pyspark.sql import SparkSession

def log_errors_to_blob(error_messages: dict, notebook_path: str, container_path: str, file_prefix: str = "schema_log"):
    """
    Write error messages plus notebook metadata to a JSON file via dbutils.fs.put.

    The file is named "<file_prefix>_<YYYYMMDD_HHMMSS>.json" (UTC) and is written
    under `container_path`. An existing file with the same name is overwritten,
    so two calls with the same prefix within the same second replace each other.

    Relies on the notebook-global `dbutils` object (not created here) both for
    the write and for reading the current user name and cluster id.

    Parameters:
    - error_messages: Dictionary of error keys with list of issues; must be
      JSON-serialisable
    - notebook_path: Full notebook path string, stored in the metadata as-is
    - container_path: Target directory (e.g., "/mnt/logs/schema/"); a trailing
      "/" is tolerated
    - file_prefix: Optional prefix for filename

    Returns:
    - None. The path of the written file is printed.
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import cell.
    from datetime import timezone

    # Take the timestamp once so the metadata and the file name always agree.
    now_utc = datetime.now(timezone.utc)

    # Notebook execution context, looked up once and reused for both fields.
    context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()

    metadata = {
        "notebook_path": notebook_path,
        "timestamp_utc": now_utc.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "user": context.userName().get(),
        "cluster_id": context.clusterId().get(),
    }

    payload = {
        "metadata": metadata,
        "errors": error_messages,
    }

    file_name = f"{file_prefix}_{now_utc.strftime('%Y%m%d_%H%M%S')}.json"
    # Strip trailing slashes so "/mnt/logs/schema/" does not yield ".../schema//file".
    full_path = f"{container_path.rstrip('/')}/{file_name}"

    dbutils.fs.put(full_path, json.dumps(payload, indent=2), overwrite=True)
    print(f"Error log written to: {full_path}")

In [0]:
from pyspark.sql import SparkSession, DataFrame

def validate_schema(
    df: DataFrame,
    table_name: str,
    strict: bool = True,
    log_container_path: str = "abfss://gold@rmpyru.dfs.core.windows.net/errorlog",
) -> bool:
    """
    Validates the column names of a DataFrame against those of a table.

    Only column NAMES are compared (exact string match); data types are not
    checked. Any mismatch that causes a False result is also written to a JSON
    error log through `log_errors_to_blob`, using `table_name` as file prefix.

    Relies on the notebook-global `spark` and `dbutils` objects.

    Parameters:
    - df: Incoming DataFrame to validate
    - table_name: Name of the table to compare against (read with spark.table)
    - strict: If True, exact match required. If False, the DataFrame may carry
      additional columns, as long as every table column is present
    - log_container_path: Directory the error log is written to

    Returns:
    - True if schema matches (based on strict mode), False otherwise.
      Also False when the table schema cannot be read.
    """
    try:
        expected_cols = {field.name for field in spark.table(table_name).schema.fields}
    except Exception as e:
        # Deliberately broad: any failure to read the table (missing table,
        # permissions, ...) is reported as "not valid" instead of raising.
        print(f"Failed to retrieve schema for table '{table_name}': {e}")
        return False

    actual_cols = set(df.columns)

    missing = expected_cols - actual_cols
    # Extra columns are only a violation in strict mode.
    extras = actual_cols - expected_cols if strict else set()

    schema_error_dict = {}
    if missing:
        schema_error_dict["Missing Columns"] = sorted(missing)
    if extras:
        schema_error_dict["Unexpected columns (strict mode)"] = sorted(extras)

    if schema_error_dict:
        # Write ONE log entry holding every finding. Separate writes would use
        # the same second-resolution file name and overwrite each other.
        log_errors_to_blob(
            error_messages=schema_error_dict,
            notebook_path=dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get(),
            container_path=log_container_path,
            file_prefix=table_name,
        )

    # strict:     no missing and no extra columns  (set equality)
    # non-strict: no missing columns               (table columns are a subset)
    return not missing and not extras

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col

# The target table name comes from a notebook widget (default: rro.sales_data).
dbutils.widgets.text("TableName", "rro.sales_data")
table_name = dbutils.widgets.get("TableName")

# A source row matches a target row when both Product and Month are equal.
merge_condition = """
source.Product = target.Product AND
source.Month = target.Month
"""

# Non-strict check: the processed DataFrame (df_with_lag_lead_wri) must carry
# every column of the target table.
schema_ok = validate_schema(df_with_lag_lead_wri, table_name, strict=False)

if not schema_ok:
    print("Schema mismatch detected. Investigate before writing!")
else:
    target_table = DeltaTable.forName(spark, table_name).alias("target")
    source_rows = df_with_lag_lead_wri.alias("source")

    # Upsert: update all columns of matching rows, insert rows without a match.
    (
        target_table
        .merge(source_rows, merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
# Directory holding the JSON error logs written by log_errors_to_blob.
# Deliberately NOT named `json`: that would rebind the notebook-global `json`
# module and make every later json.dumps call (e.g. in log_errors_to_blob)
# fail until the import cell is re-run.
error_log_dir = 'abfss://gold@rmpyru.dfs.core.windows.net/errorlog'

# The logs are pretty-printed (one JSON document spread over several lines),
# hence multiLine. The JSON reader infers the schema by itself; `header` and
# `inferSchema` are CSV options and have no effect here.
df_error = (
    spark.read.format('json')
    .option('multiLine', True)
    .load(f'{error_log_dir}/rro.sales_data_*.json')
)

# Flatten into tabular format
df_report = df_error.selectExpr(
    "metadata.notebook_path as NotebookPath",
    "metadata.timestamp_utc as TimestampUTC",
    "metadata.user as Username",
    "metadata.cluster_id as ClusterID",
    "errors as Errors"
)

df_report.display()


In [0]:
%sql
-- Maintenance statements, intentionally left commented out.
-- They change or delete table contents; run them manually and only on purpose.
-- ALTER TABLE rro.sales_data SET TBLPROPERTIES ('delta.columnMapping.mode' = 'name');
-- ALTER TABLE rro.sales_data DROP COLUMN Flag
-- ALTER TABLE rro.sales_data DROP COLUMN Flag1
-- DELETE FROM rro.sales_data WHERE 1=1

-- Show the current table contents, ordered by product and month.
SELECT *
FROM rro.sales_data
ORDER BY Product, Month