In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

# Schema of one audit-table row. Every column is declared nullable.
metadata_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("batch_id", StringType()),
        ("timestamp", TimestampType()),
        ("source_path", StringType()),
        ("record_count", LongType()),
        ("delta_table_version", LongType()),
        ("status", StringType()),
        ("rollback_flag", StringType()),  # 'Y' or 'N'
    )
])

In [0]:
from pyspark.sql import SparkSession, Row
from datetime import datetime

def log_failure_to_audit(
    batch_id: str,
    source_path: str,
    record_count: int,
    delta_table_version: int = None
):
    """Append a FAILED row for ``batch_id`` to the audit Delta table.

    Args:
        batch_id: Identifier of the batch that failed.
        source_path: Location of the input the batch was reading.
        record_count: Number of source records counted for the batch.
        delta_table_version: Version of the target table recorded for the
            batch, or None when no version is known.

    The row is built with the explicit ``metadata_schema`` instead of relying
    on type inference: inference cannot determine a column type from a single
    row whose ``delta_table_version`` is None, and the explicit schema keeps
    the column types identical to the STARTED rows written elsewhere.
    """
    # Values are listed in metadata_schema field order.
    failure_record = (
        batch_id,
        datetime.now(),
        source_path,
        record_count,
        delta_table_version,
        "FAILED",
        "N",
    )

    metadata_df = spark.createDataFrame([failure_record], schema=metadata_schema)
    metadata_df.write.format("delta").mode("append").save("abfss://gold@rmpyru.dfs.core.windows.net/Audit")

In [0]:
def log_success_to_audit(
    batch_id: str
):
    """Mark every audit row of ``batch_id`` as SUCCESS.

    Sets ``status`` to 'SUCCESS', ``rollback_flag`` to 'N' and
    ``delta_table_version`` to the version currently reported by
    ``get_delta_version(target_path)``.

    Args:
        batch_id: Identifier of the batch whose audit rows are updated.

    The condition and the new values are passed as Column expressions rather
    than formatted SQL strings, so a quote inside ``batch_id`` cannot break
    the predicate and a missing version is stored as NULL instead of being
    parsed as the SQL expression ``None``.
    """
    from delta.tables import DeltaTable
    from pyspark.sql.functions import col, lit

    current_version = get_delta_version(target_path)

    # Load the audit Delta table
    audit_path = "abfss://gold@rmpyru.dfs.core.windows.net/Audit"
    delta_table = DeltaTable.forPath(spark, audit_path)

    # Update status to SUCCESS for the given batch_id
    delta_table.update(
        condition=col("batch_id") == lit(batch_id),
        set={
            "status": lit("SUCCESS"),
            "rollback_flag": lit("N"),
            # The cast gives a None version a concrete type (a typed NULL).
            "delta_table_version": lit(current_version).cast("long"),
        },
    )

In [0]:
# Container root; later cells build paths from this variable.
# NOTE(review): the name `json` would shadow the stdlib module if it were ever imported here.
json = 'abfss://gold@rmpyru.dfs.core.windows.net'

# Read the source file as multi-line JSON.
# NOTE(review): 'inferSchema' and 'header' look like CSV reader options and are
# presumably ignored by the JSON source -- confirm before removing them.
df_zone = (
    spark.read
    .format('json')
    .options(inferSchema=True, multiLine=True, header=True)
    .load(f'{json}/resturant_json_data.json')
)

In [0]:
import uuid
from datetime import datetime
from pyspark.sql.types import StructType
from delta.tables import DeltaTable
import os

# Run metadata: these values are written to the audit table by later cells.
batch_id = str(uuid.uuid4())  # random identifier for this run
timestamp = datetime.now()  # taken before the count below is executed
source_path = "abfss://gold@rmpyru.dfs.core.windows.net/resturant_json_data.json"
record_count = df_zone.count()  # number of rows in the loaded source DataFrame

# Target path: Delta table location that receives the final output
target_path = "abfss://gold@rmpyru.dfs.core.windows.net/zomato"

# Function to safely get Delta version
def get_delta_version(path):
    """Return the latest version number of the Delta table at ``path``.

    Args:
        path: Storage location of the Delta table.

    Returns:
        The ``version`` of the most recent commit, or None when there is no
        Delta table at ``path``, the table has no history, or the history
        could not be read. A message is printed in each of those cases.

    Only the newest history entry is requested, so the lookup does not scan
    the whole commit log. An unexpected error is reported with its details
    instead of being described as a missing table.
    """
    try:
        if not DeltaTable.isDeltaTable(spark, path):
            print(f"Delta table not found at {path}. Initializing...")
            return None
        # history() is ordered newest first, so a limit of 1 yields the latest commit.
        latest_commit = DeltaTable.forPath(spark, path).history(1).collect()
    except Exception as e:
        # Stay best-effort (no version available) but keep the real cause visible.
        print(f"Could not read Delta history at {path}: {e!r}")
        return None

    if not latest_commit:
        print("Delta table exists but has no version history.")
        return None
    return latest_commit[0]['version']

# Version of the target table as read before this run writes to it.
# None when get_delta_version could not find or read the table; nothing is created here.
current_version = get_delta_version(target_path)


In [0]:
# Append a STARTED row for this batch to the audit Delta table.
# The tuple follows the field order of metadata_schema.
metadata_df = spark.createDataFrame(
    [(batch_id, timestamp, source_path, record_count, current_version, "STARTED", "N")],
    schema=metadata_schema,
)
(
    metadata_df.write
    .format("delta")
    .mode("append")
    .save("abfss://gold@rmpyru.dfs.core.windows.net/Audit")
)


In [0]:
# Count Columbus restaurants per rating text.
# The result is not assigned to a name; it is only the value of this cell.
(
    df_zone
    # One row per element of the `restaurants` array.
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("restaurant name", col("restaurants.restaurant.name"))
    .withColumn("cuisines", col("restaurants.restaurant.cuisines"))
    .withColumn("ratings", col("restaurants.restaurant.user_rating.rating_text"))
    .withColumn("city", col("restaurants.restaurant.location.city"))
    # explode_outer keeps restaurants whose establishment_types is empty or null.
    .withColumn("establishment_types", explode_outer(col("restaurants.restaurant.establishment_types")))
    .drop("code", "message", "results_found", "results_shown", "results_start", "status")
    .filter(col("city") == "Columbus")
    .groupBy("ratings")
    .count()
    .alias("restaurant_ratings")
)
                                
                                

In [0]:
# Restaurant dimension: id, name and city, one row per element of `restaurants`.
df_restaurant = (
    df_zone
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("restaurant name", col("restaurants.restaurant.name"))
    .withColumn("city", col("restaurants.restaurant.location.city"))
    # Remove the response-level fields and the exploded struct itself.
    .drop("code", "message", "results_found", "results_shown", "results_start", "status", "restaurants")
)

In [0]:
# Rating text per restaurant id, one row per element of `restaurants`.
df_restaurant_rating = (
    df_zone
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("ratings", col("restaurants.restaurant.user_rating.rating_text"))
    # Remove the response-level fields and the exploded struct itself.
    .drop("code", "message", "results_found", "results_shown", "results_start", "status", "restaurants")
)

In [0]:
# Cuisines per restaurant id, one row per element of `restaurants`.
df_restaurant_cuisines = (
    df_zone
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("cuisines", col("restaurants.restaurant.cuisines"))
    # Remove the response-level fields and the exploded struct itself.
    .drop("code", "message", "results_found", "results_shown", "results_start", "status", "restaurants")
)

In [0]:
#df_restaurant.display()
#df_restaurant_rating.cache()
#df_restaurant_cuisines.display()

# Join the three restaurant views on "restaurant id" and count rows per rating.
df_final = (
    df_restaurant
    .join(
        broadcast(df_restaurant_rating),
        df_restaurant["restaurant id"] == df_restaurant_rating["restaurant id"],
        how="left",
    )
    .join(
        df_restaurant_cuisines,
        df_restaurant["restaurant id"] == df_restaurant_cuisines["restaurant id"],
        how="inner",
    )
    # NOTE(review): keeps only rows whose cuisines is the empty string -- confirm
    # this is intended rather than `!= ""`.
    .filter((col("restaurant name") != "") & (col("cuisines") == ""))
    .select(df_restaurant["restaurant id"], "restaurant name", "ratings", "cuisines")
    .groupBy("ratings")
    .count()
)


In [0]:
#df_final = df_zone

In [0]:
#df_final.partitionBy("ratings")
#df_final.write.mode("overwrite").format("delta").save(f'{json}/zomato')
# Overwrite the target Delta table with the aggregated result and record the
# outcome of the batch in the audit table.
try:
    df_final.write.partitionBy("ratings").mode("overwrite").format("delta").save(target_path)
    log_success_to_audit(batch_id=batch_id)
except Exception as e:
    # Report the cause; the notebook keeps running so the rollback cell can act on it.
    print(f"Batch {batch_id} failed: {e!r}")
    log_failure_to_audit(
        batch_id=batch_id,
        source_path=source_path,
        # Reuse the count taken at batch start: re-running df_zone.count() inside
        # the handler starts another Spark job that can fail for the same reason
        # and would prevent the FAILED row from being written.
        record_count=record_count,
        delta_table_version=current_version,
    )



In [0]:
def rollback_batch(batch_id: str):
    """Restore the zomato Delta table to the version recorded for ``batch_id``.

    Reads the most recent audit row of the batch, rewrites the table with the
    snapshot at that row's ``delta_table_version``, then appends a ROLLEDBACK
    row (``rollback_flag`` 'Y') to the audit table.

    Args:
        batch_id: Identifier of the batch to roll back.

    Raises:
        ValueError: If the audit table has no row for ``batch_id`` or the
            recorded version is NULL, in which case there is no snapshot to
            restore.
    """
    from datetime import datetime
    from pyspark.sql.functions import col

    audit_path = "abfss://gold@rmpyru.dfs.core.windows.net/Audit"
    table_path = "abfss://gold@rmpyru.dfs.core.windows.net/zomato"

    audit_df = spark.read.format("delta").load(audit_path)
    latest_rows = (
        audit_df.filter(col("batch_id") == batch_id)
        .orderBy("timestamp", ascending=False)
        .limit(1)
        .collect()
    )
    if not latest_rows:
        raise ValueError(f"No audit record found for batch {batch_id}")
    rollback_info = latest_rows[0]

    rollback_version = rollback_info["delta_table_version"]
    if rollback_version is None:
        raise ValueError(f"Batch {batch_id} has no recorded table version to roll back to")

    restored_df = spark.read.format("delta").option("versionAsOf", rollback_version).load(table_path)
    restored_df.write.format("delta").mode("overwrite").save(table_path)

    # Version produced by the overwrite above.
    current_version = get_delta_version(table_path)

    # Describe the batch that was rolled back: its source and record count come
    # from its own audit row, not from the notebook's current-run globals, and
    # the timestamp is the moment of the rollback.
    updated_metadata_df = spark.createDataFrame([(
        batch_id,
        datetime.now(),
        rollback_info["source_path"],
        rollback_info["record_count"],
        current_version,
        "ROLLEDBACK",
        "Y"
    )], schema=metadata_schema)

    updated_metadata_df.write.format("delta").mode("append").save(audit_path)

In [0]:
audit_df = spark.read.format("delta").load("abfss://gold@rmpyru.dfs.core.windows.net/Audit")

# Batches that already have a ROLLEDBACK row. Their FAILED row stays in the
# audit table, so without this exclusion every re-run of the cell would roll
# the same batch back again.
rolled_back_batches = audit_df.filter("status = 'ROLLEDBACK'").select("batch_id").distinct()

# Most recent FAILED batch that has not been rolled back yet. take(1) returns
# an empty list when there is none, so no separate count() job is needed.
# NOTE(review): rolling back an older failed batch after later successful
# writes would discard those writes -- confirm whether this should be limited
# to the current batch_id.
pending_failures = (
    audit_df.filter("status = 'FAILED'")
            .join(rolled_back_batches, on="batch_id", how="left_anti")
            .orderBy("timestamp", ascending=False)
            .select("batch_id")
            .take(1)
)
latest_failed_batch = pending_failures[0]["batch_id"] if pending_failures else None

if latest_failed_batch is not None:
    rollback_batch(latest_failed_batch)


In [0]:
%sql
-- Show the rows of the Delta table stored at the zomato path.
SELECT * FROM delta.`abfss://gold@rmpyru.dfs.core.windows.net/zomato`

In [0]:
%sql
-- Show the rows of the Delta table stored at the Audit path.
SELECT * FROM delta.`abfss://gold@rmpyru.dfs.core.windows.net/Audit`