In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

# Schema of one row in the batch audit log. Every column is declared nullable.
metadata_schema = (
    StructType()
    .add("batch_id", StringType(), True)
    .add("timestamp", TimestampType(), True)
    .add("source_path", StringType(), True)
    .add("record_count", LongType(), True)
    .add("delta_table_version", LongType(), True)
    .add("status", StringType(), True)
    .add("rollback_flag", StringType(), True)  # 'Y' or 'N'
)

In [0]:
# Root URI of the "gold" storage container. The variable keeps the name `json`
# because the Delta write further down interpolates it (f'{json}/zomato');
# be aware that it would shadow the stdlib `json` module if that were imported.
json = 'abfss://gold@rmpyru.dfs.core.windows.net'

# Load the raw restaurant JSON file. `multiLine` allows a single JSON document
# to span several lines. No `inferSchema` / `header` options are set: those
# belong to the CSV reader and have no effect on the JSON source, which infers
# the schema on its own when none is supplied.
df_zone = (
    spark.read.format('json')
    .option('multiLine', True)
    .load(f'{json}/resturant_json_data.json')
)
df_zone.display()

In [0]:
import uuid
from datetime import datetime

from delta.tables import DeltaTable

# Identify this run and record when it started.
# datetime.now() is the driver's local time and carries no timezone info.
batch_id = str(uuid.uuid4())
timestamp = datetime.now()
source_path = "abfss://gold@rmpyru.dfs.core.windows.net/resturant_json_data.json"

# Row count of the frame loaded from the source file. That frame is named
# `df_zone` in this notebook; a bare `df` is never defined and would raise
# NameError on a fresh kernel.
record_count = df_zone.count()

# Version of the target Delta table before this run writes to it.
# history(1) limits the lookup to the most recent commit instead of fetching
# the table's full history just to read its first row.
# NOTE(review): DeltaTable.forPath requires the table to exist already —
# confirm how the very first run is expected to behave.
delta_table = DeltaTable.forPath(spark, "abfss://gold@rmpyru.dfs.core.windows.net/zomato")
current_version = delta_table.history(1).head(1)[0]['version']

In [0]:
# Audit row marking the start of this batch. Values are listed in the column
# order of `metadata_schema`.
metadata_df = spark.createDataFrame(
    data=[(
        batch_id,
        timestamp,
        source_path,
        record_count,
        current_version,
        "STARTED",
        "N",
    )],
    schema=metadata_schema,
)

# Append the row to the audit Delta table.
# NOTE(review): this path is a sub-directory of the zomato Delta table's own
# location — confirm that nesting one table inside the other is intended.
(
    metadata_df.write
    .mode("append")
    .format("delta")
    .save("abfss://gold@rmpyru.dfs.core.windows.net/zomato/Audit")
)

In [0]:
# Exploratory view: row count per rating text for restaurants whose city is
# "Columbus".
# NOTE(review): explode_outer on establishment_types emits one row per array
# element, so a restaurant with several types would be counted more than once —
# confirm whether that is intended.
(
    df_zone
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("restaurant name", col("restaurants.restaurant.name"))
    .withColumn("cuisines", col("restaurants.restaurant.cuisines"))
    .withColumn("ratings", col("restaurants.restaurant.user_rating.rating_text"))
    .withColumn("city", col("restaurants.restaurant.location.city"))
    .withColumn(
        "establishment_types",
        explode_outer(col("restaurants.restaurant.establishment_types")),
    )
    .drop("code", "message", "results_found", "results_shown", "results_start", "status")
    .where(col("city") == "Columbus")
    .groupBy("ratings")
    .count()
    .alias("restaurant_ratings")
    .display()
)
                                
                                

In [0]:
# One row per element of the `restaurants` array, carrying the restaurant's
# id, name and city.
df_restaurant = (
    df_zone
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("restaurant name", col("restaurants.restaurant.name"))
    .withColumn("city", col("restaurants.restaurant.location.city"))
    # Remove the listed top-level columns and the exploded struct itself.
    .drop("code", "message", "results_found", "results_shown", "results_start", "status", "restaurants")
)

In [0]:
# One row per element of the `restaurants` array, carrying the restaurant's
# id and its rating text.
df_restaurant_rating = (
    df_zone
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("ratings", col("restaurants.restaurant.user_rating.rating_text"))
    # Remove the listed top-level columns and the exploded struct itself.
    .drop("code", "message", "results_found", "results_shown", "results_start", "status", "restaurants")
)

In [0]:
# One row per element of the `restaurants` array, carrying the restaurant's
# id and its cuisines value.
df_restaurant_cuisines = (
    df_zone
    .withColumn("restaurants", explode("restaurants"))
    .withColumn("restaurant id", col("restaurants.restaurant.id"))
    .withColumn("cuisines", col("restaurants.restaurant.cuisines"))
    # Remove the listed top-level columns and the exploded struct itself.
    .drop("code", "message", "results_found", "results_shown", "results_start", "status", "restaurants")
)

In [0]:
# Combine the three per-restaurant frames on "restaurant id":
#   * ratings are attached with a left join (the rating side is broadcast),
#   * cuisines are attached with an inner join.
# Rows are then restricted to a non-empty restaurant name and an empty
# cuisines string, and the surviving rows are counted per rating text.
# NOTE(review): the filter keeps rows where cuisines == "" — confirm that
# selecting restaurants *without* cuisines is the intent.
df_final = (
    df_restaurant
    .join(
        broadcast(df_restaurant_rating),
        on=df_restaurant["restaurant id"] == df_restaurant_rating["restaurant id"],
        how="left",
    )
    .join(
        df_restaurant_cuisines,
        on=df_restaurant["restaurant id"] == df_restaurant_cuisines["restaurant id"],
        how="inner",
    )
    .where((col("restaurant name") != "") & (col("cuisines") == ""))
    .select(df_restaurant["restaurant id"], "restaurant name", "ratings", "cuisines")
    .groupBy("ratings")
    .count()
)


In [0]:
# Persist the per-rating counts as a Delta table partitioned by `ratings`,
# overwriting the table's current contents.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("ratings")
    .save(f'{json}/zomato')
)

In [0]:
%sql
-- Rows of the zomato Delta table whose rating text is 'Good'.
SELECT *
FROM delta.`abfss://gold@rmpyru.dfs.core.windows.net/zomato`
WHERE ratings = 'Good'

In [0]:
# Audit row marking completion of this batch. It reuses the batch id,
# timestamp, source path and record count captured at the start of the run.
updated_metadata_df = spark.createDataFrame(
    data=[(
        batch_id,
        timestamp,
        source_path,
        record_count,
        # NOTE(review): the version is derived, not read back from the table;
        # it assumes the write produced exactly one new commit — confirm.
        current_version + 1,  # new version after write
        "SUCCESS",
        "N",
    )],
    schema=metadata_schema,
)

# Append the row to the audit Delta table.
(
    updated_metadata_df.write
    .mode("append")
    .format("delta")
    .save("abfss://gold@rmpyru.dfs.core.windows.net/zomato/Audit")
)