# Flatten_Zomato_Restaurants_data_JSON

In [2]:
from pyspark.sql import SparkSession,DataFrame

# Obtain the notebook's SparkSession: getOrCreate() hands back the session
# that is already active, or builds a new one when none exists.
spark: SparkSession = (
    SparkSession
    .builder
    .getOrCreate()
)

## 🍽️ Scenario:

<p>
You are building a feature to analyze restaurant popularity and delivery availability by location.

The marketing team wants to identify:

1. Top-rated restaurants (by user_rating.aggregate_rating) in cities like Delhi, Mumbai, and Bangalore.
2. Which areas (localities) have restaurants that are delivering right now.
3. A breakdown of average cost for two, grouped by city and cuisine.
4. A filterable table that shows:

    - Restaurant Name
    - City
    - Cuisine
    - Delivery Status (is_delivering_now)
    - Aggregate Rating
    - Average Cost
    - Votes
    - Menu URL

To achieve this, you'll need to flatten multiple nested fields (e.g., `restaurant.location.city`, `restaurant.user_rating.aggregate_rating`), explode arrays such as `restaurants`, and handle nested structs cleanly.
</p>

In [57]:
# imports
import os
from pyspark.sql.functions import explode,explode_outer,collect_list,col,split,avg,trim

# Locate the dataset relative to the directory the notebook kernel runs in.
# os.path.join inserts exactly one platform-appropriate separator between
# components, so the path never ends up with doubled or mixed separators.
pwd = os.getcwd()
filepath = os.path.join(pwd, 'datasets', 'zomato_restaurants_data.json')
print(filepath)

c:\Users\vikas\OneDrive\Desktop\Projects\code-odyssey\pyspark//datasets//zomato_restaurants_data.json


In [12]:
# Read the JSON dataset located at `filepath`
# (./pyspark/datasets/zomato_restaurants_data.json).
# - `multiLine` lets Spark parse a JSON document that spans several lines
#   instead of expecting one record per line.
# - No `inferSchema` option is set: that is a CSV-reader setting. The JSON
#   source infers the schema on its own whenever no schema is supplied.
restaurants_df = (
    spark.read
    .format('json')
    .option('multiLine', True)
    .load(filepath)
)

In [58]:
# Remove the top-level `code` and `message` columns, then replace the
# `restaurants` array column with one row per array element (the exploded
# column keeps the name `restaurants`).
restaurant_flat = (
    restaurants_df
    .drop('code', 'message')
    .withColumn('restaurants', explode(col('restaurants')))
)

In [76]:
# Project the nested restaurant struct into flat, report-friendly columns.
# The row filter is applied on the nested flag before projecting, so it does
# not depend on a column that the select() no longer exposes.
# NOTE(review): the filter keeps rows where is_delivering_now == 0, i.e.
# restaurants that are NOT delivering; the scenario asks about restaurants
# that are delivering now — confirm this is intended.
df2 = (
    restaurant_flat
    .filter(col('restaurants.restaurant.is_delivering_now') == 0)
    .select(
        col('restaurants.restaurant.name').alias('Restaurant_Name'),
        col('restaurants.restaurant.location.city').alias('City'),
        # `cuisines` is split on ',' into an array of cuisine names.
        split(col('restaurants.restaurant.cuisines'), ',').alias('Cuisines'),
        col('restaurants.restaurant.is_delivering_now').alias('Delivery_Status'),
        col('restaurants.restaurant.user_rating.aggregate_rating').alias('Aggregate_Rating'),
        col('restaurants.restaurant.average_cost_for_two').alias('Average_Cost'),
        col('restaurants.restaurant.user_rating.votes').alias('Votes'),
        col('restaurants.restaurant.menu_url').alias('Menu_URL'),
    )
)

In [77]:
# One output row per (restaurant, cuisine): explode the `Cuisines` array into
# a scalar `Cuisine` column, drop the array, then trim the whitespace left
# around each name by the ',' split.
# NOTE(review): this rebinds `df2` and removes `Cuisines`, so the cell cannot
# be re-run without first re-running the cell that builds `df2`.
df2 = (
    df2
    .withColumn("Cuisine", explode(col("Cuisines")))
    .drop('Cuisines')
    .withColumn("Cuisine", trim(col('Cuisine')))
)

In [78]:
# Preview the flattened table: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out explicitly).
df2.show(n=20, truncate=True)

+--------------------+-------+---------------+----------------+------------+-----+--------------------+--------------+
|     Restaurant_Name|   City|Delivery_Status|Aggregate_Rating|Average_Cost|Votes|            Menu_URL|       Cuisine|
+--------------------+-------+---------------+----------------+------------+-----+--------------------+--------------+
|            The Coop|Orlando|              0|             3.6|          25|  432|https://www.zomat...|      Southern|
|            The Coop|Orlando|              0|             3.6|          25|  432|https://www.zomat...|         Cajun|
|            The Coop|Orlando|              0|             3.6|          25|  432|https://www.zomat...|     Soul Food|
|Maggiano's Little...|Orlando|              0|             4.4|          50|  886|https://www.zomat...|       Italian|
|Tako Cheena by Po...|Orlando|              0|             4.4|          10|  570|https://www.zomat...|         Asian|
|Tako Cheena by Po...|Orlando|              0|  

In [79]:
# Stop the Spark session used by this notebook now that the analysis is done.
spark.stop()