In [0]:
# Load the restaurant JSON export into a DataFrame.
# "multiline" lets a single JSON record span several lines of the file.
# NOTE(review): "inferSchema" is documented as a CSV reader option; JSON reads
# infer the schema on their own -- confirm whether this option is needed.
resturant_json_data = (
    spark.read
    .format("json")
    .options(multiline="true", inferSchema="true")
    .load("/FileStore/tables/resturant_json_data.json")
)

In [0]:
# Preview the first 20 rows, with long cell values truncated (the show() defaults).
resturant_json_data.show(n=20, truncate=True)

+----+-------+--------------------+-------------+-------------+-------------+------+
|code|message|         restaurants|results_found|results_shown|results_start|status|
+----+-------+--------------------+-------------+-------------+-------------+------+
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|[{{{17066603}, b9...|         6835|           20|            1|  null|
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|[{{{17093124}, b9...|         8680|           20|            1|  null|
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|[{{{17580142}, b9...|          943|           20|            1|  null|
|null|   null|                  []|            0|            0|  

In [0]:
# Print the inferred schema as a tree; `restaurants` is an array of structs,
# which is why the following cells use explode() and dot notation.
resturant_json_data.printSchema()

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- restaurants: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- restaurant: struct (nullable = true)
 |    |    |    |-- R: struct (nullable = true)
 |    |    |    |    |-- res_id: long (nullable = true)
 |    |    |    |-- apikey: string (nullable = true)
 |    |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |    |-- cuisines: string (nullable = true)
 |    |    |    |-- currency: string (nullable = true)
 |    |    |    |-- deeplink: string (nullable = true)
 |    |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- events_url: string (nullable = true)
 |    |    |    |-- featured_image: string (nullable = true)
 |    |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |    |-- has_table_booking: long (nullable = true)
 |    |    |    |-- i

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# `restaurants` is an array column: explode() produces one output row per
# array element, exposed here as the struct column `new_restaurants`.
(
    resturant_json_data
    .select("*", explode("restaurants").alias("new_restaurants"))
    .drop("restaurants")
    .printSchema()
)

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- results_found: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- new_restaurants: struct (nullable = true)
 |    |-- restaurant: struct (nullable = true)
 |    |    |-- R: struct (nullable = true)
 |    |    |    |-- res_id: long (nullable = true)
 |    |    |-- apikey: string (nullable = true)
 |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |-- cuisines: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- deeplink: string (nullable = true)
 |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- events_url: string (nullable = true)
 |    |    |-- featured_image: string (nullable = true)
 |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |-- has_tab

In [0]:
# Fields nested inside structs are reached with dot notation:
# new_restaurants -> restaurant -> R -> res_id.
(
    resturant_json_data
    .select("*", explode("restaurants").alias("new_restaurants"))
    .drop("restaurants")
    .select("new_restaurants.restaurant.R.res_id")
    .show()
)

+--------+
|  res_id|
+--------+
|17066603|
|17059541|
|17064405|
|17057797|
|17057591|
|17064266|
|17060516|
|17060320|
|17059060|
|17059012|
|17060869|
|17061231|
|17058534|
|17057925|
|17064031|
|17061237|
|17061253|
|17061296|
|17061205|
|17057397|
+--------+
only showing top 20 rows



In [0]:
# Keep every existing column and append the nested res_id as a top-level
# column, then inspect the resulting schema.
(
    resturant_json_data
    .select("*", explode("restaurants").alias("new_restaurants"))
    .drop("restaurants")
    .select("*", "new_restaurants.restaurant.R.res_id")
    .printSchema()
)

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- results_found: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- new_restaurants: struct (nullable = true)
 |    |-- restaurant: struct (nullable = true)
 |    |    |-- R: struct (nullable = true)
 |    |    |    |-- res_id: long (nullable = true)
 |    |    |-- apikey: string (nullable = true)
 |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |-- cuisines: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- deeplink: string (nullable = true)
 |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- events_url: string (nullable = true)
 |    |    |-- featured_image: string (nullable = true)
 |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |-- has_tab

If the schema shows a struct, use dot notation (e.g. `new_restaurants.restaurant.R.res_id`) to reach its fields.

If the schema shows an array, use `explode` to turn each array element into its own row.

If the array can be null or empty, use `explode_outer`: plain `explode` drops those rows, while `explode_outer` keeps them with a null value.

In [0]:
# Second-level flattening: after exploding `restaurants`, also explode the
# nested `establishment_types` array and pull out res_id and name.
(
    resturant_json_data
    .select("*", explode("restaurants").alias("new_restaurants"))
    .drop("restaurants")
    .select(
        "*",
        "new_restaurants.restaurant.R.res_id",
        explode("new_restaurants.restaurant.establishment_types").alias("establishment_types_new"),
        "new_restaurants.restaurant.name",
    )
    .drop("new_restaurants")
    .printSchema()
)

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- results_found: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- res_id: long (nullable = true)
 |-- establishment_types_new: string (nullable = true)
 |-- name: string (nullable = true)



In [0]:
# Same flattening as the schema check, but displaying the rows.
# explode() emits no row when the array is null or empty, so this returns an
# empty result for this data; explode_outer() keeps such rows with a null value.
(
    resturant_json_data
    .select("*", explode("restaurants").alias("new_restaurants"))
    .drop("restaurants")
    .select(
        "*",
        "new_restaurants.restaurant.R.res_id",
        explode("new_restaurants.restaurant.establishment_types").alias("establishment_types_new"),
        "new_restaurants.restaurant.name",
    )
    .drop("new_restaurants")
    .show()
)

+----+-------+-------------+-------------+-------------+------+------+-----------------------+----+
|code|message|results_found|results_shown|results_start|status|res_id|establishment_types_new|name|
+----+-------+-------------+-------------+-------------+------+------+-----------------------+----+
+----+-------+-------------+-------------+-------------+------+------+-----------------------+----+



In [0]:
# Flatten to one row per restaurant and show only the columns of interest.
#
# explode_outer() is used for `establishment_types` because explode() emits no
# row when the array is null or empty; explode_outer() keeps the row and
# yields null for the exploded value instead.
#
# NOTE(review): the previous version removed unwanted columns with a drop()
# list that included "result_shown". No such column exists (the real one is
# "results_shown") and drop() silently ignores unknown names, so that entry
# did nothing. The wanted columns are now selected explicitly, keeping
# `results_shown` so the result matches the recorded output. Remove it from
# the select below if the intent was to drop it.
(
    resturant_json_data
    .select("*", explode("restaurants").alias("new_restaurants"))
    .drop("restaurants")
    .select(
        "results_shown",
        "new_restaurants.restaurant.R.res_id",
        explode_outer("new_restaurants.restaurant.establishment_types").alias("establishment_types_new"),
        "new_restaurants.restaurant.name",
    )
    .show(truncate=False)
)

+-------------+--------+-----------------------+------------------------------------+
|results_shown|res_id  |establishment_types_new|name                                |
+-------------+--------+-----------------------+------------------------------------+
|20           |17066603|null                   |The Coop                            |
|20           |17059541|null                   |Maggiano's Little Italy             |
|20           |17064405|null                   |Tako Cheena by Pom Pom              |
|20           |17057797|null                   |Bosphorous Turkish Cuisine          |
|20           |17057591|null                   |Bahama Breeze Island Grille         |
|20           |17064266|null                   |Hawkers Asian Street Fare           |
|20           |17060516|null                   |Seasons 52 Fresh Grill              |
|20           |17060320|null                   |Raglan Road Irish Pub and Restaurant|
|20           |17059060|null                   |Hillst