<a href="https://colab.research.google.com/github/visshal2301/AdvanceSpark_GoogleColab/blob/main/Shubham_3_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
from pyspark.sql import SparkSession

In [41]:
spark = SparkSession.builder \
       .master("local[*]") \
       .appName("Colab") \
       .config('spark.ui.port', '4050') \
       .getOrCreate()
spark

In [42]:
anscombe_df = spark.read.format("json").load("/content/sample_data/anscombe.json")
anscombe_df.show(truncate=False)

+------+----+-----+---------------+
|Series|X   |Y    |_corrupt_record|
+------+----+-----+---------------+
|NULL  |NULL|NULL |[              |
|I     |10.0|8.04 |NULL           |
|I     |8.0 |6.95 |NULL           |
|I     |13.0|7.58 |NULL           |
|I     |9.0 |8.81 |NULL           |
|I     |11.0|8.33 |NULL           |
|I     |14.0|9.96 |NULL           |
|I     |6.0 |7.24 |NULL           |
|I     |4.0 |4.26 |NULL           |
|I     |12.0|10.84|NULL           |
|I     |7.0 |4.81 |NULL           |
|I     |5.0 |5.68 |NULL           |
|II    |10.0|9.14 |NULL           |
|II    |8.0 |8.14 |NULL           |
|II    |13.0|8.74 |NULL           |
|II    |9.0 |8.77 |NULL           |
|II    |11.0|9.26 |NULL           |
|II    |14.0|8.1  |NULL           |
|II    |6.0 |6.13 |NULL           |
|II    |4.0 |3.1  |NULL           |
+------+----+-----+---------------+
only showing top 20 rows


In [43]:
anscombe_df_cleaned = anscombe_df.filter(anscombe_df._corrupt_record.isNull())
anscombe_df_cleaned.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [44]:
anscombe_df_cleaned.show(truncate=False)

+------+----+-----+---------------+
|Series|X   |Y    |_corrupt_record|
+------+----+-----+---------------+
|I     |10.0|8.04 |NULL           |
|I     |8.0 |6.95 |NULL           |
|I     |13.0|7.58 |NULL           |
|I     |9.0 |8.81 |NULL           |
|I     |11.0|8.33 |NULL           |
|I     |14.0|9.96 |NULL           |
|I     |6.0 |7.24 |NULL           |
|I     |4.0 |4.26 |NULL           |
|I     |12.0|10.84|NULL           |
|I     |7.0 |4.81 |NULL           |
|I     |5.0 |5.68 |NULL           |
|II    |10.0|9.14 |NULL           |
|II    |8.0 |8.14 |NULL           |
|II    |13.0|8.74 |NULL           |
|II    |9.0 |8.77 |NULL           |
|II    |11.0|9.26 |NULL           |
|II    |14.0|8.1  |NULL           |
|II    |6.0 |6.13 |NULL           |
|II    |4.0 |3.1  |NULL           |
|II    |12.0|9.13 |NULL           |
+------+----+-----+---------------+
only showing top 20 rows


In [45]:
anscombe_df.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [46]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType



In [47]:
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("city", StringType(), True)
])


In [48]:
df = spark.createDataFrame([
    ('{"name":"Alice","age":30,"city":"Pune"}',),
    ('{"name":"Bob","age":25,"city":"Mumbai"}',),
    ('{"name":"Charlie","age":35,"city":"Delhi"}',)
], ["json_str"])


In [49]:
df.printSchema()

root
 |-- json_str: string (nullable = true)



In [50]:
df_parsed = df.withColumn("parsed", from_json("json_str", schema))



In [51]:
df_parsed.select("parsed.*").show()

+-------+---+------+
|   name|age|  city|
+-------+---+------+
|  Alice| 30|  Pune|
|    Bob| 25|Mumbai|
|Charlie| 35| Delhi|
+-------+---+------+



In [52]:
# Array List
from pyspark.sql.types import ArrayType

schema = ArrayType(StructType([
    StructField("id", IntegerType(), True),
    StructField("value", StringType(), True)
]))


In [53]:
df = spark.createDataFrame([
    ('[{"id":1,"value":"A"},{"id":2,"value":"B"}]',),
    ('[{"id":3,"value":"X"},{"id":4,"value":"Y"}]',)
], ["json_str"])


In [54]:
df_parsed = df.withColumn("parsed", from_json("json_str", schema))
df_parsed.show(truncate=False)


+-------------------------------------------+----------------+
|json_str                                   |parsed          |
+-------------------------------------------+----------------+
|[{"id":1,"value":"A"},{"id":2,"value":"B"}]|[{1, A}, {2, B}]|
|[{"id":3,"value":"X"},{"id":4,"value":"Y"}]|[{3, X}, {4, Y}]|
+-------------------------------------------+----------------+



In [55]:
# Explode array into rows
from pyspark.sql.functions import explode
df_exploded = df_parsed.withColumn("element", explode("parsed"))
df_exploded.show(truncate=False)

+-------------------------------------------+----------------+-------+
|json_str                                   |parsed          |element|
+-------------------------------------------+----------------+-------+
|[{"id":1,"value":"A"},{"id":2,"value":"B"}]|[{1, A}, {2, B}]|{1, A} |
|[{"id":1,"value":"A"},{"id":2,"value":"B"}]|[{1, A}, {2, B}]|{2, B} |
|[{"id":3,"value":"X"},{"id":4,"value":"Y"}]|[{3, X}, {4, Y}]|{3, X} |
|[{"id":3,"value":"X"},{"id":4,"value":"Y"}]|[{3, X}, {4, Y}]|{4, Y} |
+-------------------------------------------+----------------+-------+



In [56]:
# Select fields from struct
df_final = df_exploded.select("element.id", "element.value")
df_final.show()


+---+-----+
| id|value|
+---+-----+
|  1|    A|
|  2|    B|
|  3|    X|
|  4|    Y|
+---+-----+



In [57]:
#Spark UDF

import json
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Define schema for clarity
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("value", StringType(), True)
])


In [58]:
def parse_json_array(json_str):
    try:
        data = json.loads(json_str)
        return [(item.get("id"), item.get("value")) for item in data]
    except Exception:
        return None



In [59]:
parse_json_udf = udf(parse_json_array, ArrayType(StructType([StructField("id", IntegerType(), True), StructField("value", StringType(), True)])))

# Sample DataFrame
df = spark.createDataFrame([
    ('[{"id":1,"value":"A"},{"id":2,"value":"B"}]',),
    ('[{"id":3,"value":"X"},{"id":4,"value":"Y"}]',)
], ["json_str"])

# Apply UDF
df_parsed = df.withColumn("parsed", parse_json_udf("json_str"))
df_parsed.show(truncate=False)

+-------------------------------------------+----------------+
|json_str                                   |parsed          |
+-------------------------------------------+----------------+
|[{"id":1,"value":"A"},{"id":2,"value":"B"}]|[{1, A}, {2, B}]|
|[{"id":3,"value":"X"},{"id":4,"value":"Y"}]|[{3, X}, {4, Y}]|
+-------------------------------------------+----------------+



In [60]:
# Flatten array into rows
df_exploded = df_parsed.withColumn("element", explode("parsed"))

# Select struct fields
df_final = df_exploded.select("element.id", "element.value")
df_final.show()


+---+-----+
| id|value|
+---+-----+
|  1|    A|
|  2|    B|
|  3|    X|
|  4|    Y|
+---+-----+



In [61]:
spark.udf.register("parse_json_array", parse_json_array, ArrayType(StructType([StructField("id", IntegerType(), True), StructField("value", StringType(), True)])))

In [62]:
df = spark.createDataFrame([
    ('[{"id":1,"value":"A"},{"id":2,"value":"B"}]',),
    ('[{"id":3,"value":"X"},{"id":4,"value":"Y"}]',)
], ["json_str"])

df.createOrReplaceTempView("json_table")

In [63]:
spark.sql("""
SELECT parse_json_array(json_str) AS parsed
FROM json_table
""").show(truncate=False)

+----------------+
|parsed          |
+----------------+
|[{1, A}, {2, B}]|
|[{3, X}, {4, Y}]|
+----------------+



In [65]:
spark.sql("""
SELECT element.id, element.value
FROM (
    SELECT EXPLODE(parse_json_array(json_str)) AS element
    FROM json_table
)
""").show()

+---+-----+
| id|value|
+---+-----+
|  1|    A|
|  2|    B|
|  3|    X|
|  4|    Y|
+---+-----+

