In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Create the Spark session for this notebook, or reuse one that is already
# running in this kernel (getOrCreate), with the master set to local[*].
spark = (
    SparkSession.builder
    .appName("Debugging Worker Errors")
    .master("local[*]")
    .getOrCreate()
)

In [35]:
# -- 1
# Load the employees JSON file into a DataFrame using an explicit schema.
data = './data/employees_1.json'

# Explicit schema: four flat columns, each declared nullable (third argument True).
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("department", StringType(), True)
    .add("salary", DoubleType(), True)
)

# Read the file with the "multiline" option enabled and the schema above applied.
# (Leaving out .schema(schema) would make Spark infer column types from the data.)
# NOTE(review): the output recorded under this cell lists `id` as long with the
# columns in alphabetical order, which looks like an inferred-schema run rather
# than this explicit schema -- re-run the cell to refresh it.
df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json(data)
)

# Print the column names/types, then the rows themselves.
df.printSchema()
df.show()


root
 |-- department: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+-----------+---+-------+---------+
| department| id|   name|   salary|
+-----------+---+-------+---------+
|Engineering|  1|  Alice| 120000.5|
|         HR|  2|    Bob|  90000.0|
|    Finance|  3|Charlie|110000.75|
+-----------+---+-------+---------+



In [39]:
# -- 2
# Same employee schema as before; every field is declared nullable
# (third argument True).
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("department", StringType(), True)
    .add("salary", DoubleType(), True)
)

# Read the JSON file with the explicit schema.
df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json("data/employees_1.json")
)

# Show only the rows where name is NOT null (rows with a null name are dropped).
df.where(col("name").isNotNull()).show()

+---+-------+----------+---------+
| id|   name|department|   salary|
+---+-------+----------+---------+
|  2|    Bob|        HR|  90000.0|
|  3|Charlie|   Finance|110000.75|
+---+-------+----------+---------+



In [40]:
# -- 3
# Schema for nested JSON: `details` is itself a struct holding `age` and `city`.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add(
        "details",
        StructType()
        .add("age", IntegerType(), True)
        .add("city", StringType(), True),
    )
)

# Read the nested JSON file with the explicit schema.
df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json("data/employees_2.json")
)

# Display the rows without truncating the `details` struct column.
df.show(truncate=False)

# Pull the nested `city` field out of `details` with dot notation;
# the resulting column is named just `city`.
df.select("id", "details.city").show()

+---+-------+-----------------+
|id |name   |details          |
+---+-------+-----------------+
|1  |Alice  |{30, New York}   |
|2  |Bob    |{40, Los Angeles}|
|3  |Charlie|{35, Chicago}    |
+---+-------+-----------------+

+---+-----------+
| id|       city|
+---+-----------+
|  1|   New York|
|  2|Los Angeles|
|  3|    Chicago|
+---+-----------+



In [47]:
# -- 4
from pyspark.sql.functions import explode
from pyspark.sql.types import ArrayType

# Schema: flat id/name columns, an array of project names, and a nested
# `details` struct with `age` and `city`.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("projects", ArrayType(StringType()), True)
    .add(
        "details",
        StructType()
        .add("age", IntegerType(), True)
        .add("city", StringType(), True),
    )
)

# Read the JSON file with the explicit schema.
df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json("data/employees_3.json")
)

# Produce one output row per element of `projects`, alongside the employee's
# id, name and the flattened age/city fields.
exploded_df = df.select(
    "id",
    "name",
    explode("projects"),  # no alias given, so the new column gets the default name `col`
    "details.age",
    "details.city",
)
exploded_df.show()

+---+-------+------------------+---+-----------+
| id|   name|               col|age|       city|
+---+-------+------------------+---+-----------+
|  1|  Alice|     Project Alpha| 30|   New York|
|  1|  Alice|      Project Beta| 30|   New York|
|  2|    Bob|Employee Relations| 40|Los Angeles|
|  2|    Bob| Recruitment Drive| 40|Los Angeles|
|  3|Charlie|  Quarterly Budget| 35|    Chicago|
|  3|Charlie|     Annual Report| 35|    Chicago|
+---+-------+------------------+---+-----------+

