In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession that runs locally on all available cores.
session_builder = SparkSession.builder.master("local[*]").appName("read_from json")

# getOrCreate() returns an already-running session if one exists in this
# kernel, so re-running the cell does not start a second session.
spark = session_builder.config(
    "spark.streaming.stopGracefullyOnShutdown", True
).getOrCreate()

In [2]:
# Bare last expression: renders the SparkSession created above as the cell output.
spark

In [3]:
_schema = "customerId string, data struct<devices array<struct<deviceId string,measure string,temperature string>>>,  eventId string, eventOffset long, eventPublisher string, eventTime string"

In [4]:
### Batch processing logic (kept for reference, disabled)
# An explicit schema is passed with .schema(...); "schema" is not a reader option.
# file_data_df = spark.read.format("json").schema(_schema).load("inputs/file1.json")

### Streaming logic and configuration

# Let the file-based stream source infer the JSON schema instead of
# requiring an explicit one.
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Stream the JSON files found under ./inputs, one file per trigger.
# Files that have been processed are moved to archive_dir (cleanSource=archive).
streaming_df = (
    spark.readStream
    .format("json")
    .options(
        cleanSource="archive",
        sourceArchiveDir="archive_dir",
        maxFilesPerTrigger=1,
    )
    .load("./inputs")
)

In [5]:
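# Note: `spark.sql.streaming.schemaInference` is enabled in the previous cell so that
# the JSON file stream can infer its schema automatically while reading.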


In [6]:
# Print the schema of the streaming DataFrame. No explicit schema is passed to
# the reader, so this is what Spark inferred from the JSON files under ./inputs.
streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [7]:
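# Batch-only check (disabled): `file_data_df` exists only if the commented-out batch read above is enabled.
# file_data_df.show(truncate=False)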

In [8]:
from pyspark.sql.functions import explode, from_json, col

# Produce one output row per element of the nested `data.devices` array.
# The exploded struct is stored in a new top-level `devices` column; the
# original `data` column is still present at this stage.
devices_array = col("data.devices")
exploded_devices_df = streaming_df.withColumn("devices", explode(devices_array))

In [9]:
# Disabled: show() is a batch action and is not supported on a streaming DataFrame.
# exploded_devices_df.show()

In [10]:
# Schema after the explode: the original columns plus a top-level `devices`
# struct column holding a single device entry per row.
exploded_devices_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- devices: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)



In [11]:
# Flatten the exploded `devices` struct into top-level columns.
# The nested `data` column is dropped first, each device field is promoted
# with the same name it has inside the struct, and the struct itself is
# dropped at the end.
flattened_df = exploded_devices_df.drop("data")
for device_field in ("deviceId", "measure", "status", "temperature"):
    flattened_df = flattened_df.withColumn(device_field, col(f"devices.{device_field}"))

final_df = flattened_df.drop("devices")

In [12]:
# Schema of the flattened result: event-level columns followed by the
# promoted device fields; `data` and `devices` are no longer present.
final_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [13]:
# Disabled: show() is a batch action and is not supported on a streaming DataFrame.
# final_df.show()

In [None]:
# Write the flattened stream to a CSV file sink (not the console sink).
#
# - "path" is the output location of the file sink; Spark treats it as a
#   directory and writes part files into it, so "output/device_data.csv"
#   ends up being a folder rather than a single CSV file.
# - "checkpointLocation" stores the query's progress so that a restart
#   resumes where the previous run stopped.
# - The file sink is used in "append" output mode.
#
# The StreamingQuery handle is kept in a variable so the query can be
# inspected or stopped (device_csv_query.stop()) after interrupting the cell.
device_csv_query = (
    final_df
    .writeStream
    .format("csv")
    .outputMode("append")
    .option("path", "output/device_data.csv")
    .option("checkpointLocation", "checkpoint_dir")
    .start()
)

# Blocks this cell until the query is stopped or fails.
device_csv_query.awaitTermination()