In [10]:
# Build (or reuse) the SparkSession that drives the streaming job
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # Local mode master URL
    .master("local[*]")
    .appName("Streaming Process Files")
    # Ask Spark to stop streaming work gracefully when the app shuts down
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .getOrCreate()
)

# Last expression of the cell: render the session summary
spark

In [11]:
# Let the file source infer the schema instead of requiring an explicit one
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Options for the JSON file source, gathered in one place
source_options = {
    "cleanSource": "archive",            # archive each input file once processed
    "sourceArchiveDir": "archive_dir",   # destination for archived files
    "maxFilesPerTrigger": 1,             # pick up one file per micro-batch
}

# Streaming DataFrame over the device files in the input directory
streaming_df = (
    spark.readStream
    .format("json")
    .options(**source_options)
    .load("data/input/device_files/")
)

In [12]:
# To inspect the schema of the data, place a sample JSON file in the input
# directory. The commented-out show() below presumably requires switching
# readStream to read (batch mode) first, as the original note suggests.
streaming_df.printSchema()
#streaming_df.show(truncate=False)

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [13]:
# `data.devices` is an array of device readings: emit one row per reading,
# keeping every original column and adding the reading as `data_exp`
from pyspark.sql.functions import explode

explode_df = streaming_df.withColumn(
    "data_exp",
    explode(streaming_df["data"]["devices"]),
)

#explode_df.show()
explode_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- data_exp: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)



In [22]:
# Promote the fields of the exploded struct to top-level columns
from pyspark.sql.functions import col

# The nested `data` column is no longer needed once it has been exploded
flatten_df = explode_df.drop("data")

# Copy each device field out of `data_exp`, in the same order as before
for field_name in ("deviceId", "measure", "status", "temperature"):
    flatten_df = flatten_df.withColumn(field_name, col(f"data_exp.{field_name}"))

# Drop the helper struct now that its fields are flattened
flatten_df = flatten_df.drop("data_exp")




In [23]:
# Check the flattened schema: the nested `data` / `data_exp` columns are dropped
# and deviceId, measure, status and temperature are top-level columns.
#flatten_df.show()
flatten_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [None]:
# Write the flattened stream to a CSV file sink so the output can be checked

# Sink settings: output location and the checkpoint directory for the query
sink_options = {
    "path": "data/output/device_data.csv",
    "checkpointLocation": "checkpoint_dir",
}

device_query = (
    flatten_df.writeStream
    .outputMode("append")
    .format("csv")
    .options(**sink_options)
    .start()
)

# Block this cell until the streaming query terminates
device_query.awaitTermination()