In [0]:
from pyspark.sql.types import *

orders_schema = StructType([
    StructField("order_id", LongType(), True),
    StructField("customer_id", LongType(), True),
    StructField("customer_fname", StringType(), True),
    StructField("customer_lname", StringType(), True),
    StructField("customer_city", StringType(), True),
    StructField("customer_state", StringType(), True),
    StructField("customer_pincode", LongType(), True),
    StructField("order_items", ArrayType(StructType([
        StructField("order_item_id", LongType(), True),
        StructField("order_item_product_id", LongType(), True),
        StructField("order_item_product_price", DecimalType(10, 2), True),
        StructField("order_item_quantity", LongType(), True),
        StructField("order_item_subtotal", DecimalType(10, 2), True)
    ])), True)
])


Creating a directory

In [0]:
dbutils.fs.mkdirs("dbfs:/FileStore/processed/completemode/")

Out[12]: True

In [0]:
orders_df = spark \
    .readStream \
        .format("json") \
            .schema(orders_schema) \
                .option("path","dbfs:/FileStore/merge.json") \
                    .load()

In [0]:
from pyspark.sql.types import *

schema = StructType([
    StructField("customer_id", LongType(), True),
    StructField("orders_placed", LongType(), False),
    StructField("product_purchased", LongType(), False),
    StructField("amount_spent", DecimalType(20, 2), True)
])


In [0]:
aggregated_orders = spark \
    .readStream \
        .format("csv") \
            .schema(schema) \
                .option("path","dbfs:/FileStore/processed/aggregated_orders.csv") \
                    .load()





In [0]:
streaming_query = aggregated_orders \
    .writeStream \
        .format("delta") \
            .outputMode("append") \
                    .option("checkPointLocation","checkpointdir103") \
                        .toTable("orders_result103")

In [0]:
spark.sql("select * from orders_result103").show()

+-----------+-------------+-----------------+------------+
|customer_id|orders_placed|product_purchased|amount_spent|
+-----------+-------------+-----------------+------------+
|       null|         null|             null|        null|
|       3764|            7|                7|     5397.59|
|      11745|            3|                3|     1023.32|
|       9945|            6|                6|     2081.75|
|      11619|            7|                7|     2625.99|
|       9968|            6|                6|     3814.46|
|       5385|            2|                2|      415.18|
|       1950|            9|                9|     7121.30|
|       9978|            7|                7|     4304.14|
|        964|            7|                7|     5140.01|
|      10959|            7|                7|     2295.49|
|       9715|            6|                6|     3770.84|
|        474|            8|                8|     2903.38|
|       7225|            5|                5|     1792.0

In [0]:
streaming_query.explain()

== Physical Plan ==
FileScan csv [customer_id#1843L,orders_placed#1844L,product_purchased#1845L,amount_spent#1846] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/FileStore/processed/aggregated_orders.csv/part-00000-tid-4176033..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<customer_id:bigint,orders_placed:bigint,product_purchased:bigint,amount_spent:decimal(20,2)>


