### Create a simple streaming query with trigger intervals.

In [0]:

# Load the BigMart sales CSV as a batch DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
csv_reader = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
)
df = csv_reader.load('dbfs:/FileStore/tables/BigMart_Sales.csv')

In [0]:
# Persist the CSV rows in Delta format.
# NOTE: mode('append') adds the rows again each time this cell runs, so
# re-running it grows the table with duplicate rows.
# NOTE(review): presumably this is the same location as the
# dbfs:/FileStore/tables/bigmart_delta_s path read by the streams - confirm.
delta_writer = df.write.format('delta').mode('append')
delta_writer.save('/FileStore/tables/bigmart_delta_s')

In [0]:
# Open the Delta table as a streaming source
df_stream = (
    spark.readStream
    .format("delta")
    .load("dbfs:/FileStore/tables/bigmart_delta_s")
)

In [0]:
# Optional transformation: number of rows for each Item_Type
rows_by_item_type = df_stream.groupBy("Item_Type")
item_counts = rows_by_item_type.count()


In [0]:
# Print the complete, untruncated result table to the console, using a
# processing-time trigger interval of 20 seconds.
counts_writer = (
    item_counts.writeStream
    .format("console")
    .outputMode("complete")
    .option("truncate", False)
    .trigger(processingTime="20 seconds")
)
query = counts_writer.start()

In [0]:
# Let the demo run for a bounded time, then shut the query down.
# An unbounded awaitTermination() blocks this cell indefinitely, so the cells
# that follow would never execute on "Run All", and this query would still be
# running when `query` is rebound to the next pipeline.
DEMO_RUNTIME_SECONDS = 60  # three 20-second trigger intervals

try:
    # With a timeout (in seconds) the call returns once the query terminates
    # or the timeout elapses, whichever comes first.
    query.awaitTermination(DEMO_RUNTIME_SECONDS)
finally:
    # Always release the stream, including when the cell is interrupted.
    query.stop()


### Build a sample structured streaming pipeline.




In [0]:
# Location of the Delta table that the streaming pipeline reads from
delta_source_path = "dbfs:/FileStore/tables/bigmart_delta_s"

In [0]:
# Open the Delta table at delta_source_path as a streaming source
delta_stream_reader = spark.readStream.format("delta")
df_stream = delta_stream_reader.load(delta_source_path)

In [0]:
# Print the column names and types of the streaming DataFrame
df_stream.printSchema()


root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [0]:
from pyspark.sql.functions import col, sum


In [0]:
# Make sure Item_Outlet_Sales is typed as double before it is aggregated
sales_as_double = col("Item_Outlet_Sales").cast("double")
df_stream = df_stream.withColumn("Item_Outlet_Sales", sales_as_double)


In [0]:
# Total sales per Item_Type. `sum` here is the function imported from
# pyspark.sql.functions, not the Python builtin.
total_sales = sum("Item_Outlet_Sales").alias("Total_Sales")
aggregated_stream = df_stream.groupBy("Item_Type").agg(total_sales)


In [0]:
# Stream the complete aggregated table to the console without truncating values
sales_writer = (
    aggregated_stream.writeStream
    .format("console")
    .outputMode("complete")
    .option("truncate", False)
)
query = sales_writer.start()



In [0]:
# Block this cell until the streaming query terminates
query.awaitTermination()
