# Structured Streaming and Transformations on Streams

In [None]:
# Stage the sample transactions file from the workspace into the DBFS
# directory that the streaming readers in this notebook load from.
workspace_csv = "file:/Workspace/Shared/transactions.csv"
dbfs_csv = "dbfs:/FileStore/streaming/input/transactions.csv"
dbutils.fs.cp(workspace_csv, dbfs_csv)

True

Task 1: Ingest Streaming Data from CSV Files

In [None]:
# Derive the stream's schema from a one-off batch read of the staged file.
# NOTE(review): no inferSchema option is set, so presumably every column
# comes back as a string - confirm that is intended.
input_dir = "dbfs:/FileStore/streaming/input/"

static_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("dbfs:/FileStore/streaming/input/transactions.csv")
)
schema = static_df.schema

# Read the input directory as a CSV stream using that schema.
streaming_df = (
    spark.readStream
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load(input_dir)
)

# Start a query that sends the raw stream to the console sink.
console_writer = streaming_df.writeStream.format("console")
query = console_writer.start()

Task 2: Stream Transformations

In [None]:

# Add a per-row TotalAmount (Quantity * Price) and keep only the rows whose
# Quantity is greater than 1.
total_amount = streaming_df["Quantity"] * streaming_df["Price"]
multi_item_rows = streaming_df["Quantity"] > 1
transformed_df = streaming_df.withColumn("TotalAmount", total_amount).filter(multi_item_rows)

# Send the result to the memory sink under the query name "transformed_stream".
query = (
    transformed_df.writeStream
    .format("memory")
    .queryName("transformed_stream")
    .start()
)


Task 3: Aggregations on Streaming Data

In [None]:
# Import Spark's sum under an alias so Python's built-in sum() is not
# shadowed for the rest of the notebook session.
from pyspark.sql.functions import col, sum as spark_sum

# Group the data by ProductID and calculate the total sales for each product.
line_total = col("Quantity") * col("Price")
aggregated_df = streaming_df.groupBy("ProductID").agg(spark_sum(line_total).alias("TotalSales"))

# Streaming aggregations cannot use the default append mode without a
# watermark, so the running totals are written in "update" mode.
query = aggregated_df.writeStream.format("console").outputMode("update").start()

Task 4: Writing Streaming Data to File Sinks

In [None]:
# Write the transformed stream out as Parquet files.
# Use dbfs:/ URIs like the rest of the notebook: Spark resolves a bare
# "/dbfs/..." path against the DBFS root, which would put the output under
# dbfs:/dbfs/FileStore/... instead of dbfs:/FileStore/...
query = (
    transformed_df.writeStream
    .format("parquet")
    .option("path", "dbfs:/FileStore/parquet")
    # File sinks need a checkpoint location to track committed output.
    .option("checkpointLocation", "dbfs:/FileStore/checkpoint")
    .start()
)


Task 5: Handling Late Data using Watermarks

In [None]:
from pyspark.sql.functions import col, to_timestamp

# Replace TransactionDate with its timestamp-parsed value so it can serve as
# the event-time column for the watermark.
event_time = to_timestamp(col("TransactionDate"))
streaming_df = streaming_df.withColumn("TransactionDate", event_time)

# Declare a 1 day watermark on TransactionDate.
# NOTE(review): no aggregation, join, or deduplication follows the watermark
# in this cell - confirm this is enough to exercise late-data handling.
watermarked_df = streaming_df.withWatermark("TransactionDate", "1 day")

# Send the watermarked stream to the console sink.
watermarked_query = (
    watermarked_df.writeStream
    .format("console")
    .start()
)

Task 6: Streaming from Multiple Sources

In [None]:
# Both streams load from the same directory, so each reader is restricted to
# its own file type with pathGlobFilter; otherwise the CSV reader would also
# parse the JSON files and the JSON reader would also parse the CSV files.
# NOTE(review): assumes the product files end in ".json" - confirm.

# Stream 1: Incoming transaction data (CSV)
transactions_stream = spark.readStream.format("csv") \
    .option("header", "true") \
    .option("basePath", "dbfs:/FileStore/streaming/input/") \
    .option("pathGlobFilter", "*.csv") \
    .schema("TransactionID STRING, TransactionDate DATE, ProductID STRING, Quantity INT, Price DOUBLE") \
    .load("dbfs:/FileStore/streaming/input/")

# Stream 2: Product information (JSON)
products_stream = spark.readStream.format("json") \
    .option("basePath", "dbfs:/FileStore/streaming/input/") \
    .option("pathGlobFilter", "*.json") \
    .schema("ProductID STRING, ProductName STRING, Category STRING") \
    .load("dbfs:/FileStore/streaming/input/")

# Join both streams on ProductID (join() defaults to an inner join).
joined_stream = transactions_stream.join(products_stream, "ProductID")

# Write the joined stream to the console to visualize results
query = joined_stream.writeStream \
    .format("console") \
    .outputMode("append") \
    .start()

Task 7: Stopping and Restarting Streaming Queries

In [None]:
# Stop the streaming query currently bound to `query`.
query.stop()

# Start a new console query over streaming_df in append mode, then block
# until that query terminates.
restarted_writer = streaming_df.writeStream.format("console").outputMode("append")
query = restarted_writer.start()
query.awaitTermination()
