## Boilerplate Task 1: Stream a Kafka Topic to the Console

In [None]:
# Import required libraries
from pyspark.sql import SparkSession

# Step 1: Initialize the SparkSession
# The SparkSession is the entry point to Spark functionality. The builder is
# configured one setting at a time before the session is created.
session_builder = SparkSession.builder

# Give the application a descriptive name.
session_builder = session_builder.appName("QuickCommerce Streaming Pipeline")

# Pull in the Kafka connector package for Structured Streaming.
# NOTE(review): the coordinate pins Scala 2.12 / Spark 3.5.1 -- confirm it
# matches the Spark runtime this pipeline is launched with.
session_builder = session_builder.config(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
)

# Reuse an existing session if one is active, otherwise create a new one.
spark = session_builder.getOrCreate()

# Step 2: Read from the Kafka topic
# Configure the streaming reader step by step, then load it as a streaming
# DataFrame.
kafka_reader = spark.readStream.format("kafka")

# Address of the Kafka server to connect to.
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "localhost:9092")

# Topic to read data from.
kafka_reader = kafka_reader.option("subscribe", "ecommerce_topic")

streaming_df = kafka_reader.load()

# Step 3: Define the checkpoint directory
# This path is handed to the streaming writer as its "checkpointLocation" so
# the query can record its progress for fault tolerance.
# NOTE(review): /tmp is usually not durable across reboots -- confirm this
# location is only intended for local testing.
checkpoint_dir: str = "/tmp/quickcommerce_streaming_checkpoint"

# Step 4: Write the streaming data to the console
# Display the incoming data in the console so it can be inspected while
# testing.
console_writer = streaming_df.writeStream.format("console")

# "append" mode: display only the new data that arrives.
console_writer = console_writer.outputMode("append")

# Enable checkpointing for fault tolerance.
console_writer = console_writer.option("checkpointLocation", checkpoint_dir)

# start() launches the streaming query and returns a handle to it.
query = console_writer.start()

# Step 5: Print the schema of the streaming DataFrame
# Display the structure of the incoming data for reference.
streaming_df.printSchema()

# Keep the application running until manually terminated.
# awaitTermination() blocks the caller while the query runs. If the wait is
# interrupted (Ctrl-C, or interrupting the notebook cell) or the query fails,
# stop the query explicitly: otherwise it can keep running in the background
# and keep its checkpoint location in use, which prevents this pipeline from
# being started again cleanly.
try:
    query.awaitTermination()
finally:
    query.stop()
