In [None]:
# Source folder (abfss URI) from which the delivery files are read.
# NOTE(review): the `row` path segment may be intended as `raw` — confirm against
# the storage container layout before changing it.
delivery_path = (
    "abfss://ipl-incremental-data@incrementalipldata.dfs.core.windows.net/row/delivery/"
)

In [None]:
# Define the schema for delivery data (optional but recommended)
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import current_timestamp


# Explicit schema for the delivery data. Fields are appended in the order listed
# below; each one is an integer or string column declared nullable (third argument).
delivery_schema = (
    StructType()
    # Match / innings context
    .add("match_id", IntegerType(), True)
    .add("inning", IntegerType(), True)
    .add("batting_team", StringType(), True)
    .add("bowling_team", StringType(), True)
    .add("over", IntegerType(), True)
    .add("ball", IntegerType(), True)
    # Players involved in the delivery
    .add("batsman", StringType(), True)
    .add("non_striker", StringType(), True)
    .add("bowler", StringType(), True)
    .add("is_super_over", IntegerType(), True)
    # Run counters
    .add("wide_runs", IntegerType(), True)
    .add("bye_runs", IntegerType(), True)
    .add("legbye_runs", IntegerType(), True)
    .add("noball_runs", IntegerType(), True)
    .add("penalty_runs", IntegerType(), True)
    .add("batsman_runs", IntegerType(), True)
    .add("extra_runs", IntegerType(), True)
    .add("total_runs", IntegerType(), True)
    # Dismissal details
    .add("player_dismissed", StringType(), True)
    .add("dismissal_kind", StringType(), True)
    .add("fielder", StringType(), True)
)



In [None]:


# Streaming read of the delivery folder through Auto Loader (the `cloudFiles` source).
# The reader options are passed in one call; the explicit `delivery_schema` is applied
# to the CSV files, whose first line is treated as a header.
df_delivery = (
    spark.readStream.format("cloudFiles")
    .options(
        **{
            "cloudFiles.format": "csv",
            # Location where Auto Loader keeps its schema-tracking files
            "cloudFiles.schemaLocation": "/mnt/delta/delivery/schema",
            "header": "true",
        }
    )
    .schema(delivery_schema)
    .load(delivery_path)
)



In [None]:
# Tag each row with an `ingestion_ts` column populated by current_timestamp().
# withColumn replaces a same-named column, so re-running this cell does not
# add a duplicate column.
df_delivery = df_delivery.withColumn(
    "ingestion_ts",
    current_timestamp(),
)


In [None]:

# Incrementally append the stream to the `bronze.delivery_loading` Delta table.
# The query is started with `toTable`, the documented DataStreamWriter method for
# writing a stream to a table. The checkpoint location lets a restarted query
# keep track of what has already been processed.
delivery_query = (
    df_delivery.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/mnt/delta/delivery/checkpoints")
    .toTable("bronze.delivery_loading")
)

# Keep the StreamingQuery handle in a variable so later cells can monitor or stop
# the stream (e.g. delivery_query.status, delivery_query.stop()). Leaving it as the
# last expression still shows it as the cell output.
delivery_query

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7f6595e48aa0>