In [0]:
%run "./reader_factory/"

In [0]:
# dbutils.fs.rm("abfss://bronze@ecommerceproject1.dfs.core.windows.net/orders",recurse=True)

In [0]:
# dbutils.fs.rm("abfss://stream-data@ecommerceproject1.dfs.core.windows.net/",recurse=True)

In [0]:
# Recursively copy the order_year=2016 folder from DBFS into the stream-data container.
dbutils.fs.cp(
    "dbfs:/FileStore/tables/stream_write/order_year=2016",
    "abfss://stream-data@ecommerceproject1.dfs.core.windows.net/order_year=2016",
    recurse=True,
)

In [0]:
# Recursively copy the order_year=2017 folder from DBFS into the stream-data container.
dbutils.fs.cp(
    "dbfs:/FileStore/tables/stream_write/order_year=2017",
    "abfss://stream-data@ecommerceproject1.dfs.core.windows.net/order_year=2017",
    recurse=True,
)

In [0]:
# Recursively copy the order_year=2018 folder from DBFS into the stream-data container.
dbutils.fs.cp(
    "dbfs:/FileStore/tables/stream_write/order_year=2018",
    "abfss://stream-data@ecommerceproject1.dfs.core.windows.net/order_year=2018",
    recurse=True,
)

# **Reading Batch Data**

In [0]:
class Extractor:
    """Common base for the dataset extractors in this notebook.

    The base class holds no state; subclasses override ``extract``.
    """

    def extract(self):
        """Placeholder extraction step; the base implementation returns ``None``.

        The subclasses in this notebook override it and return a dict.
        """
        return None

class Order_items(Extractor):
    """Incremental extractor for the order-items files in the batch-data container.

    A Delta table at the watermark path stores the modification time of the
    file handed out by the previous run; only files modified after that value
    are considered new.
    """

    def extract(self):
        """Load the most recently modified new order-items file.

        Returns:
            dict: ``{"order_itemsDf": <result of getDataFrame()>}`` for the new
            file with the greatest modification time, or ``{}`` when no file
            is newer than the stored watermark.

        Side effects:
            Prints progress messages and overwrites the watermark Delta table
            with the modification time of the selected file.
        """
        delta_table_path = "abfss://watermark@ecommerceproject1.dfs.core.windows.net/metadata/processed_timestamp"
        source_dir = "abfss://batch-data@ecommerceproject1.dfs.core.windows.net/olist_order_items_dataset/"

        try:
            df = spark.read.format("delta").load(delta_table_path)
            last_processed_timestamp = df.select("last_processed_timestamp").collect()[0][0]
            print("Last Processed Timestamp:", last_processed_timestamp)
        except Exception as exc:
            # Best effort: a missing or empty watermark table means "nothing
            # processed yet". Catch Exception rather than using a bare except so
            # KeyboardInterrupt/SystemExit still propagate, and report the cause
            # so storage or permission problems are not silently hidden.
            print("Could not read last processed timestamp, defaulting to 0:", type(exc).__name__)
            last_processed_timestamp = 0
        if last_processed_timestamp is None:
            # A null stored value would make the comparison below raise TypeError.
            last_processed_timestamp = 0

        files = dbutils.fs.ls(source_dir)
        new_files = [f for f in files if f.modificationTime > last_processed_timestamp]
        if not new_files:
            print("No new files found.")
            return {}
        print("New files found:", [f.path for f in new_files])

        # max() returns the first entry with the greatest modification time,
        # which is the same file the previous loop-with-break selected.
        # NOTE(review): only this one file is loaded; any other new files are
        # skipped and fall behind the watermark written below - confirm intended.
        latest_file = max(new_files, key=lambda f: f.modificationTime)
        latest_mod_time = latest_file.modificationTime
        order_itemsDf = get_DataSource("csv", latest_file.path).getDataFrame()
        inputDf = {"order_itemsDf": order_itemsDf}

        # NOTE(review): the watermark is advanced before the caller has consumed
        # the returned data; a downstream failure would not be retried.
        spark.createDataFrame([(latest_mod_time,)], ["last_processed_timestamp"]) \
            .write.format("delta").mode("overwrite").save(delta_table_path)
        print("Updated Last Processed Timestamp:", latest_mod_time)
        return inputDf
    
class Customers(Extractor):
    """Extractor for ``olist_customers_dataset.csv`` in the batch-data container."""

    def extract(self):
        """Read the customers CSV and return it under the key ``customerDf``."""
        source = get_DataSource(
            "csv",
            "abfss://batch-data@ecommerceproject1.dfs.core.windows.net/olist_customers_dataset.csv",
        )
        return {"customerDf": source.getDataFrame()}
    
class Geolocation(Extractor):
    """Extractor for ``olist_geolocation_dataset.csv`` in the batch-data container."""

    def extract(self):
        """Read the geolocation CSV and return it under the key ``geolocationDf``."""
        source = get_DataSource(
            "csv",
            "abfss://batch-data@ecommerceproject1.dfs.core.windows.net/olist_geolocation_dataset.csv",
        )
        return {"geolocationDf": source.getDataFrame()}
    
class Products(Extractor):
    """Extractor for ``olist_products_dataset.csv`` in the batch-data container."""

    def extract(self):
        """Read the products CSV and return it under the key ``products_Df``."""
        source = get_DataSource(
            "csv",
            "abfss://batch-data@ecommerceproject1.dfs.core.windows.net/olist_products_dataset.csv",
        )
        return {"products_Df": source.getDataFrame()}

class Products_Category(Extractor):
    """Extractor for ``product_category_name_translation.csv`` in the batch-data container."""

    def extract(self):
        """Read the category translation CSV and return it in a one-entry dict.

        NOTE(review): the dict key is spelled ``categroy_translation_Df``; it is
        kept unchanged because consumers outside this section may look it up.
        """
        source = get_DataSource(
            "csv",
            "abfss://batch-data@ecommerceproject1.dfs.core.windows.net/product_category_name_translation.csv",
        )
        return {"categroy_translation_Df": source.getDataFrame()}
    
class Sellers(Extractor):
    """Extractor for ``olist_sellers_dataset.csv`` in the batch-data container."""

    def extract(self):
        """Read the sellers CSV and return it under the key ``sellerDf``."""
        source = get_DataSource(
            "csv",
            "abfss://batch-data@ecommerceproject1.dfs.core.windows.net/olist_sellers_dataset.csv",
        )
        return {"sellerDf": source.getDataFrame()}
    
class Payments(Extractor):
    """Extractor for ``olist_order_payments_dataset.csv`` in the batch-data container."""

    def extract(self):
        """Read the order payments CSV and return it under the key ``paymentDf``."""
        source = get_DataSource(
            "csv",
            "abfss://batch-data@ecommerceproject1.dfs.core.windows.net/olist_order_payments_dataset.csv",
        )
        return {"paymentDf": source.getDataFrame()}

In [0]:
from pyspark.sql.functions import year, month, col
from delta.tables import DeltaTable

class StreamingProcessor:
    """Stream CSV files from a source path into a partitioned bronze Delta table.

    The pipeline reads with Auto Loader (``cloudFiles``), derives
    ``order_year``/``order_month`` from ``order_purchase_timestamp`` and merges
    each micro-batch into the Delta table on ``order_id``.
    """

    def __init__(self, source_path, bronze_table_path):
        """Store the source location and derive the bronze sub-paths.

        Args:
            source_path: Location the streaming reader loads files from.
            bronze_table_path: Root path under which the Delta table, the
                Auto Loader schema location and the stream checkpoint live.
        """
        self.source_path = source_path
        self.bronze_table_path = bronze_table_path
        # Everything the job persists hangs off the bronze root.
        self.delta_table_path = bronze_table_path + "/delta_table/"
        self.schema_checkpoint = bronze_table_path + "/schema/"
        self.bronze_checkpoint = bronze_table_path + "/checkpoints/bronze/"

    def read_streaming_data(self):
        """Step 1: build the Auto Loader streaming reader over ``source_path``.

        Returns:
            The streaming DataFrame produced by ``load(self.source_path)``.
        """
        auto_loader_options = (
            ("cloudFiles.format", "csv"),
            ("cloudFiles.schemaLocation", self.schema_checkpoint),
            ("cloudFiles.inferColumnTypes", "true"),
            ("cloudFiles.schemaEvolutionMode", "rescue"),
            ("header", "true"),
        )
        reader = spark.readStream.format("cloudFiles")
        for option_name, option_value in auto_loader_options:
            reader = reader.option(option_name, option_value)
        return reader.load(self.source_path)

    def partition_streaming_data(self, df):
        """Step 2: add the ``order_year`` and ``order_month`` partition columns.

        Args:
            df: DataFrame that has an ``order_purchase_timestamp`` column.

        Returns:
            ``df`` with both columns derived from ``order_purchase_timestamp``.
        """
        purchase_ts = col("order_purchase_timestamp")
        with_year = df.withColumn("order_year", year(purchase_ts))
        return with_year.withColumn("order_month", month(purchase_ts))

    def upsert_to_bronze(self, micro_batch_df, batch_id):
        """Step 3: write one micro-batch into the bronze Delta table.

        Creates the table (partitioned by year and month) when no Delta table
        exists at ``delta_table_path``; otherwise merges on ``order_id``,
        updating matched rows and inserting the rest.

        Args:
            micro_batch_df: DataFrame for the current micro-batch.
            batch_id: Micro-batch identifier; only used in the log message.
        """
        if DeltaTable.isDeltaTable(spark, self.delta_table_path):
            print(f" Batch ID {batch_id}: Performing upsert...")
            target = DeltaTable.forPath(spark, self.delta_table_path)
            merge_builder = target.alias("bronze").merge(
                micro_batch_df.alias("source"),
                "bronze.order_id = source.order_id",
            )
            merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
            return

        print(" Bronze Delta table not found. Creating a new one...")
        writer = micro_batch_df.write.format("delta").mode("overwrite")
        writer.partitionBy("order_year", "order_month").save(self.delta_table_path)

    def start_streaming_job(self):
        """Wire reader, partitioning and upsert together and start the query.

        Returns:
            The query object returned by ``start()``; the stream uses
            ``foreachBatch`` with ``upsert_to_bronze`` and the
            ``availableNow`` trigger.
        """
        source_stream = self.read_streaming_data()
        enriched_stream = self.partition_streaming_data(source_stream)
        stream_writer = enriched_stream.writeStream.foreachBatch(self.upsert_to_bronze)
        stream_writer = stream_writer.outputMode("append")
        stream_writer = stream_writer.option("checkpointLocation", self.bronze_checkpoint)
        return stream_writer.trigger(availableNow=True).start()

# Paths for the orders stream: the stream-data container root and the bronze "orders" root.
source_path = "abfss://stream-data@ecommerceproject1.dfs.core.windows.net/"
bronze_table_path = "abfss://bronze@ecommerceproject1.dfs.core.windows.net/orders"
 
# The run below is disabled. Uncomment to initialise the processor and start the job.
# stream_processor = StreamingProcessor(source_path, bronze_table_path)

# Start the streaming job and wait on it (awaitTermination is called with a timeout of 20).
# streaming_query = stream_processor.start_streaming_job()
# streaming_query.awaitTermination(20)


# **Read Streaming Incremental Data using AutoLoader**
