In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lead,col,broadcast,collect_set,size,array_contains,collect_list,min,datediff, avg,rank,when,count,coalesce,lit,desc,sum,countDistinct,udf,date_format,approx_count_distinct
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType
import re
import os
from delta.tables import DeltaTable
from pyspark.sql.streaming import StreamingQuery

In [0]:
%run "./Bronze/"

In [0]:
class IncrementalPreProcessor:
    """Stream records from a Bronze Delta table into a cleaned Silver Delta table.

    The streaming checkpoint is kept in a ``checkpoint/`` folder underneath the
    Silver table path.
    """

    def __init__(self, bronze_table_path, silver_table_path):
        """Remember the source/target paths and derive the checkpoint location.

        Args:
            bronze_table_path: Path of the Delta table to read as a stream.
            silver_table_path: Path of the Delta table to append to.
        """
        self.bronze_table_path = bronze_table_path
        self.silver_table_path = silver_table_path
        self.checkpoint_path = silver_table_path + "/checkpoint/"

    def preprocess_stream(self):
        """Start the Bronze -> Silver streaming job and return its query handle.

        Returns:
            The object returned by ``writeStream.start``; the caller decides how
            long to wait on it.
        """
        # Columns that are removed from every record before it reaches Silver.
        unused_columns = [
            "order_approved_at",
            "order_delivered_carrier_date",
            "order_delivered_customer_date",
            "order_estimated_delivery_date",
            "_rescued_data",
        ]

        # Source: the Bronze Delta table, read incrementally.
        bronze_records = spark.readStream.format("delta").load(self.bronze_table_path)

        # Cleaning: drop the unused columns, then default a missing order status.
        cleaned_records = bronze_records.drop(*unused_columns)
        cleaned_records = cleaned_records.fillna({"order_status": "unavailable"})

        # Sink: append to the Silver Delta table, tracking progress in the checkpoint.
        writer = cleaned_records.writeStream.format("delta").outputMode("append")
        writer = writer.option("checkpointLocation", self.checkpoint_path)
        return writer.start(self.silver_table_path)

# Paths for Delta tables
# bronze_table_path = "abfss://bronze@ecommerceproject2.dfs.core.windows.net/orders/delta_table"
# silver_table_path = "abfss://silver@ecommerceproject2.dfs.core.windows.net/orders"

# # Start Streaming Preprocessing
# processor = IncrementalPreProcessor(bronze_table_path, silver_table_path)
# query = processor.preprocess_stream()
# query.awaitTermination(30)


In [0]:
class PreProcessPipeline:
    """Clean a batch DataFrame and persist the result as a Delta table.

    Two storage modes are supported:

    * incremental (``is_incremental=True``): every row is tagged with the name
      of the source file and merged into the table keyed on that file name, so
      rows are inserted only when no row with that file name is stored yet.
    * full load (``is_incremental=False``): the table is written once; if a
      Delta table already exists at the target path it is left untouched.

    The most recently processed DataFrame is kept in ``previous_df`` and is
    reused whenever a caller passes ``None`` instead of a DataFrame.
    """

    def __init__(self, storage_base_path: str, is_incremental: bool):
        """
        Args:
            storage_base_path: Root folder for the Delta tables; trailing
                slashes are stripped so table paths can be built with "/".
            is_incremental: Selects the storage mode described on the class.
        """
        self.storage_base_path = storage_base_path.rstrip('/')
        self.is_incremental = is_incremental
        # Last DataFrame returned by preprocess_data; None until the first run.
        self.previous_df = None

    def preprocess_data(
        self, df: DataFrame, drop_columns: list = None, fill_na_dict: dict = None,
        drop_duplicates_columns: list = None, replace_dict: dict = None) -> DataFrame:
        """Apply the configured cleaning steps, in a fixed order.

        Order: drop columns -> fill nulls -> drop duplicates -> replace values.
        Empty or ``None`` arguments skip the corresponding step.

        Args:
            df: DataFrame to clean. ``None`` returns the previously processed
                DataFrame (or ``None`` when there is none).
            drop_columns: Column names to remove.
            fill_na_dict: ``{column: value}`` passed to ``DataFrame.fillna``.
            drop_duplicates_columns: Columns identifying duplicate rows.
            replace_dict: ``{column: {old: new}}``; each inner mapping is passed
                to ``DataFrame.replace`` restricted to that column. Note that
                ``DataFrame.replace`` matches whole cell values, not substrings.

        Returns:
            The cleaned DataFrame, which is also stored in ``previous_df``.
        """
        if df is None:
            print("⚠ Warning: Received NoneType DataFrame. Returning previously pre-processed data.")
            return getattr(self, "previous_df", None)

        if drop_columns:
            print(f"Dropping columns: {drop_columns}")
            df = df.drop(*drop_columns)

        if fill_na_dict:
            print(f"Filling nulls with: {fill_na_dict}")
            df = df.fillna(fill_na_dict)

        if drop_duplicates_columns:
            print(f"Dropping duplicate records based on columns: {drop_duplicates_columns}")
            df = df.dropDuplicates(drop_duplicates_columns)

        if replace_dict:
            for column, replacements in replace_dict.items():
                print(f"Replacing values in column '{column}': {replacements}")
                df = df.replace(replacements, subset=[column])

        self.previous_df = df  # Store the latest processed DataFrame
        return df

    def store_as_delta(self, df: DataFrame, table_name: str, file_name: str = None):
        """Persist ``df`` under ``<storage_base_path>/<table_name>``.

        Args:
            df: DataFrame to store; ``None`` is skipped with a warning.
            table_name: Folder name of the Delta table below the base path.
            file_name: Source file label; only used in incremental mode, where
                it is written to a ``filename`` column and used as merge key.
        """
        if df is None:
            print("⚠ Warning: Received NoneType DataFrame. Skipping Delta storage.")
            return

        delta_table_path = f"{self.storage_base_path}/{table_name}"

        if self.is_incremental:
            df = df.withColumn("filename", lit(file_name))  # Add filename column

            if DeltaTable.isDeltaTable(spark, delta_table_path):
                # Insert-only merge keyed on the file name: source rows are
                # inserted only if the target holds no row with that file name,
                # which makes re-processing an already loaded file a no-op.
                delta_table = DeltaTable.forPath(spark, delta_table_path)
                delta_table.alias("target").merge(
                    df.alias("source"),
                    "target.filename = source.filename"
                ).whenNotMatchedInsertAll().execute()
                print(f"✔ File '{file_name}' appended to Delta table at {delta_table_path}")
            else:
                # First file: create the table, partitioned by source file.
                df.write.format("delta").mode("overwrite").partitionBy("filename").save(delta_table_path)
                print(f"Delta table created at {delta_table_path} with first file: {file_name}")
        else:
            if DeltaTable.isDeltaTable(spark, delta_table_path):
                print(f"⚠ Delta table already exists at {delta_table_path}. Skipping overwrite.")
            else:
                df.write.format("delta").mode("overwrite").save(delta_table_path)
                print(f"Data stored as a new Delta table at {delta_table_path}")

    @staticmethod
    def _source_file_name(df) -> str:
        """Return the first input file's base name up to its first dot, or "Unknown_File"."""
        try:
            # Ask for the input files once and reuse the answer.
            input_files = df.inputFiles()
        except AttributeError:
            input_files = None
        file_path = input_files[0] if input_files else "Unknown_File"
        return os.path.basename(file_path).split(".")[0]  # Extract filename without extension

    def run_pipeline(
        self, df: DataFrame, table_name: str, drop_columns: list = None,
        fill_na_dict: dict = None, drop_duplicates_columns: list = None, replace_dict: dict = None
    ):
        """Clean ``df`` and store it as the Delta table ``table_name``.

        When ``df`` is ``None`` the previously processed DataFrame is used; if
        there is none, the run is skipped. In incremental mode the stored file
        label is derived from the DataFrame's first input file; otherwise it is
        ``"Single_Batch"``.

        Args:
            df: DataFrame to process, or ``None``.
            table_name: Folder name of the target Delta table.
            drop_columns, fill_na_dict, drop_duplicates_columns, replace_dict:
                Forwarded unchanged to ``preprocess_data``.
        """
        print(f"Starting PreProcess pipeline for table: {table_name}")

        if df is None:
            print("⚠ Warning: Received NoneType DataFrame. Using previously pre-processed data.")
            df = getattr(self, "previous_df", None)
            if df is None:
                print("No previously pre-processed data available. Skipping pipeline execution.")
                return

        file_name = "Single_Batch"
        if self.is_incremental:
            file_name = self._source_file_name(df)
            if file_name == "Unknown_File":
                print("⚠ Warning: Could not determine input file name!")

        print(f"Processing file: {file_name}")

        processed_df = self.preprocess_data(df, drop_columns, fill_na_dict, drop_duplicates_columns, replace_dict)
        self.store_as_delta(processed_df, table_name, file_name)

        print(f"Pre-Processing completed for file: {file_name}")

# Root of the silver container used for every pre-processed table.
# PreProcessPipeline strips the trailing "/" before it builds table paths.
storage_base_path = "abfss://silver@ecommerceproject2.dfs.core.windows.net/"

In [0]:
def get_preprocessed_order_items(inputDf: dict) -> DataFrame:
    """Pre-process the order-items extract incrementally and return the stored table.

    Drops ``shipping_limit_date``, fills missing ``price`` / ``freight_value``
    with 0.0 and stores the result through an incremental PreProcessPipeline.

    Args:
        inputDf: Mapping of extract names to DataFrames; the ``"order_itemsDf"``
            entry is used (a missing entry is passed on as ``None``).

    Returns:
        A batch DataFrame loaded from the ``order_items`` Delta table.
    """
    config = {
        "table_name": "order_items",
        "drop_columns": ["shipping_limit_date"],
        "fill_na_dict": {"price": 0.0, "freight_value": 0.0},
        "drop_duplicates_columns": [],
        "replace_dict": {},
    }

    pipeline = PreProcessPipeline(storage_base_path, is_incremental=True)
    pipeline.run_pipeline(
        inputDf.get("order_itemsDf"),
        config["table_name"],
        config["drop_columns"],
        config["fill_na_dict"],
        drop_duplicates_columns=config["drop_duplicates_columns"],
        replace_dict=config["replace_dict"],
    )

    # Read from the pipeline's normalised base path so the location is exactly
    # the one written to (the raw global ends in "/" and would yield "//").
    delta_table_path = f"{pipeline.storage_base_path}/{config['table_name']}"
    print(f"Loading preprocessed table from Delta path: {delta_table_path}")
    return spark.read.format("delta").load(delta_table_path)

In [0]:
def get_preprocessed_sellers(inputDf: dict) -> DataFrame:
    """Store the sellers extract as a Delta table (no cleaning steps) and return it.

    Args:
        inputDf: Mapping of extract names to DataFrames; the ``"sellerDf"``
            entry is used (a missing entry is passed on as ``None``).

    Returns:
        A batch DataFrame loaded from the ``sellers`` Delta table.
    """
    config = {
        "table_name": "sellers",
        "drop_columns": [],
        "fill_na_dict": {},
        "drop_duplicates_columns": [],
        "replace_dict": {},
    }

    pipeline = PreProcessPipeline(storage_base_path, is_incremental=False)
    pipeline.run_pipeline(
        inputDf.get("sellerDf"),
        config["table_name"],
        config["drop_columns"],
        config["fill_na_dict"],
        drop_duplicates_columns=config["drop_duplicates_columns"],
        replace_dict=config["replace_dict"],
    )

    # Read from the pipeline's normalised base path so the location is exactly
    # the one written to (the raw global ends in "/" and would yield "//").
    delta_table_path = f"{pipeline.storage_base_path}/{config['table_name']}"
    print(f"Loading preprocessed table from Delta path: {delta_table_path}")

    return spark.read.format("delta").load(delta_table_path)

In [0]:
def get_preprocessed_payments(inputDf: dict) -> DataFrame:
    """Pre-process the payments extract and return the stored Delta table.

    Drops ``payment_sequential`` and ``payment_value`` and rewrites the
    ``payment_type`` value ``"boleto"`` to ``"cash"``.

    Args:
        inputDf: Mapping of extract names to DataFrames; the ``"paymentDf"``
            entry is used (a missing entry is passed on as ``None``).

    Returns:
        A batch DataFrame loaded from the ``payment`` Delta table.
    """
    config = {
        "table_name": "payment",
        "drop_columns": ["payment_sequential", "payment_value"],
        "fill_na_dict": {},
        "drop_duplicates_columns": [],
        "replace_dict": {"payment_type": {"boleto": "cash"}},
    }

    pipeline = PreProcessPipeline(storage_base_path, is_incremental=False)
    pipeline.run_pipeline(
        inputDf.get("paymentDf"),
        config["table_name"],
        config["drop_columns"],
        config["fill_na_dict"],
        drop_duplicates_columns=config["drop_duplicates_columns"],
        replace_dict=config["replace_dict"],
    )

    # Read from the pipeline's normalised base path so the location is exactly
    # the one written to (the raw global ends in "/" and would yield "//").
    delta_table_path = f"{pipeline.storage_base_path}/{config['table_name']}"
    print(f"Loading preprocessed table from Delta path: {delta_table_path}")

    return spark.read.format("delta").load(delta_table_path)

In [0]:
def get_preprocessed_products(inputDf: dict) -> DataFrame:
    """Pre-process the products extract and return the stored Delta table.

    Drops the name/description length, photo count and physical dimension
    columns, and fills a missing ``product_category_name`` with ``"unknown"``.

    Args:
        inputDf: Mapping of extract names to DataFrames; the ``"products_Df"``
            entry is used (a missing entry is passed on as ``None``).

    Returns:
        A batch DataFrame loaded from the ``products`` Delta table.
    """
    config = {
        "table_name": "products",
        "drop_columns": [
            "product_name_lenght", "product_description_lenght",
            "product_photos_qty", "product_weight_g", "product_length_cm",
            "product_height_cm", "product_width_cm"
        ],
        "fill_na_dict": {"product_category_name": "unknown"},
        "drop_duplicates_columns": [],
        "replace_dict": {}
    }

    pipeline = PreProcessPipeline(storage_base_path, is_incremental=False)
    pipeline.run_pipeline(
        inputDf.get("products_Df"),
        config["table_name"],
        config["drop_columns"],
        config["fill_na_dict"],
        drop_duplicates_columns=config["drop_duplicates_columns"],
        replace_dict=config["replace_dict"],
    )

    # Read from the pipeline's normalised base path so the location is exactly
    # the one written to (the raw global ends in "/" and would yield "//").
    delta_table_path = f"{pipeline.storage_base_path}/{config['table_name']}"
    print(f"Loading preprocessed table from Delta path: {delta_table_path}")

    return spark.read.format("delta").load(delta_table_path)


In [0]:
def get_preprocessed_customers(inputDf: dict) -> DataFrame:
    """Pre-process the customers extract and return the stored Delta table.

    Drops ``customer_unique_id``; no other cleaning step is configured.

    Args:
        inputDf: Mapping of extract names to DataFrames; the ``"customerDf"``
            entry is used (a missing entry is passed on as ``None``).

    Returns:
        A batch DataFrame loaded from the ``customers`` Delta table.
    """
    config = {
        "table_name": "customers",
        "drop_columns": ["customer_unique_id"],
        "fill_na_dict": {},
        "drop_duplicates_columns": [],
        "replace_dict": {}
    }

    pipeline = PreProcessPipeline(storage_base_path, is_incremental=False)
    pipeline.run_pipeline(
        inputDf.get("customerDf"),
        config["table_name"],
        config["drop_columns"],
        config["fill_na_dict"],
        drop_duplicates_columns=config["drop_duplicates_columns"],
        replace_dict=config["replace_dict"],
    )

    # Read from the pipeline's normalised base path so the location is exactly
    # the one written to (the raw global ends in "/" and would yield "//").
    delta_table_path = f"{pipeline.storage_base_path}/{config['table_name']}"
    print(f"Loading preprocessed table from Delta path: {delta_table_path}")

    return spark.read.format("delta").load(delta_table_path)


In [0]:
def get_preprocessed_product_category(inputDf: dict) -> DataFrame:
    """Store the category-translation extract as a Delta table and return it.

    No cleaning step is configured for this table.

    Args:
        inputDf: Mapping of extract names to DataFrames; the
            ``"categroy_translation_Df"`` entry is used (the key is spelled
            this way in the code; a missing entry is passed on as ``None``).

    Returns:
        A batch DataFrame loaded from the ``product_category`` Delta table.
    """
    config = {
        "table_name": "product_category",
        "drop_columns": [],
        "fill_na_dict": {},
        "drop_duplicates_columns": [],
        "replace_dict": {}
    }

    pipeline = PreProcessPipeline(storage_base_path, is_incremental=False)
    pipeline.run_pipeline(
        inputDf.get("categroy_translation_Df"),
        config["table_name"],
        config["drop_columns"],
        config["fill_na_dict"],
        drop_duplicates_columns=config["drop_duplicates_columns"],
        replace_dict=config["replace_dict"],
    )

    # Read from the pipeline's normalised base path so the location is exactly
    # the one written to (the raw global ends in "/" and would yield "//").
    delta_table_path = f"{pipeline.storage_base_path}/{config['table_name']}"
    print(f"Loading preprocessed table from Delta path: {delta_table_path}")

    return spark.read.format("delta").load(delta_table_path)


In [0]:
def get_preprocessed_geolocation(inputDf: dict) -> DataFrame:
    """Pre-process the geolocation extract and return the stored Delta table.

    Strips accents from ``geolocation_city``, drops the latitude/longitude
    columns and keeps one row per ``geolocation_zip_code_prefix``.

    Args:
        inputDf: Mapping of extract names to DataFrames; the ``"geolocationDf"``
            entry is used (a missing entry is passed on as ``None``).

    Returns:
        A batch DataFrame loaded from the ``geolocation_cleaned`` Delta table.
    """
    # Imported locally: `translate` is not part of the notebook's import cell.
    from pyspark.sql.functions import translate

    # Accented character -> plain ASCII replacement (one character each).
    accent_map = {
        'á': 'a', 'à': 'a', 'ã': 'a', 'â': 'a', 'ä': 'a',
        'é': 'e', 'è': 'e', 'ê': 'e',
        'í': 'i', 'ì': 'i', 'î': 'i',
        'ó': 'o', 'ò': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o',
        'ú': 'u', 'ù': 'u', 'ü': 'u',
        'ç': 'c', 'ñ': 'n'}

    config = {
        "table_name": "geolocation_cleaned",
        "drop_columns": ["geolocation_lat", "geolocation_lng"],
        "fill_na_dict": {},
        "drop_duplicates_columns": ["geolocation_zip_code_prefix"],
        # Left empty on purpose: DataFrame.replace only swaps cells whose whole
        # value equals a key, so it cannot remove accents inside city names.
        "replace_dict": {}}

    geolocation_df = inputDf.get("geolocationDf")
    if geolocation_df is not None:
        # translate() maps characters position by position, so every accented
        # character inside a city name is rewritten.
        geolocation_df = geolocation_df.withColumn(
            "geolocation_city",
            translate(
                col("geolocation_city"),
                "".join(accent_map.keys()),
                "".join(accent_map.values()),
            ),
        )

    pipeline = PreProcessPipeline(storage_base_path, is_incremental=False)
    pipeline.run_pipeline(
        geolocation_df,
        config["table_name"],
        config["drop_columns"],
        config["fill_na_dict"],
        drop_duplicates_columns=config["drop_duplicates_columns"],
        replace_dict=config["replace_dict"])

    # Read from the pipeline's normalised base path so the location is exactly
    # the one written to (the raw global ends in "/" and would yield "//").
    delta_table_path = f"{pipeline.storage_base_path}/{config['table_name']}"
    print(f"Loading preprocessed table from Delta path: {delta_table_path}")
    return spark.read.format("delta").load(delta_table_path)


In [0]:
# class Transformer:
#     def __init__(self):
#         pass
#     def transform(self, inputDf):
#         raise NotImplementedError("transform() not implemented")

# class TotalRevenueTransformer(Transformer):
#     """Total revenue from orders"""
#     def transform(self, preprocessed_df: DataFrame):
#         transformed_df = preprocessed_df.withColumn("Item_Total", col("price") + col("freight_value"))
#         revenue_df = transformed_df.groupby("order_id").agg(sum("Item_Total").alias("Total_price"))
#         print("Total revenue per order:")
#         rdf=revenue_df.select(sum("Total_price").alias("total_revenue"))
#         # rdf.display()
#         return rdf
    
# class RevenueByProductCategoryTransformer(Transformer):
#     """Total revenue from each product category"""
#     def transform(self, joined_df: DataFrame):
#         transformed_df = joined_df.withColumn("Item_Total", col("price") + col("freight_value"))
#         revenue_df = transformed_df.groupby("product_category_name_english").agg(sum("Item_Total").alias("Category_Revenue"))
#         print("Total revenue per category:")
#         rdf=revenue_df.orderBy("Category_Revenue", ascending=False)
#         rdf.display()
#         return rdf
    
# class TopSellingProduct(Transformer):
#     """Top Selling Products"""
#     def transform(self, preprocessed_df: DataFrame):
#         # Use the same preprocessed table.
#         transformed_df = preprocessed_df.withColumn("Item_Total", col("price") + col("freight_value"))
#         result_df = transformed_df.groupby("product_id").agg(count("order_item_id").alias("order_count"))
#         print("Top Selling Products (by order count):")
#         rdf=result_df.orderBy(desc("order_count"))
#         rdf.display()
#         return rdf

# class RevenueTrendOverTimeTransformer(Transformer):
#     """Daily revenue from orders"""
#     def transform(self, joined_df: DataFrame):
#         transformed_df = joined_df.withColumn("Item_Total", col("price") + col("freight_value"))
#         revenue_df = transformed_df.groupby(date_format("order_purchase_timestamp", "yyyy-MM-dd").alias("date")).agg(sum("Item_Total").alias("Daily_Revenue"))
#         print("Daily revenue :")
#         rdf=revenue_df.orderBy("Daily_Revenue", ascending=False)
#         rdf.display()
#         return rdf
    
# class GeographicRevenueTransformer(Transformer):
#     """State wise revenue from orders"""
#     def transform(self, joined_df: DataFrame):
#         transformed_df = joined_df.withColumn("Item_Total", col("price") + col("freight_value"))
#         revenue_df = transformed_df.groupby("customer_state").agg(sum("Item_Total").alias("State_Revenue"))
#         print("State wise revenue :")
#         rdf=revenue_df.orderBy("State_Revenue", ascending=False)
#         rdf.display()
#         return rdf
    
# class CustomerValueTransformer(Transformer):
#     """Total spend per customer, bucketed into High / Mid / Low categories"""
#     def transform(self, joined_df: DataFrame):
#         transformed_df = joined_df.withColumn("Item_Total", col("price") + col("freight_value"))
#         revenue_df = transformed_df.groupby("customer_id").agg(sum("Item_Total").alias("Customer_Spent"))
#         revenue_df = revenue_df.withColumn(
#             "Customer_Category",
#             when((col("Customer_Spent") >= 5001), "High")
#             .when((col("Customer_Spent") >= 2001) & (col("Customer_Spent") <= 5000), "Mid")
#             .when((col("Customer_Spent") >= 0) & (col("Customer_Spent") <= 2000), "Low")
#             .otherwise("Unknown")
#         )
#         print("Customer Revenue and Category:")
#         rdf=revenue_df.orderBy(col("Customer_Spent").desc())
#         rdf.display()
#         return rdf
    
# class AverageOrderValueTransformer(Transformer):
#     """Calculates Average Order Value (AOV)"""
#     def transform(self, order_items):
#         order_items = order_items.withColumn("Item_Total", col("price") + col("freight_value"))
#         order_totals_df = order_items.groupBy("order_id").agg(sum("Item_Total").alias("order_total"))
#         aov_df = order_totals_df.selectExpr("SUM(order_total) / COUNT(order_id) AS AOV")
#         aov_df.display()
#         return aov_df

# class OrderReturnRateTransformer(Transformer):
#     """Calculates Order Return Rate"""
#     def transform(self, orders_df):
#         return_rate_df = orders_df.agg(
#             (sum(when(col("order_status") == "canceled", 1).otherwise(0)) / count("order_id"))
#             .alias("Order_Return_Rate")
#         )
#         return_rate_df.display()
#         return return_rate_df
    
# class CustomerRetentionTransformer(Transformer):
#     def transform(self, joined_df):
#         cust_retention = (joined_df.groupby("customer_id")
#                           .agg(count("order_id").alias("total_orders"))
#                           .filter("total_orders > 1")
#                           .orderBy("total_orders", ascending=False))
#         cust_retention.display()            
#         return cust_retention

In [0]:
def create_stream_static_join(
    stream_table_name: DataFrame,
    static_table_name: DataFrame,
    stream_join_column: str,
    static_join_column: str,
    output_table_name: str,
    output_checkpoint_path: str,
) -> 'StreamingQuery':
    """Left-join a streaming DataFrame to a lookup DataFrame and stream the result to Delta.

    Despite their names, the first two arguments are DataFrames, not table names.

    Args:
        stream_table_name: DataFrame on the left side of the join.
        static_table_name: DataFrame on the right side of the join.
        stream_join_column: Join key column in the left DataFrame.
        static_join_column: Join key column in the right DataFrame.
        output_table_name: Path the Delta sink is started on.
        output_checkpoint_path: Checkpoint location for the streaming write.

    Returns:
        The handle returned by ``writeStream.start``; the query is started but
        not awaited here.
    """
    streaming_side = stream_table_name
    lookup_side = static_table_name

    # Left join: every left-side row is kept, matched or not.
    join_condition = streaming_side[stream_join_column] == lookup_side[static_join_column]
    joined = streaming_side.join(lookup_side, join_condition, "left")

    # Output = all left-side columns, then the right-side columns except its
    # join key (so the key appears once when both sides use the same name).
    output_columns = []
    for name in streaming_side.columns:
        output_columns.append(streaming_side[name])
    for name in lookup_side.columns:
        if name != static_join_column:
            output_columns.append(lookup_side[name])
    projected = joined.select(*output_columns)

    # Append to the Delta sink with checkpointing and schema merging enabled.
    writer = projected.writeStream.format("delta").outputMode("append")
    writer = writer.option("checkpointLocation", output_checkpoint_path)
    writer = writer.option("mergeSchema", "true")
    return writer.start(output_table_name)


In [0]:
def order_items_join(order_df: DataFrame, order_items_df: DataFrame) -> DataFrame:
    """Join streaming orders to order items on ``order_id`` and expose the result as a stream.

    Starts the stream-static join into the gold container, makes sure the
    ``eco`` catalog, ``eco.silver`` schema and ``eco.silver.orders_join`` table
    exist, waits on the query with a timeout of 20, and then re-reads the
    written Delta table as a stream.

    NOTE(review): the query is not stopped here; confirm that calling this
    function again while it is still active is intended, since the same
    checkpoint path is reused.

    Args:
        order_df: Orders DataFrame (left side of the join).
        order_items_df: Order-items DataFrame (right side of the join).

    Returns:
        A streaming DataFrame over the joined Delta table.
    """
    join_key = "order_id"
    target_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/orders_items_join/delta_table"
    checkpoint_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/orders_items_join/checkpoint"

    streaming_query = create_stream_static_join(
        order_df, order_items_df, join_key, join_key, target_path, checkpoint_path
    )

    # Register catalog, schema and table, in that order.
    for statement in (
        "create catalog if not exists eco",
        "create schema if not exists eco.silver",
        f"create table if not exists eco.silver.orders_join location '{target_path}'",
    ):
        spark.sql(statement)

    streaming_query.awaitTermination(20)
    return spark.readStream.format("delta").load(target_path)

In [0]:
def order_payments_join(order_df: DataFrame, order_payments) -> DataFrame:
    """Join streaming orders to payments on ``order_id`` and expose the result as a stream.

    Starts the stream-static join into the gold container, makes sure the
    ``eco`` catalog, ``eco.silver`` schema and ``eco.silver.order_payments``
    table exist, waits on the query with a timeout of 20, and then re-reads the
    written Delta table as a stream.

    Args:
        order_df: Orders DataFrame (left side of the join).
        order_payments: Payments DataFrame (right side of the join).

    Returns:
        A streaming DataFrame over the joined Delta table.
    """
    join_key = "order_id"
    target_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/order_payments_join/delta_table"
    checkpoint_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/order_payments_join/checkpoint"

    streaming_query = create_stream_static_join(
        order_df, order_payments, join_key, join_key, target_path, checkpoint_path
    )

    # Register catalog, schema and table, in that order.
    for statement in (
        "create catalog if not exists eco",
        "create schema if not exists eco.silver",
        f"create table if not exists eco.silver.order_payments location '{target_path}'",
    ):
        spark.sql(statement)

    streaming_query.awaitTermination(20)
    return spark.readStream.format("delta").load(target_path)

In [0]:
def order_seller_join(orders:DataFrame,order_items:DataFrame,sellers) -> DataFrame:
    """Enrich the orders/order-items stream with seller data, joined on ``seller_id``.

    First builds the orders/order-items stream via ``order_items_join``, then
    starts a second stream-static join against ``sellers``, registers
    ``eco.silver.order_seller`` on the output location, waits on the query with
    a timeout of 20 and re-reads the written Delta table as a stream.

    Args:
        orders: Orders DataFrame.
        order_items: Order-items DataFrame.
        sellers: Sellers DataFrame (right side of the second join).

    Returns:
        A streaming DataFrame over the joined Delta table.
    """
    join_key = "seller_id"
    target_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/order_seller/delta_table"
    checkpoint_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/order_seller/checkpoint"

    orders_with_items = order_items_join(orders, order_items)
    streaming_query = create_stream_static_join(
        orders_with_items, sellers, join_key, join_key, target_path, checkpoint_path
    )

    spark.sql(f"create table if not exists eco.silver.order_seller location '{target_path}'")
    streaming_query.awaitTermination(20)
    return spark.readStream.format("delta").load(target_path)

In [0]:
def category_revenue_join(orders:DataFrame,order_items:DataFrame,products,product_categories) -> DataFrame:
    """Enrich the orders/order-items stream with product and English category names.

    First builds the orders/order-items stream via ``order_items_join``, then
    joins ``products`` to ``product_categories`` on ``product_category_name``
    to form a product lookup, and finally stream-joins both on ``product_id``.
    Registers ``eco.silver.category_revenue`` on the output location, waits on
    the query with a timeout of 20 and re-reads the written table as a stream.

    NOTE(review): the lookup selects ``products["product_name"]``; confirm that
    this column is present on the DataFrame passed in.

    Args:
        orders: Orders DataFrame.
        order_items: Order-items DataFrame.
        products: Products DataFrame.
        product_categories: Category translation DataFrame.

    Returns:
        A streaming DataFrame over the joined Delta table.
    """
    join_key = "product_id"
    target_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/category_revenue/delta_table"
    checkpoint_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/category_revenue/checkpoint"

    orders_with_items = order_items_join(orders, order_items)

    # Product lookup: id, name and the English category label.
    same_category = products.product_category_name == product_categories.product_category_name
    product_lookup = products.join(product_categories, same_category).select(
        products["product_id"],
        products["product_name"],
        product_categories["product_category_name_english"],
    )

    streaming_query = create_stream_static_join(
        orders_with_items, product_lookup, join_key, join_key, target_path, checkpoint_path
    )

    spark.sql(f"create table if not exists eco.silver.category_revenue location '{target_path}'")
    streaming_query.awaitTermination(20)
    return spark.readStream.format("delta").load(target_path)

In [0]:
# source_path = "abfss://stream-data@ecommerceproject2.dfs.core.windows.net/"
# bronze_table_path = "abfss://bronze@ecommerceproject2.dfs.core.windows.net/orders"
# delta_table_path="abfss://bronze@ecommerceproject2.dfs.core.windows.net/orders/delta_table/"
# silver_table_path = "abfss://silver@ecommerceproject2.dfs.core.windows.net/orders/"

# extractor = Order_items()
# orderitemDf = extractor.extract()
# extractor = Products_Category()
# productcatDf = extractor.extract()
# extractor = Products()
# productDf = extractor.extract()
# extractor = StreamingProcessor(source_path, bronze_table_path)
# orderDf = extractor.start_streaming_job()
# orderDf.awaitTermination(30)

# #Transformation: Preprocessing
# preprocessed_products_df = get_preprocessed_products(productDf)
# preprocessed_product_category_df = get_preprocessed_product_category(productcatDf)
# preprocessed_order_items_df = get_preprocessed_order_items(orderitemDf)
# preprocessed_order_df=IncrementalPreProcessor(delta_table_path, silver_table_path)
# query=preprocessed_order_df.preprocess_stream()
# query.awaitTermination(30)
# preprocessed_order_df=spark.readStream.format("delta").load(silver_table_path)

# #Transformation: Joins
# joined_df = category_revenue_join(preprocessed_order_df,preprocessed_order_items_df,preprocessed_products_df,preprocessed_product_category_df)
# stream_joindf=spark.readStream.format("delta").load("abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/category_revenue/delta_table")

# stream_joindf.display()

In [0]:
def customer_spending_join(orders:DataFrame,order_items:DataFrame,customers:DataFrame) -> DataFrame:
    """Enrich the orders/order-items stream with customer data, joined on ``customer_id``.

    First builds the orders/order-items stream via ``order_items_join``, then
    starts a second stream-static join against ``customers``, registers
    ``eco.silver.customer_spending`` on the output location, waits on the query
    with a timeout of 20 and re-reads the written Delta table as a stream.

    Args:
        orders: Orders DataFrame.
        order_items: Order-items DataFrame.
        customers: Customers DataFrame (right side of the second join).

    Returns:
        A streaming DataFrame over the joined Delta table.
    """
    join_key = "customer_id"
    target_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/customer_spending/delta_table"
    checkpoint_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/customer_spending/checkpoint"

    orders_with_items = order_items_join(orders, order_items)
    streaming_query = create_stream_static_join(
        orders_with_items, customers, join_key, join_key, target_path, checkpoint_path
    )

    spark.sql(f"create table if not exists eco.silver.customer_spending location '{target_path}'")
    streaming_query.awaitTermination(20)
    return spark.readStream.format("delta").load(target_path)

In [0]:
def geographic_revenue_join(orders,order_items,customers,geolocation):
    """Enrich the customer-spending stream with geolocation data by zip-code prefix.

    First builds the customer-spending stream via ``customer_spending_join``,
    then stream-joins it to ``geolocation`` with ``customer_zip_code_prefix``
    matched against ``geolocation_zip_code_prefix``. Registers
    ``eco.silver.geo_revenue`` on the output location, waits on the query with
    a timeout of 20 and re-reads the written Delta table as a stream.

    Args:
        orders: Orders DataFrame.
        order_items: Order-items DataFrame.
        customers: Customers DataFrame.
        geolocation: Geolocation DataFrame (right side of the final join).

    Returns:
        A streaming DataFrame over the joined Delta table.
    """
    target_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/geographic_revenue/delta_table"
    checkpoint_path = "abfss://gold@ecommerceproject2.dfs.core.windows.net/joins/geographic_revenue/checkpoint"

    customer_spending = customer_spending_join(orders, order_items, customers)

    # The two sides name their zip-code column differently.
    streaming_query = create_stream_static_join(
        customer_spending,
        geolocation,
        "customer_zip_code_prefix",
        "geolocation_zip_code_prefix",
        target_path,
        checkpoint_path,
    )

    spark.sql(f"create table if not exists eco.silver.geo_revenue location '{target_path}'")
    streaming_query.awaitTermination(20)
    return spark.readStream.format("delta").load(target_path)