In [0]:
from pyspark.sql.functions import col, round, when



In [0]:
# Load the raw sales records from the bronze schema of the retail_poc catalog.
bronze_sales_table = 'retail_poc.bronze.sales'
sales_df = spark.table(bronze_sales_table)


In [0]:
# Margin % = Net_Margin / Net_Sales * 100, rounded to 2 decimals and stored in
# the "Margin_%" column of sales_df (an existing column of that name is replaced).
#
# Rows whose Net_Sales is 0 are mapped to NULL explicitly: with ANSI mode
# enabled a bare division by zero raises DIVIDE_BY_ZERO, whereas with ANSI mode
# off it already yields NULL. The guard keeps the result identical in both modes.
sales_df = sales_df.withColumn(
    "Margin_%",
    when(
        col("Net_Sales") != 0,
        round((col("Net_Margin") / col("Net_Sales")) * 100, 2),
    ),  # no otherwise(): unmatched rows (zero or NULL Net_Sales) become NULL
)


In [0]:
# Enrich the sales rows with store attributes from
# retail_poc.bronze.store_master using a LEFT JOIN on the store code, so sales
# rows without a matching store are kept (with NULL store_name / region).
store_master_df = spark.table("retail_poc.bronze.store_master")

# The join key has to be part of the lookup projection. Without it, a bare
# col('store_code') in the join condition can only resolve against the sales
# side (column resolution is case-insensitive by default), which turns the
# condition into Store_code == Store_code and pairs every sale with every
# store row instead of matching on the key.
# NOTE(review): assumes store_master exposes a store_code column, as the
# original join condition implies - confirm against the bronze schema.
store_lookup_df = store_master_df.select("store_code", "store_name", "region")

joined_df = sales_df.join(
    store_lookup_df,
    # Reference each side through its own DataFrame so the key is unambiguous.
    sales_df["Store_code"] == store_lookup_df["store_code"],
    how="left",
).drop(store_lookup_df["store_code"])  # keep only the sales-side key column

display(joined_df)

In [0]:
# Store-level silver projection: pick the reporting columns from the joined
# data, renaming Sale_QTY -> Units_sold and Sale_iV -> Total_Sales on the way.
store_summary_columns = [
    sales_df["Store_code"].alias("Store_code"),  # sales-side store key
    "region",
    "Month",
    "Year",
    col("Sale_QTY").alias("Units_sold"),
    "COGS",
    col("Sale_iV").alias("Total_Sales"),
    "Net_Margin",
    "Margin_%",
]
silver_df = joined_df.select(*store_summary_columns)

In [0]:
# Product-level silver projection built directly from sales_df (no store join):
# rename the quantity and margin columns, then keep the product attributes.
product_renames = {
    "Sale_QTY": "Units_Sold",
    "Net_Margin": "Total_Margin",
}

renamed_sales_df = sales_df
for current_name, new_name in product_renames.items():
    renamed_sales_df = renamed_sales_df.withColumnRenamed(current_name, new_name)

silver_product_df = renamed_sales_df.select(
    "EAN",
    "Category",
    "Sub_Category",
    "Gender",
    "Units_Sold",
    "Margin_%",
    "Total_Margin",
)

In [0]:
%python
total_duplicate_silver_df = silver_df.count() - silver_df.dropDuplicates().count()
total_duplicate_product_df = silver_product_df.count() - silver_product_df.dropDuplicates().count()
print(f"total_duplicate_silver_df: {total_duplicate_silver_df}")
print(f"total_duplicate_product_df: {total_duplicate_product_df}")

In [0]:
# Remove fully identical rows from both silver DataFrames before persisting.
deduplicated_silver_df, deduplicated_product_df = (
    frame.dropDuplicates() for frame in (silver_df, silver_product_df)
)

In [0]:
%sql
-- Switch the session's current catalog to retail_poc, then create the silver
-- schema in it if it does not already exist (target of the table writes).
USE CATALOG retail_poc;
CREATE SCHEMA IF NOT EXISTS retail_poc.silver;

In [0]:
# Persist both deduplicated DataFrames as Delta tables in the silver schema,
# replacing any existing table contents (mode "overwrite").
silver_outputs = {
    # store-level sales summary
    "retail_poc.silver.store_sales_summary": deduplicated_silver_df,
    # product-level sales summary
    "retail_poc.silver.product_sale_summary": deduplicated_product_df,
}

for table_name, output_df in silver_outputs.items():
    (
        output_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(table_name)
    )
