In [0]:
# Base directories used by the rest of the notebook: the Delta tables are read
# from beneath silver_base and every output is written beneath gold_base.
silver_base: str = "/Volumes/practise/ecommerce/silver"
gold_base: str = "/Volumes/practise/ecommerce/gold"

In [0]:
def _read_silver(table_name):
    """Return the Delta table stored at ``<silver_base>/<table_name>`` as a DataFrame."""
    return spark.read.format("delta").load(f"{silver_base}/{table_name}")


# The four silver tables that the joins and aggregates below are built from.
customers_df = _read_silver("customers")
products_df = _read_silver("products")
orders_df = _read_silver("orders")
sales_df = _read_silver("sales")

In [0]:
from pyspark.sql.functions import col

# Assemble the wide table: every sales row enriched with the matching order,
# customer and product attributes. All three joins are inner joins, so a sales
# row without a matching order, customer or product is dropped.
_order_customer_cols = [
    "order_id", "customer_id", "customer_name", "gender", "age", "home_address",
    "zip_code", "city", "state", "country", "payment", "order_date", "delivery_date",
]
_sales_cols = [
    "sales_id", "price_per_unit", sales_df["product_id"],
    # `quantity` is taken from both sales_df and products_df, so each one is
    # referenced through its own DataFrame and given a distinct alias.
    sales_df["quantity"].alias("sales_quantity"), "total_price",
]
_product_cols = [
    "product_type", "product_name", "size", "colour",
    products_df["price"].alias("product_price"),
    products_df["quantity"].alias("product_stock"), "description",
]

_sales_with_orders = sales_df.join(orders_df, on="order_id", how="inner")
_sales_with_customers = _sales_with_orders.join(customers_df, on="customer_id", how="inner")
# The product key is spelled product_id on sales_df and product_ID on
# products_df, so this join uses an explicit equality condition.
_sales_with_products = _sales_with_customers.join(
    products_df, sales_df["product_id"] == products_df["product_ID"], how="inner"
)

Overall_df = _sales_with_products.select(*_order_customer_cols, *_sales_cols, *_product_cols)


In [0]:
# Persist the joined table as a Delta table under the gold location,
# replacing whatever a previous run wrote there.
(
    Overall_df.write
    .format("delta")
    .mode("overwrite")
    .save(gold_base + "/overall")
)

In [0]:
from pyspark.sql.functions import col, sum, desc

# Top 10 customers by total payment, derived from Overall_df.
#
# Overall_df is built by joining sales rows to their orders, so an order with
# several sales rows carries the same `payment` value on each of them. Summing
# `payment` straight off Overall_df would count such an order once per sales
# row, so the rows are first collapsed to one per distinct
# (order_id, customer_id, customer_name, payment) combination.
# NOTE(review): this assumes `payment` is an order-level amount coming from
# orders_df -- confirm against the silver orders schema.
_order_payments = (
    Overall_df.select("order_id", "customer_id", "customer_name", "payment")
            .distinct()
)

top_customers = (
    _order_payments.groupBy("customer_id", "customer_name")
            # `sum` is the Spark aggregate imported at the top of this cell,
            # not the Python builtin.
            .agg(sum("payment").alias("total_spent"))
            # customer_id breaks ties so the ten selected rows are reproducible
            # when several customers share the same total.
            .orderBy(desc("total_spent"), "customer_id")
            .limit(10)
)
top_customers.write.format("delta").mode("overwrite").save(f"{gold_base}/top_customers")

# Total sales per product_type, computed from Overall_df: `total_price` is
# summed over every row of a product type and the result is sorted with the
# largest total first. `sum` is the Spark aggregate imported at the top of
# this cell, not the Python builtin.
_category_totals = Overall_df.groupBy("product_type").agg(
    sum("total_price").alias("total_sales")
)
sales_by_category = _category_totals.orderBy(desc("total_sales"))

# Overwrite the gold Delta table with the fresh aggregate.
sales_by_category.write.format("delta").mode("overwrite").save(f"{gold_base}/sales_by_category")
