In [0]:
from pyspark.sql.functions import *

In [0]:
# Load the raw car-sales data from the bronze container.
# Parquet files embed their own schema, so the CSV/JSON-only "inferSchema"
# option is not passed here (it has no effect on a Parquet read).
bronze_df = (
    spark.read.format("parquet")
    .load("abfss://bronze@swap01storageaccount.dfs.core.windows.net/BmwCarSales/")
)

# One rich preview is enough; a second .show() would just re-run the same scan.
display(bronze_df)

In [0]:
# Derive Model_Category as the text before the first '-' in Model_ID.
model_id_parts = split(col("Model_ID"), '-')
df = bronze_df.withColumn("Model_Category", model_id_parts.getItem(0))
display(df)

In [0]:
# Build a "yyyyMMdd" string from Year / Month / Day (Month and Day are
# left-padded with "0" to two characters) and parse it into a `date` column.
month_2d = lpad(col("Month"), 2, "0")
day_2d = lpad(col("Day"), 2, "0")
ymd_text = concat(col("Year"), month_2d, day_2d)

df = df.withColumn("date", to_date(ymd_text, "yyyyMMdd"))

display(df)

In [0]:
# Revenue earned per unit sold. Rows where the "Units_Sold != 0" check does
# not hold get 0 instead of a division result.
units_sold = col("Units_Sold")
revenue_per_unit = when(units_sold != 0, col("Revenue") / units_sold).otherwise(0)

df = df.withColumn("Revenue_Per_Unit", revenue_per_unit)

In [0]:
# Average selling price per row = Revenue / Units_Sold.
# The division is guarded: with ANSI mode enabled Spark raises a
# divide-by-zero error, while with ANSI mode off it yields NULL. Using
# when() without otherwise() produces NULL for zero-unit rows in both modes,
# so the non-ANSI result is unchanged and later avg() calls skip those rows.
df = df.withColumn(
    "Avg_Selling_Price",
    when(col("Units_Sold") != 0, col("Revenue") / col("Units_Sold")),
)

# A single rich preview; the extra plain-text .show() duplicated the same job.
df.display()

In [0]:
# Revenue and units sold per calendar month (year + month of `date`).
# Sorted so the preview reads chronologically; groupBy alone gives no order.
monthly_sales = (
    df.groupBy(
        year("date").alias("Year"),
        month("date").alias("Month"),
    )
    .sum("Revenue", "Units_Sold")
    .orderBy("Year", "Month")
)

# Show the aggregate that was just computed (the cell previously displayed
# the unaggregated `df`, so the monthly result was never visible).
monthly_sales.display()

In [0]:
# Revenue and unit totals for each (Branch_ID, BranchName) pair.
branch_keys = ["Branch_ID", "BranchName"]
branch_kpi = df.groupBy(*branch_keys).sum("Revenue", "Units_Sold")

In [0]:
import matplotlib.pyplot as plt

# Total revenue per Product_Name, collected to pandas for plotting.
model_df = df.groupBy("Product_Name").sum("Revenue").toPandas()

# Bar chart: one bar per product, x labels tilted 45 degrees.
fig, ax = plt.subplots()
ax.bar(model_df["Product_Name"], model_df["sum(Revenue)"])
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("Revenue by Car Model")
plt.show()

In [0]:
from pyspark.sql.functions import month, year

# Total revenue per calendar month, collected to pandas for plotting.
# - Rows whose `date` is NULL are dropped so Year/Month stay integer columns
#   after toPandas() instead of turning into NaN-bearing floats.
# - The result is sorted by (Year, Month): groupBy output has no guaranteed
#   order and plt.plot connects points in row order.
monthly_df = (
    df.filter(col("date").isNotNull())
    .groupBy(
        year("date").alias("Year"),
        month("date").alias("Month"),
    )
    .sum("Revenue")
    .orderBy("Year", "Month")
    .toPandas()
)

# Plot against a "YYYY-MM" label rather than Month alone, so the same month
# from different years does not collapse onto one x position.
period_labels = (
    monthly_df["Year"].astype(str)
    + "-"
    + monthly_df["Month"].astype(str).str.zfill(2)
)

plt.figure()
plt.plot(period_labels, monthly_df["sum(Revenue)"])
plt.xticks(rotation=45)
plt.title("Monthly Revenue Trend")
plt.show()

In [0]:
# Units sold per BranchName, collected to pandas for plotting.
branch_df = df.groupBy("BranchName").sum("Units_Sold").toPandas()

# Bar chart: one bar per branch, x labels tilted 45 degrees.
fig, ax = plt.subplots()
ax.bar(branch_df["BranchName"], branch_df["sum(Units_Sold)"])
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("Units Sold by Branch")
plt.show()

In [0]:
# Mean of the row-level Avg_Selling_Price for each Product_Name, as pandas.
# NOTE(review): this is an unweighted mean of per-row ratios, not
# total revenue / total units — confirm that matches the chart title.
rpu_df = df.groupBy("Product_Name").avg("Avg_Selling_Price").toPandas()

# Bar chart: one bar per product, x labels tilted 45 degrees.
fig, ax = plt.subplots()
ax.bar(rpu_df["Product_Name"], rpu_df["avg(Avg_Selling_Price)"])
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("Revenue Per Unit by Model")
plt.show()

In [0]:
# Keep only the rows whose Product_Name is exactly "BMW".
bmw_df = df.where(df["Product_Name"] == "BMW")

In [0]:
# Per-branch revenue and unit totals, computed over the BMW-only subset.
bmw_by_branch = bmw_df.groupBy("BranchName")
branch_perf = bmw_by_branch.sum("Revenue", "Units_Sold")

In [0]:
import matplotlib.pyplot as plt

# Collect the per-branch BMW totals to pandas for plotting.
branch_pd = branch_perf.toPandas()

# Bar chart: revenue per branch, x labels tilted 45 degrees.
fig, ax = plt.subplots()
ax.bar(branch_pd["BranchName"], branch_pd["sum(Revenue)"])
ax.tick_params(axis="x", labelrotation=45)
ax.set_title("BMW Revenue by Branch")
plt.show()

In [0]:
# Print branches from highest to lowest total revenue.
branch_perf.orderBy("sum(Revenue)", ascending=False).show()

In [0]:
from pyspark.sql.functions import rank
from pyspark.sql.window import Window

# Rank branches by total revenue, highest first, over a single un-partitioned
# window spanning every row of branch_perf.
revenue_desc = col("sum(Revenue)").desc()
window_spec = Window.orderBy(revenue_desc)

# Append the rank as a new "Rank" column next to the existing columns.
branch_ranked = branch_perf.select("*", rank().over(window_spec).alias("Rank"))

branch_ranked.show()

In [0]:
from pyspark.sql.functions import sum, col

# Units sold per Year, sorted by Year ascending.
yearly_sales = (
    df.groupBy("Year")
    .agg(sum("Units_Sold").alias("Total_Units_Sold"))
    .sort("Year")
)

display(yearly_sales)

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import sum

# Yearly KPIs: total units sold and total revenue per Year, sorted by Year.
total_units = sum("Units_Sold").alias("Total_Units_Sold")
total_revenue = sum("Revenue").alias("Total_Revenue")

yearly_kpi = df.groupBy("Year").agg(total_units, total_revenue).sort("Year")

display(yearly_kpi)

Databricks visualization. Run in Databricks to view.

In [0]:
# Persist the enriched DataFrame to the silver container in Delta format,
# replacing whatever is already stored at the target path.
df.write.save(
    "abfss://silver@swap01storageaccount.dfs.core.windows.net/silver/allcarsales",
    format="delta",
    mode="overwrite",
)

In [0]:
# Also write the same DataFrame as Parquet, one directory per Product_Name,
# replacing any existing data at the target path.
df.write.parquet(
    "abfss://silver@swap01storageaccount.dfs.core.windows.net/carsalesdata",
    mode="overwrite",
    partitionBy="Product_Name",
)