In [0]:
# Read the Superstore CSV from DBFS. The first row supplies the column names
# and Spark samples the data to infer each column's type.
# NOTE(review): the SQL cell in this notebook has to try_cast(quantity); if
# quantity is being inferred as a string, check whether some rows are being
# mis-split (e.g. quoted fields) — TODO confirm against the raw file.
file_path = "dbfs:/FileStore/tables/Vipul/Sample___Superstore.csv"

df = spark.read.csv(file_path, header=True, inferSchema=True)


In [0]:
# Normalise column names: trim surrounding whitespace, lower-case, and replace
# spaces with underscores.
# toDF(*names) renames every column in one call instead of chaining one
# withColumnRenamed per column; the resulting names are identical.
# Only spaces are replaced — any other separator in a header (e.g. a hyphen)
# is left as-is, so such columns still need backtick quoting in SQL.
df = df.toDF(*[col_name.strip().lower().replace(" ", "_") for col_name in df.columns])


In [0]:
# NOTE: importing `round` from pyspark.sql.functions shadows Python's built-in
# round() for every later cell of this notebook; it is kept so that cells
# relying on the Spark version keep working.
from pyspark.sql.functions import round, col, when

# profit_margin = profit / sales, rounded to 2 decimal places.
# The division is guarded so rows with sales == 0 get a NULL margin: under
# ANSI SQL mode an unguarded division by zero raises an error instead of
# returning NULL. `when` without `otherwise` yields NULL for unmatched rows.
# The comparison uses a float literal (0.0) so the check is done as a double.
df = df.withColumn(
    "profit_margin",
    round(when(col("sales") != 0.0, col("profit") / col("sales")), 2),
)


In [0]:
from pyspark.sql.functions import when

# Bucket each row by its discount rate. Branches are tried in order, so a
# discount >= 0.3 is labelled "High" before the >= 0.1 check is reached;
# every remaining row (including a NULL discount) falls through to "Low".
discount_rate = col("discount")
category_expr = (
    when(discount_rate >= 0.3, "High")
    .when(discount_rate >= 0.1, "Medium")
    .otherwise("Low")
)

df = df.withColumn("discount_category", category_expr)


In [0]:
from pyspark.sql.functions import to_date, month, year, concat_ws, date_format

# Parse order_date only while it is still a string, so re-running this cell
# does not try to re-parse a column that has already been converted.
# "M/d/yyyy" accepts both zero-padded (01/08/2016) and unpadded (1/8/2016)
# month/day values, whereas "MM/dd/yyyy" only matches the padded form.
if dict(df.dtypes)["order_date"] == "string":
    df = df.withColumn("order_date", to_date("order_date", "M/d/yyyy"))

# Zero-padded "yyyy-MM" key: it sorts chronologically as text, and it is NULL
# (rather than an empty string) when order_date itself is NULL.
df = df.withColumn("order_month", date_format("order_date", "yyyy-MM"))


In [0]:
# Preview the first five rows, then print the resulting schema.
df.show(n=5)
df.printSchema()

# Expose the DataFrame to SQL cells under the name clean_superstore
# (replaces any existing temp view with that name).
df.createOrReplaceTempView("clean_superstore")


In [0]:
# Spot-check the derived columns next to the inputs they were computed from.
preview_columns = [
    "order_id",
    "sales",
    "profit",
    "profit_margin",
    "discount",
    "discount_category",
    "order_month",
]
df.select(*preview_columns).show(n=10, truncate=False)


In [0]:
%sql
-- Row count per discount bucket, limited to rows whose quantity is at least 5.
-- try_cast returns NULL instead of failing when quantity is not numeric, and a
-- NULL comparison does not pass the WHERE filter, so such rows are excluded.
SELECT
  discount_category,
  COUNT(*) AS cnt
FROM
  clean_superstore
WHERE
  try_cast(quantity AS DOUBLE) >= 5
GROUP BY
  discount_category
ORDER BY
  cnt DESC

Databricks visualization. Run in Databricks to view.