Upload CSV data into Databricks

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName("Week4_ETL").getOrCreate()

In [0]:
orders_df = spark.read.format("csv").option("header", "true").load("/FileStore/tables/ordersdata.csv")
inventory_df = spark.read.format("csv").option("header", "true").load("/FileStore/tables/inventorydata.csv")

print("Data Loaded Successfully")

✅ Data Loaded Successfully


Run a notebook to clean and lter the data

In [0]:
# Remove duplicates
orders_clean = orders_df.dropDuplicates()
inventory_clean = inventory_df.dropDuplicates()

# Handle NULLs – drop rows with null values
orders_clean = orders_clean.na.drop()
inventory_clean = inventory_clean.na.drop()

# Filter only "Completed" orders and a specific date
orders_clean = orders_clean.filter(
    (col("status") == "Completed") & 
    (col("order_date") >= "2024-07-20")
)

print("Data Cleaned Successfully")


✅ Data Cleaned Successfully


Save cleaned output as Delta or CSV

In [0]:
orders_clean.write.format("delta").mode("overwrite").save("/FileStore/tables/ordersdata_clean_delta")
inventory_clean.write.format("delta").mode("overwrite").save("/FileStore/tables/inventorydata_clean_delta")

orders_clean.write.csv("/FileStore/tables/ordersdata_clean_csv", header=True, mode="overwrite")
inventory_clean.write.csv("/FileStore/tables/inventorydata_clean_csv", header=True, mode="overwrite")

print("Cleaned Data Saved")

✅ Cleaned Data Saved


In [0]:
orders_clean.createOrReplaceTempView("orders_clean")
inventory_clean.createOrReplaceTempView("inventory_clean")


Run basic analysis queries using SQL or PySpark

In [0]:
# Total completed orders
spark.sql("SELECT COUNT(*) as total_completed_orders FROM orders_clean").show()

# Orders by customer
spark.sql("""
    SELECT customer_id, COUNT(*) as order_count 
    FROM orders_clean 
    GROUP BY customer_id 
    ORDER BY order_count DESC
""").show()




+----------------------+
|total_completed_orders|
+----------------------+
|                     5|
+----------------------+

+-----------+-----------+
|customer_id|order_count|
+-----------+-----------+
|       C003|          2|
|       C001|          2|
|       C005|          1|
+-----------+-----------+



In [0]:
%sql
SELECT status, AVG(amount) AS avg_amount
FROM orders_clean
GROUP BY status;

status,avg_amount
Completed,380.0


In [0]:
%sql
--  Top 5 customers by spending
SELECT customer_id, SUM(amount) AS total_spent
FROM orders_clean
WHERE status = 'Completed'
GROUP BY customer_id
ORDER BY total_spent DESC
LIMIT 5;

customer_id,total_spent
C001,1000.0
C003,450.0
C005,450.0
