In [0]:
# Sample sales transactions as CSV text: one header line followed by ten data
# rows. Every line, including the last one, is terminated by a newline.
csv_data = "\n".join([
    "transaction_id,customer_name,region,product,category,quantity,unit_price,date",
    "1,Rajesh,North,Laptop,Electronics,1,55000,2024-01-12",
    "2,Sneha,West,Refrigerator,Electronics,1,32000,2024-02-05",
    "3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17",
    "4,Divya,North,Mobile,Electronics,2,20000,2024-03-22",
    "5,Vikram,East,Washing Machine,Electronics,1,28000,2024-02-28",
    "6,Preeti,West,Sneakers,Fashion,2,4000,2024-01-31",
    "7,Aman,South,TV,Electronics,1,45000,2024-02-15",
    "8,Isha,North,Notebook,Stationery,10,60,2024-01-10",
    "9,Kunal,East,Pencil,Stationery,20,10,2024-03-05",
    "10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19",
]) + "\n"

# Store the CSV text at the DBFS path, replacing any file already there.
dbutils.fs.put(
    "dbfs:/tmp/sales_transactions.csv",
    csv_data,
    overwrite=True,
)


In [0]:
# Write the same CSV text to /tmp/sales_transactions.csv through Python's
# built-in file API (truncating the file if it already exists).
with open("/tmp/sales_transactions.csv", mode="w") as local_csv:
    local_csv.write(csv_data)

In [0]:
# Load the CSV from DBFS: the first line supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    "dbfs:/tmp/sales_transactions.csv",
    header=True,
    inferSchema=True,
)
df.display()



transaction_id,customer_name,region,product,category,quantity,unit_price,date
1,Rajesh,North,Laptop,Electronics,1,55000,2024-01-12
2,Sneha,West,Refrigerator,Electronics,1,32000,2024-02-05
3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17
4,Divya,North,Mobile,Electronics,2,20000,2024-03-22
5,Vikram,East,Washing Machine,Electronics,1,28000,2024-02-28
6,Preeti,West,Sneakers,Fashion,2,4000,2024-01-31
7,Aman,South,TV,Electronics,1,45000,2024-02-15
8,Isha,North,Notebook,Stationery,10,60,2024-01-10
9,Kunal,East,Pencil,Stationery,20,10,2024-03-05
10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19


In [0]:
# Save the frame as Parquet, replacing whatever is already at this path.
df.write.parquet("dbfs:/tmp/sales_transactions_parquet", mode="overwrite")
df.display()

transaction_id,customer_name,region,product,category,quantity,unit_price,date
1,Rajesh,North,Laptop,Electronics,1,55000,2024-01-12
2,Sneha,West,Refrigerator,Electronics,1,32000,2024-02-05
3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17
4,Divya,North,Mobile,Electronics,2,20000,2024-03-22
5,Vikram,East,Washing Machine,Electronics,1,28000,2024-02-28
6,Preeti,West,Sneakers,Fashion,2,4000,2024-01-31
7,Aman,South,TV,Electronics,1,45000,2024-02-15
8,Isha,North,Notebook,Stationery,10,60,2024-01-10
9,Kunal,East,Pencil,Stationery,20,10,2024-03-05
10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19


In [0]:
# Save the frame in Delta format, replacing whatever is already at this path.
df.write.save(
    "dbfs:/tmp/sales_transactions_delta",
    format="delta",
    mode="overwrite",
)
df.display()

transaction_id,customer_name,region,product,category,quantity,unit_price,date
1,Rajesh,North,Laptop,Electronics,1,55000,2024-01-12
2,Sneha,West,Refrigerator,Electronics,1,32000,2024-02-05
3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17
4,Divya,North,Mobile,Electronics,2,20000,2024-03-22
5,Vikram,East,Washing Machine,Electronics,1,28000,2024-02-28
6,Preeti,West,Sneakers,Fashion,2,4000,2024-01-31
7,Aman,South,TV,Electronics,1,45000,2024-02-15
8,Isha,North,Notebook,Stationery,10,60,2024-01-10
9,Kunal,East,Pencil,Stationery,20,10,2024-03-05
10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19


In [0]:


# Drop any existing `sales_transactions` table first so that the CREATE
# statement below does not fail when this cell is re-run.
spark.sql("DROP TABLE IF EXISTS sales_transactions")

# Register `sales_transactions` as a Delta table whose data lives at the given
# DBFS location; no column list is given in the statement.
spark.sql("""
    CREATE TABLE sales_transactions
    USING DELTA
    LOCATION 'dbfs:/tmp/sales_transactions_delta'
""")


In [0]:
from pyspark.sql.functions import col, month, date_format, when


# Read the Delta data stored at this path (this rebinds `df`).
df = spark.read.load("dbfs:/tmp/sales_transactions_delta", format="delta")

# Derived columns added to every row:
#   total_amount   - quantity * unit_price
#   month          - month number extracted from `date`
#   formatted_date - `date` rendered with the pattern dd-MMM-yyyy
#   is_high_value  - True when total_amount > 30000, otherwise False
df_transformed = (
    df
    .withColumn("total_amount", col("quantity") * col("unit_price"))
    .withColumn("month", month(col("date")))
    .withColumn("formatted_date", date_format(col("date"), "dd-MMM-yyyy"))
    .withColumn(
        "is_high_value",
        when(col("total_amount") > 30000, True).otherwise(False),
    )
)

df_transformed.display()


transaction_id,customer_name,region,product,category,quantity,unit_price,date,total_amount,month,formatted_date,is_high_value
1,Rajesh,North,Laptop,Electronics,1,55000,2024-01-12,55000,1,12-Jan-2024,True
2,Sneha,West,Refrigerator,Electronics,1,32000,2024-02-05,32000,2,05-Feb-2024,True
3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17,750,1,17-Jan-2024,False
4,Divya,North,Mobile,Electronics,2,20000,2024-03-22,40000,3,22-Mar-2024,True
5,Vikram,East,Washing Machine,Electronics,1,28000,2024-02-28,28000,2,28-Feb-2024,False
6,Preeti,West,Sneakers,Fashion,2,4000,2024-01-31,8000,1,31-Jan-2024,False
7,Aman,South,TV,Electronics,1,45000,2024-02-15,45000,2,15-Feb-2024,True
8,Isha,North,Notebook,Stationery,10,60,2024-01-10,600,1,10-Jan-2024,False
9,Kunal,East,Pencil,Stationery,20,10,2024-03-05,200,3,05-Mar-2024,False
10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19,600,3,19-Mar-2024,False


In [0]:


# Count the rows in each region and show the result as `transaction_count`.
(
    df_transformed
    .groupBy("region")
    .count()
    .withColumnRenamed("count", "transaction_count")
    .display()
)


region,transaction_count
West,3
East,2
North,3
South,2


In [0]:
# Sum total_amount per category, then keep the three categories with the
# highest totals (largest first).
(
    df_transformed
    .groupBy("category")
    .agg({"total_amount": "sum"})
    .withColumnRenamed("sum(total_amount)", "total_sales")
    .orderBy("total_sales", ascending=False)
    .limit(3)
    .display()
)


category,total_sales
Electronics,200000
Fashion,8000
Personal Care,1350


In [0]:

# Sum total_amount per month number and list the months in ascending order.
(
    df_transformed
    .groupBy("month")
    .agg({"total_amount": "sum"})
    .withColumnRenamed("sum(total_amount)", "monthly_revenue")
    .sort("month")
    .display()
)


month,monthly_revenue
1,64350
2,105000
3,40800


In [0]:

# Largest total_amount across all rows, pulled back to the driver as a scalar.
max_amount = df_transformed.agg({"total_amount": "max"}).first()[0]

# Show every customer whose transaction total equals that maximum.
(
    df_transformed
    .where(col("total_amount") == max_amount)
    .select("customer_name", "total_amount")
    .display()
)


customer_name,total_amount
Rajesh,55000


In [0]:

# Sum total_amount over rows whose month number is 1, 2 or 3. The filter looks
# at the month only; the year is not part of the condition.
(
    df_transformed
    .where((col("month") >= 1) & (col("month") <= 3))
    .groupBy()
    .sum("total_amount")
    .withColumnRenamed("sum(total_amount)", "Q1_total_sales")
    .display()
)

Q1_total_sales
210150


In [0]:

from delta.tables import DeltaTable

# Handle on the Delta table stored at this path; used for in-place changes.
delta_table = DeltaTable.forPath(spark, "dbfs:/tmp/sales_transactions_delta")

# For rows in the Stationery category, set unit_price to unit_price * 1.10.
delta_table.update(
    "category = 'Stationery'",
    {"unit_price": "unit_price * 1.10"},
)


In [0]:

# Remove every row of the Delta table whose quantity is below 3.
delta_table.delete(condition="quantity < 3")


In [0]:
from datetime import datetime
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Today's date as yyyy-MM-dd text; it is cast to the table's date type below.
today_date = datetime.today().strftime('%Y-%m-%d')

# Column names must match the target Delta table exactly. The previous names
# (`id`, `name`, `price`) do not exist in the table, so Delta's schema
# enforcement rejects the append instead of mapping them onto
# `transaction_id`, `customer_name` and `unit_price`.
schema = StructType([
    StructField("transaction_id", IntegerType(), True),
    StructField("customer_name", StringType(), True),
    StructField("region", StringType(), True),
    StructField("product", StringType(), True),
    StructField("category", StringType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("unit_price", IntegerType(), True),
    StructField("date", StringType(), True)
])

new_row = [(11, "Neha", "West", "Tablet", "Electronics", 1, 25000, today_date)]
new_df = spark.createDataFrame(new_row, schema=schema)

# Align the new row with the table's actual column order and data types (for
# example the string `date` becomes whatever type the table stores), so the
# append does not depend on how the original CSV schema was inferred.
target_schema = spark.read.format("delta").load("dbfs:/tmp/sales_transactions_delta").schema
new_df = new_df.select(
    [col(field.name).cast(field.dataType) for field in target_schema.fields]
)

new_df.write.format("delta").mode("append").save("dbfs:/tmp/sales_transactions_delta")

In [0]:
# Write `df` as a Delta dataset partitioned by region, replacing any previous
# output at this path.
# NOTE(review): `df` is defined outside this cell — confirm it still refers to
# the data you intend to partition.
df.write.save(
    "dbfs:/tmp/sales_transactions_partitioned_by_region",
    format="delta",
    mode="overwrite",
    partitionBy="region",
)

# Re-register the table over that location; the DROP keeps the CREATE from
# failing when the cell is re-run.
spark.sql("DROP TABLE IF EXISTS sales_transactions_by_region")
spark.sql("""
    CREATE TABLE sales_transactions_by_region
    USING DELTA
    LOCATION 'dbfs:/tmp/sales_transactions_partitioned_by_region'
""")
