In [0]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Obtain the Spark session for this notebook (getOrCreate reuses an active one).
spark = SparkSession.builder.appName("RetailStoreAnalysis").getOrCreate()

# --- Customers: (customer_id, name, city, age) -------------------------------
customers_cols = ["customer_id", "name", "city", "age"]
customers_data = [
    (1, "Rahul", "Bangalore", 25),
    (2, "Priya", "Delhi", 32),
    (3, "Aman", "Hyderabad", 29),
    (4, "Sneha", "Chennai", 35),
]
customers_df = spark.createDataFrame(customers_data, customers_cols)

# --- Orders: (order_id, customer_id, product, amount) ------------------------
# Every customer_id used here also appears in customers_data above.
orders_cols = ["order_id", "customer_id", "product", "amount"]
orders_data = [
    (101, 1, "Laptop", 55000),
    (102, 2, "Mobile", 25000),
    (103, 1, "Headphones", 3000),
    (104, 3, "Book", 700),
    (105, 4, "Chair", 5000),
    (106, 2, "Shoes", 2000),
]
orders_df = spark.createDataFrame(orders_data, orders_cols)

# Preview both tables, each under its own label.
print("customers")
customers_df.show()
print("orders")
orders_df.show()

Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.
customers
+-----------+-----+---------+---+
|customer_id| name|     city|age|
+-----------+-----+---------+---+
|          1|Rahul|Bangalore| 25|
|          2|Priya|    Delhi| 32|
|          3| Aman|Hyderabad| 29|
|          4|Sneha|  Chennai| 35|
+-----------+-----+---------+---+

orders
+--------+-----------+----------+------+
|order_id|customer_id|   product|amount|
+--------+-----------+----------+------+
|     101|          1|    Laptop| 55000|
|     102|          2|    Mobile| 25000|
|     103|          1|Headphones|  3000|
|     104|          3|      Book|   700|
|     105|          4|     Chair|  5000|
|     106|          2|     Shoes|  2000|
+--------+-----------+----------+------+



In [0]:
# Customers strictly older than 30 (age 30 itself is excluded).
customers_df.where(customers_df["age"] > 30).show()

+-----------+-----+-------+---+
|customer_id| name|   city|age|
+-----------+-----+-------+---+
|          2|Priya|  Delhi| 32|
|          4|Sneha|Chennai| 35|
+-----------+-----+-------+---+



In [0]:
# Unique city values in the customer table; row order is not guaranteed.
customers_df.select("city").dropDuplicates().show()

+---------+
|     city|
+---------+
|    Delhi|
|  Chennai|
|Bangalore|
|Hyderabad|
+---------+



In [0]:
# Sum of order amounts for each customer_id.
(
    orders_df
    .groupBy("customer_id")
    .agg(F.sum("amount").alias("total_spent"))
    .show()
)

+-----------+-----------+
|customer_id|total_spent|
+-----------+-----------+
|          2|      27000|
|          3|        700|
|          1|      58000|
|          4|       5000|
+-----------+-----------+



In [0]:
# Mean order amount over the whole orders table (a single global group).
orders_df.groupBy().agg(F.mean("amount").alias("avg_order_amount")).show()

+------------------+
|  avg_order_amount|
+------------------+
|15116.666666666666|
+------------------+



In [0]:
# The single order with the largest amount: sort descending, keep the first row.
orders_df.sort(orders_df["amount"].desc()).limit(1).show()

+--------+-----------+-------+------+
|order_id|customer_id|product|amount|
+--------+-----------+-------+------+
|     101|          1| Laptop| 55000|
+--------+-----------+-------+------+



In [0]:
# Attach customer details to each order (inner join on customer_id),
# keeping only the columns needed for display.
(
    customers_df
    .join(orders_df, on="customer_id", how="inner")
    .select("name", "city", "product", "amount")
    .show()
)

+-----+---------+----------+------+
| name|     city|   product|amount|
+-----+---------+----------+------+
|Rahul|Bangalore|    Laptop| 55000|
|Priya|    Delhi|    Mobile| 25000|
|Rahul|Bangalore|Headphones|  3000|
| Aman|Hyderabad|      Book|   700|
|Sneha|  Chennai|     Chair|  5000|
|Priya|    Delhi|     Shoes|  2000|
+-----+---------+----------+------+



In [0]:
# City with the highest total order amount: join orders to customers,
# total per city, sort descending, and print only the first row.
(
    orders_df
    .join(customers_df, on="customer_id", how="inner")
    .groupBy("city")
    .agg(F.sum("amount").alias("total_spent"))
    .sort("total_spent", ascending=False)
    .show(n=1)
)


+---------+-----------+
|     city|total_spent|
+---------+-----------+
|Bangalore|      58000|
+---------+-----------+
only showing top 1 row


In [0]:
# Register both DataFrames as temp views so the SQL below can refer to them by name.
orders_df.createOrReplaceTempView("orders")
customers_df.createOrReplaceTempView("customers")

# Two customers with the highest summed order amount.
top_spenders = spark.sql("""
    SELECT c.customer_id, c.name, SUM(o.amount) AS total_spent
    FROM customers c
    JOIN orders o
      ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name
    ORDER BY total_spent DESC
    LIMIT 2
""")
top_spenders.show()

+-----------+-----+-----------+
|customer_id| name|total_spent|
+-----------+-----+-----------+
|          1|Rahul|      58000|
|          2|Priya|      27000|
+-----------+-----+-----------+



In [0]:
# Round-trip the orders through CSV: write with a header row (replacing any
# previous output at this path), then read back with inferred column types.
output_path = "/content/orders_csv"
orders_df.write.csv(output_path, mode="overwrite", header=True)
spark.read.csv(output_path, header=True, inferSchema=True).show()

+--------+-----------+----------+------+
|order_id|customer_id|   product|amount|
+--------+-----------+----------+------+
|     103|          1|Headphones|  3000|
|     101|          1|    Laptop| 55000|
|     102|          2|    Mobile| 25000|
|     105|          4|     Chair|  5000|
|     106|          2|     Shoes|  2000|
|     104|          3|      Book|   700|
+--------+-----------+----------+------+

