In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# 1. Initialize Spark Session
# Using local[*] master to utilize all available cores in the Docker container
print("Initializing Spark Session...")
spark = SparkSession.builder \
    .appName("SimplePySparkTest") \
    .master("local[*]") \
    .getOrCreate()

# 2. Load Data
# Note: Since the volume is mounted, the script reads 'sales_data.csv' from the current directory.
try:
    print("Loading sales_data.csv...")
    df = spark.read.csv("datasets/sales_data.csv", header=True, inferSchema=True)
except Exception as e:
    print(f"Error loading CSV: {e}")
    # Exit cleanly if file not found
    spark.stop()
    exit()

# 3. Simple Transformation (Calculate Total Sale)
# Cast quantity and price to appropriate types for math (in case inferSchema failed)
df = df.withColumn("quantity", col("quantity").cast("integer")) \
       .withColumn("price", col("price").cast("float")) \
       .withColumn("total_sale", col("quantity") * col("price"))

print("\n--- Raw Data with Total Sale ---")
df.show()
df.printSchema()

# 4. Aggregation (Calculate Total Quantity Sold by Store)
summary_df = df.groupBy("store") \
               .sum("quantity") \
               .withColumnRenamed("sum(quantity)", "total_quantity_sold") \
               .orderBy(col("total_quantity_sold").desc())

print("\n--- Summary: Total Quantity Sold per Store ---")
summary_df.show()

# 5. Stop Spark Session
print("\nStopping Spark Session.")
spark.stop()

Initializing Spark Session...
Loading sales_data.csv...

--- Raw Data with Total Sale ---
+-----+-------+--------+-----+----------+
|store|product|quantity|price|total_sale|
+-----+-------+--------+-----+----------+
|    A|  apple|     100|  0.5|      50.0|
|    B| banana|     150| 0.25|      37.5|
|    A| banana|      50| 0.25|      12.5|
|    C|  apple|     200|  0.5|     100.0|
|    B|  grape|      75|  1.0|      75.0|
+-----+-------+--------+-----+----------+

root
 |-- store: string (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- total_sale: float (nullable = true)


--- Summary: Total Quantity Sold per Store ---
+-----+-------------------+
|store|total_quantity_sold|
+-----+-------------------+
|    B|                225|
|    C|                200|
|    A|                150|
+-----+-------------------+


Stopping Spark Session.
