In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Top Products by Sales") \
    .getOrCreate()

# Load data
file_path = r"C:\Users\CompUser\OneDrive\Documents\TigsitPracticaltest\ecommerce\data.csv"
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Clean and preprocess data
data = data.withColumn("InvoiceDate", col("InvoiceDate").cast("timestamp"))
data = data.withColumn("Quantity", col("Quantity").cast("integer"))
data = data.withColumn("UnitPrice", col("UnitPrice").cast("double"))

# Calculate total sales by product
data = data.withColumn("TotalSales", col("Quantity") * col("UnitPrice"))
product_sales = data.groupBy("Description").agg(sum("TotalSales").alias("TotalSales"))

# Get top 10 products by sales
top_products = product_sales.orderBy(col("TotalSales").desc()).limit(10)

# Show results
top_products.show()

# Stop SparkSession
spark.stop()


+--------------------+------------------+
|         Description|        TotalSales|
+--------------------+------------------+
|      DOTCOM POSTAGE|206245.47999999998|
|REGENCY CAKESTAND...|         164762.19|
|WHITE HANGING HEA...| 99668.47000000013|
|       PARTY BUNTING| 98302.97999999992|
|JUMBO BAG RED RET...| 92356.02999999985|
|  RABBIT NIGHT LIGHT| 66756.58999999988|
|             POSTAGE| 66230.63999999998|
|PAPER CHAIN KIT 5...| 63791.94000000008|
|ASSORTED COLOUR B...|58959.730000000156|
|       CHILLI LIGHTS|          53768.06|
+--------------------+------------------+

