In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=33382f8b763fb2a9cc106baedf706b2eb00e753aa266f72bda5c1f48eb7c805a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("Product Sales Analysis")
    .getOrCreate()
)

# Column names for each table, listed in the same order as the tuple fields below
product_columns = ["ProductID", "ProductName", "Category", "Price"]
sales_columns = ["SaleID", "ProductID", "Quantity"]

# Product rows: (ProductID, ProductName, Category, Price)
products = [
    (1, "Laptop", "Electronics", 50000),
    (2, "Smartphone", "Electronics", 30000),
    (3, "Table", "Furniture", 15000),
    (4, "Chair", "Furniture", 5000),
    (5, "Headphones", "Electronics", 2000),
]

# Sales rows: (SaleID, ProductID, Quantity); ProductID refers to a product row above
sales = [
    (1, 1, 2),
    (2, 2, 1),
    (3, 3, 3),
    (4, 1, 1),
    (5, 4, 5),
    (6, 2, 2),
    (7, 5, 10),
    (8, 3, 1),
]

# Only column names are supplied, so Spark infers each column's type from the values
product_df = spark.createDataFrame(products, schema=product_columns)
sales_df = spark.createDataFrame(sales, schema=sales_columns)

# Display both tables so the inputs to the analysis are visible
print("Products DataFrame:")
product_df.show()

print("Sales DataFrame:")
sales_df.show()


Products DataFrame:
+---------+-----------+-----------+-----+
|ProductID|ProductName|   Category|Price|
+---------+-----------+-----------+-----+
|        1|     Laptop|Electronics|50000|
|        2| Smartphone|Electronics|30000|
|        3|      Table|  Furniture|15000|
|        4|      Chair|  Furniture| 5000|
|        5| Headphones|Electronics| 2000|
+---------+-----------+-----------+-----+

Sales DataFrame:
+------+---------+--------+
|SaleID|ProductID|Quantity|
+------+---------+--------+
|     1|        1|       2|
|     2|        2|       1|
|     3|        3|       3|
|     4|        1|       1|
|     5|        4|       5|
|     6|        2|       2|
|     7|        5|      10|
|     8|        3|       1|
+------+---------+--------+



In [10]:
# Attach product details to every sale via the shared ProductID key.
# Inner join: only ProductIDs present in both tables appear in the result.
joined_df = product_df.join(sales_df, on="ProductID", how="inner")
joined_df.show()


+---------+-----------+-----------+-----+------+--------+
|ProductID|ProductName|   Category|Price|SaleID|Quantity|
+---------+-----------+-----------+-----+------+--------+
|        1|     Laptop|Electronics|50000|     1|       2|
|        1|     Laptop|Electronics|50000|     4|       1|
|        2| Smartphone|Electronics|30000|     2|       1|
|        2| Smartphone|Electronics|30000|     6|       2|
|        3|      Table|  Furniture|15000|     3|       3|
|        3|      Table|  Furniture|15000|     8|       1|
|        4|      Chair|  Furniture| 5000|     5|       5|
|        5| Headphones|Electronics| 2000|     7|      10|
+---------+-----------+-----------+-----+------+--------+



In [11]:
# Value of each sale row: Price multiplied by Quantity
sale_value = col("Price") * col("Quantity")

# Add (or overwrite, if the cell is re-run) the TotalSalesValue column
joined_df = joined_df.withColumn("TotalSalesValue", sale_value)
joined_df.show()


+---------+-----------+-----------+-----+------+--------+---------------+
|ProductID|ProductName|   Category|Price|SaleID|Quantity|TotalSalesValue|
+---------+-----------+-----------+-----+------+--------+---------------+
|        1|     Laptop|Electronics|50000|     1|       2|         100000|
|        1|     Laptop|Electronics|50000|     4|       1|          50000|
|        2| Smartphone|Electronics|30000|     2|       1|          30000|
|        2| Smartphone|Electronics|30000|     6|       2|          60000|
|        3|      Table|  Furniture|15000|     3|       3|          45000|
|        3|      Table|  Furniture|15000|     8|       1|          15000|
|        4|      Chair|  Furniture| 5000|     5|       5|          25000|
|        5| Headphones|Electronics| 2000|     7|      10|          20000|
+---------+-----------+-----------+-----+------+--------+---------------+



In [12]:
# Sum TotalSalesValue within each Category; Spark names the result column
# "sum(TotalSalesValue)", so it is renamed back to a plain TotalSalesValue.
category_totals = joined_df.groupBy("Category").sum("TotalSalesValue")
category_sales_df = category_totals.withColumnRenamed(
    "sum(TotalSalesValue)", "TotalSalesValue"
)

category_sales_df.show()


+-----------+---------------+
|   Category|TotalSalesValue|
+-----------+---------------+
|Electronics|         260000|
|  Furniture|          85000|
+-----------+---------------+



In [13]:
# Total sales value per product name. Grouping is by ProductName, so any
# products that share a name would be combined into a single row.
product_sales_df = joined_df.groupBy("ProductName").agg(
    {"TotalSalesValue": "sum"}
).withColumnRenamed("sum(TotalSalesValue)", "TotalSalesValue")

# Top seller = first row after sorting by total, highest first. ProductName is
# a secondary sort key so that a tie on the total always resolves to the same
# product instead of whichever row Spark happens to return first.
top_selling_product_df = product_sales_df.orderBy(
    col("TotalSalesValue").desc(), col("ProductName").asc()
).limit(1)
top_selling_product_df.show()


+-----------+---------------+
|ProductName|TotalSalesValue|
+-----------+---------------+
|     Laptop|         150000|
+-----------+---------------+



In [14]:
# Rank every product by its total sales value, largest first
sorted_products_df = product_sales_df.orderBy("TotalSalesValue", ascending=False)
sorted_products_df.show()


+-----------+---------------+
|ProductName|TotalSalesValue|
+-----------+---------------+
|     Laptop|         150000|
| Smartphone|          90000|
|      Table|          60000|
|      Chair|          25000|
| Headphones|          20000|
+-----------+---------------+



In [15]:
# Number of sales rows recorded for each ProductID
sales_count_df = (
    sales_df
    .groupBy("ProductID")
    .count()
    .withColumnRenamed("count", "NumberOfSales")
)

# Bring in the product details for each counted ProductID (inner join on the key)
product_sales_count_df = product_df.join(sales_count_df, on="ProductID", how="inner")
product_sales_count_df.show()


+---------+-----------+-----------+-----+-------------+
|ProductID|ProductName|   Category|Price|NumberOfSales|
+---------+-----------+-----------+-----+-------------+
|        1|     Laptop|Electronics|50000|            2|
|        2| Smartphone|Electronics|30000|            2|
|        5| Headphones|Electronics| 2000|            1|
|        3|      Table|  Furniture|15000|            2|
|        4|      Chair|  Furniture| 5000|            1|
+---------+-----------+-----------+-----+-------------+



In [16]:
# Keep only products whose total sales value is strictly above the threshold
min_total_sales = 50000
high_sales_df = product_sales_df.where(col("TotalSalesValue") > min_total_sales)
high_sales_df.show()


+-----------+---------------+
|ProductName|TotalSalesValue|
+-----------+---------------+
|     Laptop|         150000|
|      Table|          60000|
| Smartphone|          90000|
+-----------+---------------+

