In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=dfef1f178024d1593769167209797ab4ded1d34701c942c6912fca2bc149a96d
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession

# Build the session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("Product Sales Analysis")
    .getOrCreate()
)

# Handle to the SparkContext, used below to create RDDs
sc = spark.sparkContext

Step 2: Create and Explore the RDD

In [3]:
# (product, sale amount) records; each product appears twice
sales_data = [
    ("ProductA", 100), ("ProductB", 150), ("ProductA", 200),
    ("ProductC", 300), ("ProductB", 250), ("ProductC", 100),
]

# Distribute the local list as an RDD
sales_rdd = sc.parallelize(sales_data)

# Peek at the first four records
first_records = sales_rdd.take(4)
print(first_records)


[('ProductA', 100), ('ProductB', 150), ('ProductA', 200), ('ProductC', 300)]


Grouping and Aggregating Data

In [5]:
# Gather all sale amounts under their product key
grouped_sales_rdd = sales_rdd.groupByKey()

# Turn each group's iterable of values into a plain list so it prints readably,
# then bring the (product, [amounts]) pairs back to the driver
grouped_sales = grouped_sales_rdd.mapValues(list).collect()

for record in grouped_sales:
    product, sales = record
    print(f"Product: {product}, Sales: {sales}")



Product: ProductA, Sales: [100, 200]
Product: ProductB, Sales: [150, 250]
Product: ProductC, Sales: [300, 100]


Calculating Total Sales by Product

In [6]:
# Sum the sale amounts per product: reduceByKey merges values that share a key
total_sales_rdd = sales_rdd.reduceByKey(lambda running, amount: running + amount)

product_totals = total_sales_rdd.collect()
print(product_totals)



[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]


Sort Products by Total Sales

In [7]:
# Rank products from highest to lowest total sales.
# The sort key is (-total, name): negating the numeric total gives descending order,
# and the product name breaks ties, so products with equal totals (ProductB and
# ProductC both total 400 here) always come out in the same order.
sorted_sales_rdd = total_sales_rdd.sortBy(lambda x: (-x[1], x[0]))

print(sorted_sales_rdd.collect())


[('ProductB', 400), ('ProductC', 400), ('ProductA', 300)]


Step 4: Additional Transformations

Filter Products with High Sales

In [8]:
# Keep only the (product, total) pairs whose total is strictly above 200
high_sales_rdd = total_sales_rdd.filter(lambda record: record[1] > 200)

high_sales = high_sales_rdd.collect()
print(high_sales)


[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]


Combine Regional Sales Data

In [9]:
# Additional regional sales, in the same (product, amount) shape as sales_data
regional_sales_data = [("ProductA", 50), ("ProductC", 150)]
regional_sales_rdd = sc.parallelize(regional_sales_data)

# Merge the regional records with the original sales records via union
combined_rdd = sales_rdd.union(regional_sales_rdd)

# Re-total per product over the combined records
combined_total_sales_rdd = combined_rdd.reduceByKey(lambda running, amount: running + amount)

print(combined_total_sales_rdd.collect())


[('ProductA', 350), ('ProductC', 550), ('ProductB', 400)]


Step 5: Perform Actions on the RDD

Count the Distinct Products

In [10]:
# How many different products appear in the sales records:
# take the product name (first element) of every record, de-duplicate, then count
product_names_rdd = sales_rdd.map(lambda record: record[0])
distinct_product_count = product_names_rdd.distinct().count()

print(distinct_product_count)


3


Identify the Product with Maximum Sales

In [14]:
def _pick_larger_total(a, b):
    """Return whichever (product, total) pair has the larger total; on a tie, return b."""
    if a[1] > b[1]:
        return a
    return b


# Fold the combined totals pairwise with reduce, keeping the larger pair at each step
max_sales_product = combined_total_sales_rdd.reduce(_pick_larger_total)

print(max_sales_product)


('ProductC', 550)


Calculate the Average Sales per Product

In [13]:

# Average sale amount per product, computed from the original sales_rdd:
# 1) pair every amount with a count of 1
product_count_rdd = sales_rdd.mapValues(lambda amount: (amount, 1))

# 2) add up amounts and counts per product, giving (total, count)
total_and_count_rdd = product_count_rdd.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# 3) divide total by count
average_sales_rdd = total_and_count_rdd.mapValues(lambda total_count: total_count[0] / total_count[1])

print(average_sales_rdd.collect())



[('ProductA', 150.0), ('ProductB', 200.0), ('ProductC', 200.0)]
