**Exercise: Working with Key-Value Pair RDDs in PySpark**

**Initialize SparkSession and SparkContext**

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=88eaeabbd672b502020b0029a674f3393d0c42d7ed718b7f15572006a9e9495a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [3]:
from pyspark.sql import SparkSession

# Build the SparkSession for this exercise (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .appName("KeyValuePairRDDs")
    .getOrCreate()
)

# The RDD API is reached through the session's SparkContext
sc = spark.sparkContext


**Create and Explore the RDD**

**Task 1: Create an RDD from the Sales Data**

In [4]:
# Sales records as (product name, sale amount) pairs
sales_data = [
    ("ProductA", 100), ("ProductB", 150), ("ProductA", 200),
    ("ProductC", 300), ("ProductB", 250), ("ProductC", 100),
]

# Turn the in-memory list into a key-value pair RDD
rdd_sales = sc.parallelize(sales_data)

# Preview the first five records of the RDD
print("Sales RDD:", rdd_sales.take(5), sep="\n")


Sales RDD:
[('ProductA', 100), ('ProductB', 150), ('ProductA', 200), ('ProductC', 300), ('ProductB', 250)]


**Grouping and Aggregating Data**

**Task 2: Group Data by Product Name**

In [5]:
# Collect every sale amount under its product name
grouped_rdd = rdd_sales.groupByKey()

# Show each product with the list of its individual sales
print("Grouped Data:")
for name, amounts in grouped_rdd.mapValues(list).collect():
    print(f"{name}: {amounts}")


Grouped Data:
ProductA: [100, 200]
ProductB: [150, 250]
ProductC: [300, 100]


**Task 3: Calculate Total Sales by Product**

In [6]:
# Sum the sale amounts of each product with reduceByKey
total_sales_rdd = rdd_sales.reduceByKey(lambda left, right: left + right)

# Show one "product: total" line per product
print("Total Sales by Product:")
for name, amount in total_sales_rdd.collect():
    print(name, amount, sep=": ")


Total Sales by Product:
ProductA: 300
ProductB: 400
ProductC: 400


**Task 4: Sort Products by Total Sales**

In [7]:
# Sort products by total sales in descending order.
# ProductB and ProductC both total 400, so sorting on the total alone leaves
# their relative order unspecified. The key negates the total (largest first)
# and falls back to the product name so tied products always come out in the
# same, alphabetical order.
sorted_sales_rdd = total_sales_rdd.sortBy(lambda x: (-x[1], x[0]))

# Print sorted list of products and their sales amounts
print("Sorted Products by Total Sales:")
for product, total in sorted_sales_rdd.collect():
    print(f"{product}: {total}")


Sorted Products by Total Sales:
ProductB: 400
ProductC: 400
ProductA: 300


**Additional Transformations**

**Task 5: Filter Products with High Sales**

In [8]:
# Keep only the products whose total sales exceed 200
high_sales_rdd = total_sales_rdd.filter(lambda pair: pair[1] > 200)

# Show every product that passed the filter
print("Products with Sales Greater Than 200:")
for name, amount in high_sales_rdd.collect():
    print(name, amount, sep=": ")


Products with Sales Greater Than 200:
ProductA: 300
ProductB: 400
ProductC: 400


**Task 6: Combine Regional Sales Data**

In [9]:
# Extra (product name, sale amount) records coming from a region
regional_sales_data = [("ProductA", 50), ("ProductC", 150)]
rdd_regional = sc.parallelize(regional_sales_data)

# Append the regional records to the original sales records
combined_rdd = rdd_sales.union(rdd_regional)

# Re-aggregate so each product's total includes the regional sales
combined_total_sales_rdd = combined_rdd.reduceByKey(
    lambda left, right: left + right
)

# Show the updated total for each product
print("Combined Sales Data:")
for name, amount in combined_total_sales_rdd.collect():
    print(name, amount, sep=": ")


Combined Sales Data:
ProductA: 350
ProductC: 550
ProductB: 400


**Perform Actions on the RDD**

**Task 7: Count the Number of Distinct Products**

In [10]:
# Take the product names (the keys), drop duplicates, and count what is left
product_names_rdd = rdd_sales.keys()
distinct_products_count = product_names_rdd.distinct().count()

print("Number of Distinct Products:", distinct_products_count, sep="\n")


Number of Distinct Products:
3


**Task 8: Identify the Product with Maximum Sales**

In [11]:
# Find the product with the maximum total sales.
# ProductB and ProductC are tied at 400; a pairwise reduce with a strict ">"
# returns whichever tied record happens to be combined last, which depends on
# partition order. Comparing on (total, name) makes the choice deterministic:
# among tied totals the alphabetically last product name is returned.
max_sales_product = total_sales_rdd.max(key=lambda x: (x[1], x[0]))

print("Product with Maximum Sales:")
print(f"{max_sales_product[0]}: {max_sales_product[1]}")


Product with Maximum Sales:
ProductC: 400


**Challenge Task: Calculate the Average Sales per Product**

In [12]:
# Pair every sale with a count of 1, then add up (sum, count) per product
sales_count_rdd = (
    rdd_sales
    .mapValues(lambda amount: (amount, 1))
    .reduceByKey(lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1]))
)

# Average = summed sales / number of sales, computed per product
average_sales_rdd = sales_count_rdd.mapValues(
    lambda sum_and_count: sum_and_count[0] / sum_and_count[1]
)

# Show each product's average with two decimal places
print("Average Sales per Product:")
for name, mean_amount in average_sales_rdd.collect():
    print(f"{name}: {mean_amount:.2f}")


Average Sales per Product:
ProductA: 150.00
ProductB: 200.00
ProductC: 200.00
