In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=5e1e382d09834dc70aab95d7e4b0ea06836effb997cc7264cc6714b8b423c85d
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [27]:
# ### **Exercise: Working with Key-Value Pair RDDs in PySpark**
sales_data = [
    ("ProductA", 100),
    ("ProductB", 150),
    ("ProductA", 200),
    ("ProductC", 300),
    ("ProductB", 250),
    ("ProductC", 100)
]
regional_sales_data = [
    ("ProductA", 50),
    ("ProductC", 150)
]

In [28]:
# ### **Step 1: Initialize Spark Context**

# 1. **Initialize SparkSession and SparkContext:**
#    - Create a Spark session in PySpark and use the `spark.sparkContext` to create an RDD from the provided data.

from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("Exercise:03").getOrCreate()

# Get the SparkContext from the SparkSession
sc = spark.sparkContext

In [29]:
# ### **Step 2: Create and Explore the RDD**

# 2. **Task 1: Create an RDD from the Sales Data**
#    - Create an RDD from the `sales_data` list provided above.
#    - Print the first few elements of the RDD.

sales_rdd = sc.parallelize(sales_data)
regional_sales_rdd = sc.parallelize(regional_sales_data)

print(sales_rdd.take(5))
print(regional_sales_rdd.take(5))

[('ProductA', 100), ('ProductB', 150), ('ProductA', 200), ('ProductC', 300), ('ProductB', 250)]
[('ProductA', 50), ('ProductC', 150)]


In [30]:
# ### **Step 3: Grouping and Aggregating Data**

# 3. **Task 2: Group Data by Product Name**
#    - Group the sales data by product name using `groupByKey()`.
#    - Print the grouped data to understand its structure.

grouped_sales = sales_rdd.groupByKey().mapValues(list)
grouped_regional_sales = regional_sales_rdd.groupByKey().mapValues(list)

print(grouped_sales.collect())
print(grouped_regional_sales.collect())

# 4. **Task 3: Calculate Total Sales by Product**
#    - Use `reduceByKey()` to calculate the total sales for each product.
#    - Print the total sales for each product.

total_sales = sales_rdd.reduceByKey(lambda x, y: x + y)
total_regional_sales = regional_sales_rdd.reduceByKey(lambda x, y: x + y)

print(total_sales.collect())
print(total_regional_sales.collect())

# 5. **Task 4: Sort Products by Total Sales**
#    - Sort the products by their total sales in descending order.
#    - Print the sorted list of products along with their sales amounts.

sorted_sales = total_sales.sortBy(lambda x: x[1], ascending=False)
sorted_regional_sales = total_regional_sales.sortBy(lambda x: x[1], ascending=False)

print(sorted_sales.collect())
print(sorted_regional_sales.collect())

[('ProductA', [100, 200]), ('ProductB', [150, 250]), ('ProductC', [300, 100])]
[('ProductA', [50]), ('ProductC', [150])]
[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]
[('ProductA', 50), ('ProductC', 150)]
[('ProductB', 400), ('ProductC', 400), ('ProductA', 300)]
[('ProductC', 150), ('ProductA', 50)]


In [31]:
# ### **Step 4: Additional Transformations**

# 6. **Task 5: Filter Products with High Sales**
#    - Filter the products that have total sales greater than 200.
#    - Print the products that meet this condition.

high_sales_products = total_sales.filter(lambda x: x[1] > 200)
high_regional_sales_products = total_regional_sales.filter(lambda x: x[1] > 200)

print(high_sales_products.collect())
print(high_regional_sales_products.collect())

# 7. **Task 6: Combine Regional Sales Data**
#    - Create another RDD from the `regional_sales_data` list.
#    - Combine this RDD with the original sales RDD using `union()`.
#    - Calculate the new total sales for each product after combining the datasets.
#    - Print the combined sales data.

combined_sales = sales_rdd.union(regional_sales_rdd)
combined_total_sales = combined_sales.reduceByKey(lambda x, y: x + y)

print(combined_sales.collect())
print(combined_total_sales.collect())


[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]
[]
[('ProductA', 100), ('ProductB', 150), ('ProductA', 200), ('ProductC', 300), ('ProductB', 250), ('ProductC', 100), ('ProductA', 50), ('ProductC', 150)]
[('ProductA', 350), ('ProductC', 550), ('ProductB', 400)]


In [32]:
# ### **Step 5: Perform Actions on the RDD**

# 8. **Task 7: Count the Number of Distinct Products**
#    - Count the number of distinct products in the RDD.
#    - Print the count of distinct products.

count_distinct_products = sales_rdd.map(lambda x: x[0]).distinct().count()
count_distinct_regional_products = regional_sales_rdd.map(lambda x: x[0]).distinct().count()

print(count_distinct_products)

# 9. **Task 8: Identify the Product with Maximum Sales**
#    - Find the product with the maximum total sales using `reduce()`.
#    - Print the product name and its total sales amount.

max_sales_product = total_sales.reduce(lambda x, y: x if x[1] > y[1] else y)
max_regional_sales_product = total_regional_sales.reduce(lambda x, y: x if x[1] > y[1] else y)

print(max_sales_product)
print(max_regional_sales_product)

3
('ProductC', 400)
('ProductC', 150)


In [35]:
# ### **Challenge Task: Calculate the Average Sales per Product**

# 10. **Challenge Task:**
#     - Calculate the average sales amount per product using the key-value pair RDD.
#     - Print the average sales for each product.

average_sales_per_product = total_sales.mapValues(lambda x: x / count_distinct_products)
average_regional_sales_per_product = total_regional_sales.mapValues(lambda x: x / count_distinct_regional_products)

print(average_sales_per_product.collect())
print(average_regional_sales_per_product.collect())

[('ProductA', 100.0), ('ProductB', 133.33333333333334), ('ProductC', 133.33333333333334)]
[('ProductA', 25.0), ('ProductC', 75.0)]
