In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window as W

In [2]:
from pyspark.sql.types import DateType,StringType

In [50]:
from pyspark.context import SparkContext

In [3]:
spark = SparkSession.builder.appName("epam").getOrCreate()

In [51]:
sc = SparkContext.getOrCreate()

**GroupByKey**:<br>
groupByKey() operates on Pair RDDs and is used to group all the values related to a given key.<br>
groupByKey() always results in Hash-Partitioned RDDs.

**ReduceByKey**:<br>
ReduceByKey(function) - When called on a dataset of (K, V) pairs, returns a dataset of (K, V) pairs where the values for each key are aggregated using the given reduce function

**USe Case**:<br>
**GroupByKey:** is typically used when we need to group data by key and process all the values associated with each key together. It is common in scenarios like word count or grouping data for further analysis.<br>
**ReduceByKey:** is used when we need to perform aggregations or computations on grouped values based on their keys. It is suitable for scenarios like calculating sum, average, maximum, or minimum values for each key.

In [56]:
rdd = sc.parallelize([(1, 'apple'), (2, 'banana'), (1, 'cherry')])

grouped_rdd = rdd.groupByKey()

# Result: [(1, ['apple', 'cherry']), (2, ['banana'])]
grouped_rdd.collect()

[(1, <pyspark.resultiterable.ResultIterable at 0x1f840e641f0>),
 (2, <pyspark.resultiterable.ResultIterable at 0x1f840e64340>)]

In [57]:
rdd = sc.parallelize([(1, 2), (2, 3), (1, 4)])
summed_rdd = rdd.reduceByKey(lambda x, y: x + y)

# Result: [(1, 6), (2, 3)]
summed_rdd.collect()

[(1, 6), (2, 3)]

### Map VS FlatMap

In [72]:
array1d = sc.parallelize (("1,2,3", "4,5,6", "7,8,9"))  
x =array1d.map(lambda x: x.split(","))
x.collect()

[['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']]

In [73]:
array_flat = sc.parallelize (("1,2,3", "4,5,6", "7,8,9"))  
flat_rdd =array1d.flatMap(lambda x: x.split(","))
flat_rdd.collect()

['1', '2', '3', '4', '5', '6', '7', '8', '9']