## With low-level RDD API

In [None]:
#|output: false
from pyspark import SparkConf, SparkContext

# getOrCreate() returns the already-running context if there is one, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

In [None]:
# Distribute a small in-memory list of (name, age) records as an RDD.
dataRDD = sc.parallelize([
    ('Brooke', 20),
    ('Denny', 31),
    ('Jules', 30),
    ('TD', 35),
    ('Brooke', 25),
])

In [None]:
# Look at just the first record as a quick sanity check of the RDD contents.
dataRDD.first()

('Brooke', 20)

In [None]:
# collect() returns every element of the RDD to the driver as a Python list,
# so only use it when the dataset is small.
dataRDD.collect()

[('Brooke', 20), ('Denny', 31), ('Jules', 30), ('TD', 35), ('Brooke', 25)]

In [None]:
def with_count(record):
    """Turn a (name, age) record into (name, (age, 1)).

    The trailing 1 is a per-record counter, so ages and counts can later be
    summed per name in a single pass.
    """
    return (record[0], (record[1], 1))


mapedRDD = dataRDD.map(with_count)
mapedRDD.collect()

[('Brooke', (20, 1)),
 ('Denny', (31, 1)),
 ('Jules', (30, 1)),
 ('TD', (35, 1)),
 ('Brooke', (25, 1))]

In [None]:
def add_pairs(left, right):
    """Add two (age_sum, count) pairs element-wise."""
    return (left[0] + right[0], left[1] + right[1])


# For each name, accumulate the total age and the number of records.
reducedRDD = mapedRDD.reduceByKey(add_pairs)
reducedRDD.collect()

[('Brooke', (45, 2)), ('Denny', (31, 1)), ('Jules', (30, 1)), ('TD', (35, 1))]

In [None]:
# Divide each name's age total by its record count to get the average age.
# The result gets its own name instead of overwriting `mapedRDD`, so the
# reduceByKey cell above still works if it is re-run after this one.
avgRDD = reducedRDD.mapValues(lambda sum_count: sum_count[0] / sum_count[1])
avgRDD.collect()

[('Brooke', 22.5), ('Denny', 31.0), ('Jules', 30.0), ('TD', 35.0)]

In [None]:
#|output: false
# Stop the SparkContext created at the top of this section.
sc.stop()

## With high-level DSL operators and DataFrame API

In [None]:
#|output: false
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Build (or reuse) the SparkSession, the entry point to the DataFrame API.
spark = (
    SparkSession.builder
    .appName('AvgAges')
    .getOrCreate()
)

In [None]:
# Same (name, age) records as in the RDD example, now with named columns.
people = [('Brooke', 20), ('Denny', 31), ('Jules', 30), ('TD', 35), ('Brooke', 25)]
columns = ['name', 'age']

data_df = spark.createDataFrame(people, schema=columns)
data_df.show()

+------+---+
|  name|age|
+------+---+
|Brooke| 20|
| Denny| 31|
| Jules| 30|
|    TD| 35|
|Brooke| 25|
+------+---+



In [None]:
# Group the rows by name, then average the ages within each group.
ages_by_name = data_df.groupBy('name')
avg_df = ages_by_name.agg(avg('age'))
avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



:::{.callout-note}
            
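The DataFrame version is far more expressive and simpler than the RDD version above: it states *what* to compute (the average age per name) instead of spelling out *how* to compute it with hand-written map and reduce steps.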

:::

In [None]:
#|output: false
# Stop the SparkSession created above and release its resources.
spark.stop()