## Simple example of aggregation

## findspark

In [1]:
# Run findspark before any pyspark import in this notebook; findspark.init()
# is expected to make the local Spark installation importable as `pyspark`.
import findspark 
findspark.init() 
# Simple marker so the cell shows visible output once initialisation returns.
print("Done")

Done


## import

In [69]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

## RDD Aggregation Example

### Create SparkContext instance

In [3]:
# Reuse the running SparkContext when this cell is executed again: building
# SparkContext(...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". The master ('local') and the
# application name ('App') are the same values the direct constructor used.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('App')
)

### Create an RDD of tuples (name, age)

In [6]:
# Five (name, age) records; "Brooke" appears twice so the per-name
# aggregation below has something to combine.
dataRDD = sc.parallelize(
    [
        ("Brooke", 20),
        ("Denny", 31),
        ("Jules", 30),
        ("TD", 35),
        ("Brooke", 25),
    ]
)
dataRDD.collect()

[('Brooke', 20), ('Denny', 31), ('Jules', 30), ('TD', 35), ('Brooke', 25)]

### Aggregate and Compute average

In [8]:
# Re-shape every (name, age) record into (name, (age, 1)); the trailing 1 is a
# per-record counter that is summed alongside the age in the next step.
mapRDD = dataRDD.map(
    lambda rec: (rec[0], (rec[1], 1))
)
mapRDD.collect()

[('Brooke', (20, 1)),
 ('Denny', (31, 1)),
 ('Jules', (30, 1)),
 ('TD', (35, 1)),
 ('Brooke', (25, 1))]

In [19]:
# Merge the (age, count) pairs that share a name by adding them element-wise,
# leaving one (name, (age total, record count)) entry per name.
redRDD = mapRDD.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
redRDD.collect()

[('Brooke', (45, 2)), ('Denny', (31, 1)), ('Jules', (30, 1)), ('TD', (35, 1))]

In [21]:
# Turn each (name, (age total, record count)) entry into (name, average age)
# by dividing the total by the count.
avgRDD = redRDD.map(
    lambda entry: (entry[0], entry[1][0] / entry[1][1])
)
avgRDD.collect()

[('Brooke', 22.5), ('Denny', 31.0), ('Jules', 30.0), ('TD', 35.0)]

In [88]:
# The same average-age computation as the step-by-step cells, written as one
# chain and ordered from the highest average to the lowest.
agesRDD = (
    dataRDD
    # (name, age) -> (name, (age, 1))
    .map(lambda rec: (rec[0], (rec[1], 1)))
    # add ages and counters per name -> (name, (age total, record count))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    # (name, (age total, record count)) -> (name, average age)
    .map(lambda entry: (entry[0], entry[1][0] / entry[1][1]))
    # order by the average, largest first
    .sortBy(lambda entry: entry[1], ascending=False)
)
agesRDD.collect()

[('TD', 35.0), ('Denny', 31.0), ('Jules', 30.0), ('Brooke', 22.5)]

In [None]:
# RDDs are ordered with sortBy / sortByKey; there is no `sort` attribute, so
# the bare attribute access failed with AttributeError. Order the raw
# (name, age) records by age, largest first, and display the result.
dataRDD.sortBy(lambda rec: rec[1], ascending=False).collect()

In [37]:
# Display the Python class of the object produced by the chained pipeline.
type(agesRDD)

pyspark.rdd.PipelinedRDD

## DataFrame Aggregation Example

### Create SparkSession instance

In [40]:
# Build the DataFrame entry point (or fetch the existing session via
# getOrCreate), requesting 'Avg' as the application name.
spark = (
    SparkSession.builder
    .appName('Avg')
    .getOrCreate()
)

### Create a DataFrame

In [68]:
# The same five (name, age) records used for the RDD example, loaded as a
# DataFrame with explicit column names.
data_df = spark.createDataFrame(
    [
        ("Brooke", 20),
        ("Denny", 31),
        ("Jules", 30),
        ("TD", 35),
        ("Brooke", 25),
    ],
    ['name', 'age'],
)
data_df.show()

+------+---+
|  name|age|
+------+---+
|Brooke| 20|
| Denny| 31|
| Jules| 30|
|    TD| 35|
|Brooke| 25|
+------+---+



### Aggregate and Compute average

In [76]:
# Group the rows by name, average the ages within each group into an
# 'avg_age' column, and list the groups from highest to lowest average.
avg_df = (
    data_df
    .groupBy('name')
    .agg(F.avg('age').alias('avg_age'))
    .sort(F.desc('avg_age'))
)
avg_df.show()

+------+-------+
|  name|avg_age|
+------+-------+
|    TD|   35.0|
| Denny|   31.0|
| Jules|   30.0|
|Brooke|   22.5|
+------+-------+

