# FRIENDS BY AGE Example
- Input data: one comma-separated record per person, with the fields ID, name, age, number of friends.
For example:


- 0,Will,33,385
- 1,Jean-Luc,33,2
- 3,Deanna,40,465
- 4,Quark,68,21

In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Configure a Spark application named 'FriendsByAge' that runs on the
# local machine ('local' master) rather than on a cluster.
conf = SparkConf().setMaster('local').setAppName('FriendsByAge')

# getOrCreate() returns the already-running SparkContext if there is one,
# so re-executing this cell in the same kernel does not fail with
# "Cannot run multiple SparkContexts at once". Note that when a context
# already exists it is reused as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
def parse_line(line):
    """Extract the (age, number of friends) pair from one CSV record.

    The record is split on commas; the third field (index 2) is read as
    the age and the fourth field (index 3) as the friend count.

    Args:
        line: One comma-separated record, e.g. ``"0,Will,33,385"``.

    Returns:
        A tuple ``(age, num_friends)`` of two ints.

    Raises:
        IndexError: If the record has fewer than four fields.
        ValueError: If the age or friend-count field is not an integer.
    """
    columns = line.split(',')
    age = int(columns[2])
    num_friends = int(columns[3])
    return age, num_friends

In [7]:
# Read the CSV file into an RDD holding one string per line.
lines = sc.textFile('fakefriends.csv')

# Apply parse_line to every line (a per-element map transformation, not a
# broadcast), producing an RDD of (age, num_friends) pairs.
rdd = lines.map(parse_line)

# Pair every friend count with a 1 so that both the running total and the
# number of entries can be summed in a single reduce per age.
friends_with_count = rdd.mapValues(lambda friends: (friends, 1))

# Sum component-wise per age: (age, (total_friends, entry_count)).
totals_by_age = friends_with_count.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# Average number of friends per age = total friends / number of entries.
averages_by_age = totals_by_age.mapValues(lambda total: total[0] / total[1])

# collect() is the action: it runs the pipeline above and returns the
# (age, average) pairs to the driver as a list.
results = averages_by_age.collect()

In [8]:
# Print each (age, average) pair next to its position in the collected
# list; the order is whatever collect() returned, not sorted by age.
for position, entry in enumerate(results):
    print(position, " ", entry)

0   (33, 325.3333333333333)
1   (26, 242.05882352941177)
2   (55, 295.53846153846155)
3   (40, 250.8235294117647)
4   (68, 269.6)
5   (59, 220.0)
6   (37, 249.33333333333334)
7   (54, 278.0769230769231)
8   (38, 193.53333333333333)
9   (27, 228.125)
10   (53, 222.85714285714286)
11   (57, 258.8333333333333)
12   (56, 306.6666666666667)
13   (43, 230.57142857142858)
14   (36, 246.6)
15   (22, 206.42857142857142)
16   (35, 211.625)
17   (45, 309.53846153846155)
18   (60, 202.71428571428572)
19   (67, 214.625)
20   (19, 213.27272727272728)
21   (30, 235.8181818181818)
22   (51, 302.14285714285717)
23   (25, 197.45454545454547)
24   (21, 350.875)
25   (42, 303.5)
26   (49, 184.66666666666666)
27   (48, 281.4)
28   (50, 254.6)
29   (39, 169.28571428571428)
30   (32, 207.9090909090909)
31   (58, 116.54545454545455)
32   (64, 281.3333333333333)
33   (31, 267.25)
34   (52, 340.6363636363636)
35   (24, 233.8)
36   (20, 165.0)
37   (62, 220.76923076923077)
38   (41, 268.55555555555554)
39   (44,