In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Reuse an already-running SparkContext when this cell is executed again;
# constructing SparkContext('local') a second time raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

# Employee records as comma-separated strings (fields look like id,name,phone).
empList = ['1,kim,000-000-000','2,lee,111-111-111',
           '3,park,222-222-222','4,song,333-333-333',
           '5,han,555-555-555','6,yoon,666-666-666']

# Department records as comma-separated strings (fields look like id,department).
deptList = ['1,sale','2,developer','3,manager',
            '4,sale1','5,developer1','6,manager1']

# Turn the Python lists into RDDs.
empRdd  = sc.parallelize(empList)
deptRdd = sc.parallelize(deptList)

In [3]:
# Bring every element of empRdd back to the driver as a Python list for inspection.
empRdd.collect()

['1,kim,000-000-000',
 '2,lee,111-111-111',
 '3,park,222-222-222',
 '4,song,333-333-333',
 '5,han,555-555-555',
 '6,yoon,666-666-666']

In [4]:
# Bring every element of deptRdd back to the driver as a Python list for inspection.
deptRdd.collect()

['1,sale', '2,developer', '3,manager', '4,sale1', '5,developer1', '6,manager1']

In [5]:
# Split each comma-separated employee record into a list of its fields.
empRdd1 = empRdd.map(
    lambda record: record.split(',')
)
empRdd1.collect()

[['1', 'kim', '000-000-000'],
 ['2', 'lee', '111-111-111'],
 ['3', 'park', '222-222-222'],
 ['4', 'song', '333-333-333'],
 ['5', 'han', '555-555-555'],
 ['6', 'yoon', '666-666-666']]

In [6]:
# Build (key, value) pairs: key is field 0, value is field 1 immediately
# followed by field 2.
# NOTE(review): the two fields are joined with no separator (the original
# concatenated an empty string between them) — confirm whether a space or
# other delimiter was intended.
empRdd2 = empRdd1.map(
    lambda fields: (fields[0], fields[1] + fields[2])
)
empRdd2.collect()

[('1', 'kim000-000-000'),
 ('2', 'lee111-111-111'),
 ('3', 'park222-222-222'),
 ('4', 'song333-333-333'),
 ('5', 'han555-555-555'),
 ('6', 'yoon666-666-666')]

In [7]:
# Split each department record on commas, then keep the first two fields
# as a (key, value) pair so it can be joined by key.
deptRdd1 = (
    deptRdd
    .map(lambda record: record.split(','))
    .map(lambda fields: (fields[0], fields[1]))
)
deptRdd1.collect()

[('1', 'sale'),
 ('2', 'developer'),
 ('3', 'manager'),
 ('4', 'sale1'),
 ('5', 'developer1'),
 ('6', 'manager1')]

In [8]:
# join(): combine the two pair RDDs on their keys, producing
# (key, (value_from_empRdd2, value_from_deptRdd1)) tuples.
# The result is not ordered by key.
joinData = empRdd2.join(deptRdd1)
joinData.collect()

[('1', ('kim000-000-000', 'sale')),
 ('4', ('song333-333-333', 'sale1')),
 ('2', ('lee111-111-111', 'developer')),
 ('3', ('park222-222-222', 'manager')),
 ('5', ('han555-555-555', 'developer1')),
 ('6', ('yoon666-666-666', 'manager1'))]

In [9]:
# Sort the joined pairs by key in ascending order.
# NOTE(review): the keys are strings, so the ordering is lexicographic
# ('10' would sort before '2') — confirm this is acceptable if ids can
# grow beyond a single digit.
sorRdd = joinData.sortByKey()
sorRdd.collect()

[('1', ('kim000-000-000', 'sale')),
 ('2', ('lee111-111-111', 'developer')),
 ('3', ('park222-222-222', 'manager')),
 ('4', ('song333-333-333', 'sale1')),
 ('5', ('han555-555-555', 'developer1')),
 ('6', ('yoon666-666-666', 'manager1'))]

In [10]:
# Return only the first two elements of the sorted RDD as a list.
sorRdd.take(2)

[('1', ('kim000-000-000', 'sale')), ('2', ('lee111-111-111', 'developer'))]

In [11]:
# Return just the first element of the sorted RDD (a single tuple, not a list).
sorRdd.first()

('1', ('kim000-000-000', 'sale'))

In [12]:
# Count how many elements exist for each key; the result is a dict-like
# mapping of key -> count on the driver.
sorRdd.countByKey()

defaultdict(int, {'1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1})

In [13]:
# Load the CSV file as an RDD of raw text lines (one string per line);
# the header line is included as an ordinary element.
weightRdd = sc.textFile('data1/weight.csv')
weightRdd.collect()

['year,height,weight,grade,gender,gradecode,gendercode',
 '2017,152.5,47.9,elementary,man,2,1',
 '2017,153.2,46.6,elementary,woman,2,0',
 '2017,170.6,63.8,middle,man,0,1',
 '2017,160.4,54.2,middle,woman,0,0',
 '2017,173.9,72.3,high,man,1,1',
 '2017,160.9,57.7,high,woman,1,0']

In [14]:
# Split every line into its fields and keep only rows whose first field is
# '2017'. The header row (first field 'year') fails the same test, so it is
# dropped as well.
wRdd = (
    weightRdd
    .map(lambda line: line.split(','))
    .filter(lambda fields: fields[0] == '2017')
)
wRdd.collect()

[['2017', '152.5', '47.9', 'elementary', 'man', '2', '1'],
 ['2017', '153.2', '46.6', 'elementary', 'woman', '2', '0'],
 ['2017', '170.6', '63.8', 'middle', 'man', '0', '1'],
 ['2017', '160.4', '54.2', 'middle', 'woman', '0', '0'],
 ['2017', '173.9', '72.3', 'high', 'man', '1', '1'],
 ['2017', '160.9', '57.7', 'high', 'woman', '1', '0']]

In [15]:
# Sum of the height column (field 1) as floats, rounded to two decimals.
round(
    wRdd.map(lambda fields: float(fields[1])).sum(),
    2,
)

971.5

In [16]:
# Mean of the height column (field 1) as floats, rounded to two decimals.
round(
    wRdd.map(lambda fields: float(fields[1])).mean(),
    2,
)

161.92

In [17]:
# Count rows per grade (field 3): emit a (grade, 1) pair for each row and
# add the ones together for each key.
(
    wRdd
    .map(lambda fields: (fields[3], 1))
    .reduceByKey(lambda running, increment: running + increment)
    .collect()
)

[('elementary', 2), ('middle', 2), ('high', 2)]

In [18]:
# Sort rows by weight (field 2), heaviest first. The key is converted to
# float so the order is numeric; comparing the raw strings is lexicographic
# and would, for example, rank '9.5' above '72.3'.
wRdd.sortBy(lambda fields: float(fields[2]), ascending=False).collect()

[['2017', '173.9', '72.3', 'high', 'man', '1', '1'],
 ['2017', '170.6', '63.8', 'middle', 'man', '0', '1'],
 ['2017', '160.9', '57.7', 'high', 'woman', '1', '0'],
 ['2017', '160.4', '54.2', 'middle', 'woman', '0', '0'],
 ['2017', '152.5', '47.9', 'elementary', 'man', '2', '1'],
 ['2017', '153.2', '46.6', 'elementary', 'woman', '2', '0']]

In [19]:
# Transformation: filter() keeps only the elements that satisfy the predicate.
# Here: rows whose grade (field 3) is 'elementary'.
wRdd.filter(
    lambda fields: fields[3] == 'elementary'
).collect()

[['2017', '152.5', '47.9', 'elementary', 'man', '2', '1'],
 ['2017', '153.2', '46.6', 'elementary', 'woman', '2', '0']]

In [20]:
# Transformation: filter() keeps only the elements that satisfy the predicate.
# Here: rows whose weight (field 2), read as a float, is at least 60.
wRdd.filter(
    lambda fields: float(fields[2]) >= 60
).collect()

[['2017', '170.6', '63.8', 'middle', 'man', '0', '1'],
 ['2017', '173.9', '72.3', 'high', 'man', '1', '1']]

In [21]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()