In [1]:
from pyspark import SparkConf, SparkContext
# Local-mode Spark configuration shared by every exercise in this notebook.
conf = SparkConf().setMaster("local").setAppName("examples_rdd")
# getOrCreate() returns the already-running context when this cell is executed
# again in the same kernel, instead of failing because a context exists.
# `conf` only takes effect when a new context is actually created.
sc = SparkContext.getOrCreate(conf=conf)
sc

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/15 17:45:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## RDD 실습 1. 영화 별점 카운트
- u.data를 열어서 영화 별점별로 몇 개씩 등록되어 있는지 카운팅하기.
- 별점은 인덱스 2번에 들어 있음.
- {1 : 몇개, 2 : 몇개, 3 : 몇개, ...}

In [2]:
# Absolute path to the u.data ratings file loaded by the next cell.
# NOTE(review): machine-specific path, and `filepath` is rebound by later
# exercises; consider a configurable data directory.
filepath = "/home/ubuntu/working/spark-examples/data/u.data"

In [3]:
# Load u.data as an RDD with one string element per line of the file.
lines = sc.textFile(f"file:///{filepath}")
# Preview with take(): it returns only the first 3 elements, whereas
# collect() would ship the whole RDD to the driver before slicing.
lines.take(3)

                                                                                

['196\t242\t3\t881250949', '186\t302\t3\t891717742', '22\t377\t1\t878887116']

In [4]:
# Split each line on whitespace and keep the rating (index 2) as an int.
stars = lines.map(lambda line: int(line.split()[2]))
# take(5) previews the first elements without collecting the full RDD.
stars.take(5)

                                                                                

[3, 3, 1, 2, 1]

In [5]:
# Count how often each rating value occurs; countByValue() returns the
# counts to the driver as a dict-like mapping of value -> count.
result = stars.countByValue()
result

defaultdict(int, {3: 27145, 1: 6110, 2: 11370, 4: 34174, 5: 21201})

In [6]:
from collections import OrderedDict

# Sort the (rating, count) pairs by rating and keep that order in an OrderedDict.
sortedResult = OrderedDict(sorted(result.items()))
sortedResult

OrderedDict([(1, 6110), (2, 11370), (3, 27145), (4, 34174), (5, 21201)])

## RDD 실습 2. 나이대 별로 몇 명의 사람이 있는지 카운팅
- fakefriends.csv를 열어서 각 사람의 나이대를 구한 다음, 
- 각 나이대에 속하는 사람이 몇 명인지 구하기.
- 나이는 인덱스 2번에 들어 있음
- {30 : 몇명, 20: 몇명, ...}

In [7]:
# Path to fakefriends.csv for exercise 2 (rebinds `filepath` from exercise 1).
filepath = "/home/ubuntu/working/spark-examples/data/fakefriends.csv"

# One RDD element per CSV line.
datas = sc.textFile(f"file:///{filepath}")
# take(5) fetches only the preview rows; collect() would pull every row
# to the driver just to display five of them.
datas.take(5)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21']

In [8]:
def parse(row):
    """Map one comma-separated row to the decade of its age column.

    The age is read from index 2 and truncated toward zero to a multiple
    of ten (for example 33 -> 30).

    NOTE(review): a later cell in this notebook defines another function
    that is also named `parse`; running cells out of order can bind the
    wrong one.

    Args:
        row: A comma-separated line whose third field is an integer age.

    Returns:
        The age decade as an int (a multiple of 10).

    Raises:
        IndexError: If the row has fewer than three fields.
        ValueError: If the age field cannot be converted by int().
    """
    years = int(row.split(",")[2])
    # int() on the true-division result drops the fractional part.
    decade_index = int(years / 10)
    return decade_index * 10

In [9]:
# Convert every CSV row into its age decade.
rdd = datas.map(parse)
# Preview via take() so only the first 5 elements reach the driver.
rdd.take(5)

[30, 20, 50, 40, 60]

In [10]:
# Count how many rows fall into each age decade; the mapping of
# decade -> count is returned to the driver.
rdd.countByValue()

defaultdict(int, {30: 97, 20: 93, 50: 94, 40: 102, 60: 95, 10: 19})

## RDD 실습 3. 역사상 최저 온도 찾기
- 1800.csv파일을 열고 각 STATION 별 최저 온도 값을 계산.
- 0 번째 값 : STATION 번호
- 1 번째 값 : 날짜
- 2 번째 값 : 관측 유형 (TMAX : 최고 온도, TMIN : 최저 온도. 데이터에는 PRCP 같은 다른 유형도 섞여 있음)
- 3 번째 값 : 온도

In [12]:
# Path to 1800.csv for exercise 3 (rebinds `filepath` and `datas`).
filepath = "/home/ubuntu/working/spark-examples/data/1800.csv"
datas = sc.textFile(f"file:///{filepath}")
# take(5) previews the first rows without collecting the whole file.
datas.take(5)

['ITE00100554,18000101,TMAX,-75,,,E,',
 'ITE00100554,18000101,TMIN,-148,,,E,',
 'GM000010962,18000101,PRCP,0,,,E,',
 'EZE00100082,18000101,TMAX,-86,,,E,',
 'EZE00100082,18000101,TMIN,-135,,,E,']

In [13]:
# Keep only minimum-temperature observations. The observation type is the
# column at index 2, so compare that column exactly; a substring test on the
# whole line would also accept rows where "TMIN" appears in another column.
# The [2:3] slice yields [] for short rows, so they are dropped, not raised on.
mintems = datas.filter(lambda row: row.split(",")[2:3] == ["TMIN"])
# take(5) previews the first matches without collecting every row.
mintems.take(5)

['ITE00100554,18000101,TMIN,-148,,,E,',
 'EZE00100082,18000101,TMIN,-135,,,E,',
 'ITE00100554,18000102,TMIN,-125,,,E,',
 'EZE00100082,18000102,TMIN,-130,,,E,',
 'ITE00100554,18000103,TMIN,-46,,I,E,']

In [16]:
def parse(row):
    """Extract the (station id, temperature) pair from one CSV row.

    NOTE(review): this shadows the `parse` defined in an earlier cell of
    this notebook for the age exercise; the two are not interchangeable.

    Args:
        row: A comma-separated line with the station id at index 0 and an
            integer temperature at index 3.

    Returns:
        A tuple of (station id as str, temperature as int).

    Raises:
        IndexError: If the row has fewer than four fields.
        ValueError: If the temperature field cannot be converted by int().
    """
    columns = row.split(",")
    return columns[0], int(columns[3])

In [17]:
# Turn each TMIN row into a (station_id, temperature) pair.
parsed_row = mintems.map(parse)
# Preview via take() so only the first 5 pairs reach the driver.
parsed_row.take(5)

[('ITE00100554', -148),
 ('EZE00100082', -135),
 ('ITE00100554', -125),
 ('EZE00100082', -130),
 ('ITE00100554', -46)]

In [19]:
# Reduce the temperatures of each station to their minimum. The builtin
# min() is the two-argument reducer, so no wrapper lambda is needed.
parsed_row.reduceByKey(min).collect()

[('ITE00100554', -148), ('EZE00100082', -135)]

## RDD 실습 4. 책에 나온 단어 개수 세기 (wordcount)
- Book 파일을 열고 split()으로 각 줄을 단어로 나눈 뒤, 단어별 등장 횟수를 센다.
- { 'apple' : 10, 'banana' : 20, ...}

In [20]:
# Path to the Book text file for exercise 4 (rebinds `filepath` and `lines`).
filepath = "/home/ubuntu/working/spark-examples/data/Book"

# One RDD element per line of text.
lines = sc.textFile(f"file:///{filepath}")
# take(5) previews the first lines without collecting the whole book.
lines.take(5)

['Self-Employment: Building an Internet Business of One',
 'Achieving Financial and Personal Freedom through a Lifestyle Technology Business',
 'By Frank Kane',
 '',
 '']

In [21]:
# Split every line on whitespace and flatten the pieces into one RDD of words.
words = lines.flatMap(lambda line: line.split())
# take(10) previews the first words without collecting the full RDD.
words.take(10)

['Self-Employment:',
 'Building',
 'an',
 'Internet',
 'Business',
 'of',
 'One',
 'Achieving',
 'Financial',
 'and']

In [22]:
# Count occurrences of each distinct token; the whole mapping is returned to
# the driver and displayed. Tokens are case- and punctuation-sensitive because
# the words come from a plain whitespace split.
# NOTE(review): the recorded output contains a U+FFFD replacement-character
# token, which suggests the file is not valid UTF-8 — confirm its encoding.
words.countByValue()

defaultdict(int,
            {'Self-Employment:': 1,
             'Building': 5,
             'an': 172,
             'Internet': 13,
             'Business': 19,
             'of': 941,
             'One': 12,
             'Achieving': 1,
             'Financial': 3,
             'and': 901,
             'Personal': 3,
             'Freedom': 7,
             'through': 55,
             'a': 1148,
             'Lifestyle': 5,
             'Technology': 2,
             'By': 9,
             'Frank': 10,
             'Kane': 7,
             'Copyright': 1,
             '�': 174,
             '2015': 3,
             'Kane.': 1,
             'All': 13,
             'rights': 3,
             'reserved': 2,
             'worldwide.': 2,
             'CONTENTS': 1,
             'Disclaimer': 1,
             'Preface': 1,
             'Part': 2,
             'I:': 2,
             'Making': 5,
             'the': 1176,
             'Big': 1,
             'Decision': 1,
             'Overcoming'

In [23]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()