## PySpark Demo - (In Jupyter Notebook)

In [1]:
import pyspark

# Reuse the already-running SparkContext when this cell is executed again.
# Constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once", so the cell would
# otherwise fail on any re-run.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

# Distribute a small Python list as an RDD, then bring it back to the driver.
rdd = sc.parallelize([1,2,3,4])
rdd.collect()

[1, 2, 3, 4]

## PySpark Demo - (In Zeppelin)

In [None]:
%pyspark
# Zeppelin paragraph: the %pyspark directive above has to remain the first line.
# NOTE(review): `sc` is not created in this paragraph — presumably it is
# supplied by the Zeppelin Spark interpreter; confirm.

# Distribute a small Python list as an RDD, then bring it back to the driver.
rdd = sc.parallelize([1,2,3,4])
rdd.collect()

## Iterative Programming

In [4]:
# Input values
a = [1, 2, 3, 4, 5, 6, 7, 8]

# Step 1: walk the list one element at a time, skipping odd values early
b = []
for ele in a:
    if ele % 2 != 0:
        continue
    b.append(ele)

# Step 2: total the even values collected above
c = sum(b)
c

20

## Functional Programming

In [9]:
import numpy as np
a = np.array([1, 2, 3, 4, 5, 6, 7, 8])

def f(ele):
    """Return whether ``ele`` is even.

    Works element-wise when ``ele`` is a numpy array, in which case the
    result is a boolean mask of the same shape.
    """
    remainder = ele % 2
    return remainder == 0

# Use the boolean mask produced by f to select the even entries, then add them.
sum(a[f(a)])


20

## PySpark 語法

In [10]:
# Distribute the list across four slices (partitions) of the RDD.
rdd = sc.parallelize([1, 2, 3, 4, 5], numSlices=4)
# Displaying the variable shows the RDD object, not its contents.
rdd

ParallelCollectionRDD[1] at parallelize at PythonRDD.scala:175

In [11]:
# Action: return every element of the RDD to the driver as a Python list.
rdd.collect()

[1, 2, 3, 4, 5]

## Get Data From File

### Zeppelin

In [None]:
%pyspark 
# Zeppelin paragraph: read a text file through an explicit file: URI,
# producing an RDD with one element per line.
lines = sc.textFile('file:/tmp/trump.txt') 
# Peek at the first three lines only instead of collecting the whole file.
lines.take(3)

### Jupyter Notebook

In [12]:
# Read a text file into an RDD with one element per line.
# NOTE(review): the relative path is presumably resolved against the
# notebook's working directory — confirm.
lines = sc.textFile('trump.txt')
# Peek at the first three lines only instead of collecting the whole file.
lines.take(3)

['Chief Justice Roberts, President Carter, President Clinton, President Bush, fellow Americans and people of the world – thank you.',
 'We the citizens of America have now joined a great national effort to rebuild our county and restore its promise for all our people.',
 '']

## Python Lambda

In [14]:
def addNum(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total

# Call the named function. It is not the cell's last expression, so the
# notebook does not display this result.
addNum(2, 3)

# The same operation written as an anonymous function bound to a name.
addNum2 = lambda a, b: a + b
addNum2(3, 4)

7

In [17]:
# Despite its name, `exp` squares its argument (raises it to the power 2).
exp = lambda e: pow(e, 2)
exp(4)

16

## PySpark Transformation

In [18]:
rdd = sc.parallelize([1, 2, 3, 4]) 
# Transformation: describe a new RDD in which every element is doubled.
a   = rdd.map(lambda x: x * 2)
# Displaying `a` shows the RDD object itself; call collect() to see the values.
a

PythonRDD[6] at RDD at PythonRDD.scala:48

In [19]:
# Action: return the elements of `a` to the driver as a Python list.
a.collect()

[2, 4, 6, 8]

In [24]:
rdd = sc.parallelize([1, 2, 3, 4]) 
# map() keeps one output per input, so applying an "is even" test yields a
# boolean for every element rather than removing the odd ones.
a   = rdd.map(lambda x: x % 2 == 0)
a.collect()

[False, True, False, True]

In [25]:
# filter() keeps only the elements for which the test is true (the even ones).
a   = rdd.filter(lambda x: x % 2 == 0)
a.collect()

[2, 4]

In [23]:
import numpy as np
a = np.array([1, 2, 3, 4])
# Boolean-mask indexing: keep only the positions whose remainder mod 2 is zero.
a[np.mod(a, 2) == 0]

array([2, 4])

In [26]:
rdd = sc.parallelize([1, 4, 2,2,3]) 
# Transformation: drop duplicate elements. The collected order need not match
# the input order.
a   = rdd.distinct()
a.collect()

[4, 1, 2, 3]

In [29]:
rdd=sc.parallelize([1,2,3])
# map(): each input becomes exactly one output, here a two-item list
# [x, x + 5], so the collected result is a list of lists.
a = rdd.map(lambda x:[x,x+5])
a.collect()

[[1, 6], [2, 7], [3, 8]]

In [30]:
rdd=sc.parallelize([1,2,3])
# flatMap(): same per-element function as the map() example, but the returned
# lists are flattened into a single sequence of elements.
a = rdd.flatMap(lambda x:[x,x+5])
a.collect()

[1, 6, 2, 7, 3, 8]

## Spark Action

In [31]:
rdd=sc.parallelize([1,2,3]) 
# Action: fold all elements into a single value by multiplying them pairwise.
rdd.reduce(lambda a,b:a*b)

# Worked example (one possible pairing order):
#  1   2   3
#    2          <- 1 * 2
#       6       <- 2 * 3

6

In [32]:
# Action: return only the first two elements to the driver.
rdd.take(2)

[1, 2]

In [33]:
# Action: return every element of the RDD to the driver as a Python list.
rdd.collect()

[1, 2, 3]

In [34]:
rdd = sc.parallelize([5, 3, 1, 2])
# Ordering by the negated value makes takeOrdered return the three largest
# elements, biggest first.
rdd.takeOrdered(3, key=lambda s: -s)

[5, 3, 2]

## Key-Value Pair

In [36]:
rdd = sc.parallelize([(1,2), (3,4), (3,6)]) 
# Merge the values of pairs that share a key by adding them together.
a = rdd.reduceByKey(lambda a, b: a + b) 
a.collect()
# RDD: [(1,2), (3,4), (3,6)] → [(1,2), (3,10)]


[(1, 2), (3, 10)]

In [40]:
rdd2 = sc.parallelize([(1,'a'), (2,'c'), (1,'b')]) 
# Order the (key, value) pairs by their key.
a = rdd2.sortByKey()
a.collect()
# RDD: [(1,'a'), (2,'c'), (1,'b')] → [(1,'a'), (1,'b'), (2,'c')]

[(1, 'a'), (1, 'b'), (2, 'c')]

In [44]:
rdd2 = sc.parallelize([(1,'a'), (2,'c'), (1,'b')])
# groupByKey() returns a new RDD rather than modifying rdd2, so the result
# must be bound to `a`; otherwise collect() would read whatever `a` was left
# over from a previously executed cell.
a = rdd2.groupByKey()
# Each key is paired with an iterable of its values (shown as ResultIterable).
a.collect()

[(1, <pyspark.resultiterable.ResultIterable at 0x7f3ae4e73390>),
 (2, <pyspark.resultiterable.ResultIterable at 0x7f3ae4e73518>)]

## Broadcast and Accumulator

In [45]:
# Wrap a Python list in a broadcast variable created from the SparkContext.
broadcastVar = sc.broadcast([1, 2, 3])

In [47]:
# .value gives back the wrapped Python object.
broadcastVar.value

[1, 2, 3]

In [48]:
# Shared counter starting at zero; its total is read back through .value.
accum = sc.accumulator(0)
rdd = sc.parallelize([1, 2, 3, 4])

def f(x):
    """Add one RDD element to the shared accumulator."""
    accum.add(x)

# foreach runs f on every element purely for its side effect on `accum`.
rdd.foreach(f)
accum.value

10

## 電影分析

In [50]:
# Read the ratings file into an RDD with one element per line.
lines = sc.textFile("u.data") 
# Peek at the first five raw lines; the fields are tab-separated.
lines.take(5)

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596']

In [53]:
# Turn each line into (second field as int, 1) so occurrences can be summed
# per key later.
# NOTE(review): the second field is presumably the movie id — confirm against
# the dataset description.
movies= lines.map(lambda x : (int(x.split()[1]) , 1) ) 
movies.take(3)

[(242, 1), (302, 1), (377, 1)]

In [54]:
# Add up the 1s for each key, giving (key, number of occurrences) pairs.
movieCounts = movies.reduceByKey(lambda x,y: x+ y)

In [55]:
# Peek at three (key, count) pairs.
movieCounts.take(3)

[(242, 117), (302, 297), (346, 126)]

In [56]:
# Sort by the negated count so the largest counts come first.
res = movieCounts.sortBy(lambda a: -a[1])

In [57]:
# The five pairs with the highest counts.
res.take(5)

[(50, 583), (258, 509), (100, 508), (181, 507), (294, 485)]

In [75]:
def loadMovieNames(path='u.item', encoding='utf-8'):
    """Build a movie-id -> title lookup from a pipe-delimited item file.

    Every non-blank line is split on '|'; the first field is parsed as the
    integer movie id and the second field is stored as its title.

    Args:
        path: File to read. Defaults to 'u.item' in the working directory.
        encoding: Text encoding used to open the file. Defaults to 'utf-8'.
            NOTE(review): some distributions of this dataset are reported to
            ship u.item as ISO-8859-1; pass encoding='ISO-8859-1' if utf-8
            decoding fails — confirm for the file in use.

    Returns:
        dict mapping int movie id to title string.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line contains no '|' separator.
    """
    movieNames = {}
    with open(path, 'r', encoding=encoding) as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            # A blank line (e.g. a trailing newline) carries no record and
            # would otherwise crash the int() conversion below.
            if not line.strip():
                continue
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [76]:
# Load the id -> title dictionary on the driver and wrap it in a broadcast
# variable so the lookup can be read through nameDict.value.
nameDict = sc.broadcast(loadMovieNames())

In [77]:
# Sort by the negated count so the largest counts come first.
res = movieCounts.sortBy(lambda a: -a[1])
# Replace each movie id with its title from the broadcast dictionary;
# dict.get() yields None when an id has no entry.
res2 = res.map(lambda e: (nameDict.value.get(e[0]), e[1]))
res2.take(10)

[('Star Wars (1977)', 583),
 ('Contact (1997)', 509),
 ('Fargo (1996)', 508),
 ('Return of the Jedi (1983)', 507),
 ('Liar Liar (1997)', 485),
 ('English Patient, The (1996)', 481),
 ('Scream (1996)', 478),
 ('Toy Story (1995)', 452),
 ('Air Force One (1997)', 431),
 ('Independence Day (ID4) (1996)', 429)]