In [1]:
import pyspark
import os
# Point JAVA_HOME at a Java 8 installation before the SparkContext is created
# further down; the path is machine-specific, so adjust it for your environment.
java8_location= '/usr/lib/jvm/java-8-openjdk-amd64' # Set your own
os.environ['JAVA_HOME'] = java8_location

In [2]:
from pyspark import SparkContext
# getOrCreate() reuses the already-running context when this cell is re-run;
# a bare SparkContext() raises if a context is already active in the kernel.
sc = SparkContext.getOrCreate()

# 25

Reduce takes a function that is:

- associative: the grouping of operations (where the parentheses go) doesn't matter, e.g. (a + b) + c == a + (b + c) for addition; subtraction is NOT associative

- commutative: the order of the arguments doesn't matter, e.g. a + b == b + a

In [3]:
numbers = sc.parallelize(range(10), numSlices=3)

In [5]:
numbers.glom().collect()

[[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]

In [6]:
numbers.reduce(max)
# reduce is applied within each partition first (local maxima 2, 5 and 9, computed in parallel),
# and the per-partition results are then combined into the global max

9

In [7]:
numbers.reduce(lambda x,y: x+y)
# generating aggregates

45

In [8]:
# check reduceByKey

# 26

`collect` pulls all the data from an RDD into the driver program as a list.

In [9]:
numbers = sc.parallelize(range(10))

In [11]:
numbers.collect() # can become unmanageable when the RDD holds a huge amount of data

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [12]:
huge_rdd = sc.parallelize(range(100000))

In [15]:
huge_rdd.sample(withReplacement=False, fraction=0.0001, seed=1).collect()

[10539, 20963, 21542, 24138, 30744, 45150, 51789, 55059]

# 27

In [18]:
numbers = sc.parallelize(range(100000), numSlices=10)

In [19]:
numbers.count()

100000

In [20]:
numbers.countApprox(timeout=200, confidence=0.9)
# approximate count: returns whatever has been counted within `timeout` (milliseconds, per the
# PySpark docs), so the result may be incomplete if tasks are still running; here the exact
# count came back, as the output below shows

100000

# 28

In [21]:
numbers = sc.parallelize(range(10))

In [22]:
numbers.first()

0

In [23]:
sc.parallelize([3,2,1]).sortBy(lambda x: x).first()
# here we've used identity key for sorting

1

# 29

In [24]:
numbers = sc.parallelize(range(10))

In [25]:
numbers.take(3)

[0, 1, 2]

In [26]:
numbers.take(1) # list, unlike first (element)

[0]

# 30

In [31]:
numbers = sc.parallelize(range(10))

In [32]:
numbers.takeSample(withReplacement=False, num=3, seed=1)

[6, 8, 9]

In [35]:
numbers.takeSample(withReplacement=False, num=20, seed=1)

[6, 8, 9, 7, 5, 3, 0, 4, 1, 2]

In [36]:
numbers.takeSample(withReplacement=True, num=20, seed=1)

[4, 9, 3, 1, 0, 0, 6, 7, 4, 6, 8, 9, 3, 2, 0, 2, 3, 2, 5, 8]

# 31

In [38]:
numbers = sc.parallelize([1,3,2,5,1,2,8])

In [39]:
numbers.takeOrdered(3)

[1, 1, 2]

In [40]:
numbers.takeOrdered(30)

[1, 1, 2, 2, 3, 5, 8]

In [41]:
numbers.takeOrdered(30, key=lambda x: -x)

[8, 5, 3, 2, 2, 1, 1]

# 32

saveAsTextFile

It is common to call `repartition` before saving to control the number of output files (one part file is written per partition).

In [42]:
numbers = sc.parallelize(range(1000), numSlices=5)

In [43]:
numbers.saveAsTextFile('Working Files/05/32_output.txt')

If a `_SUCCESS` file is present in the output directory, the whole write completed successfully.

In [44]:
numbers.saveAsTextFile('Working Files/05/32_output.gz',
                      compressionCodecClass='org.apache.hadoop.io.compress.GzipCodec')
# this makes output smaller

# 33

In [3]:
pairs = sc.parallelize([('a', 1), ('b', 2), ('b', 3)])

In [4]:
pairs.countByKey() # this completely ignores the values and just counts the occurrences of each key

defaultdict(int, {'a': 1, 'b': 2})

# 34

In [45]:
def add_to_queue(x, queue=[]):
    """Append ``x`` to ``queue`` in place and return that same list.

    The mutable default is intentional: when ``queue`` is omitted, every call
    in this process shares one list, so items accumulate across calls. The
    notebook relies on this to show which process's list actually changes.
    """
    queue.append(x)
    return queue

In [46]:
add_to_queue(3)

[3]

In [47]:
add_to_queue(4)

[3, 4]

In [40]:
numbers = sc.parallelize(range(100))

In [48]:
numbers.foreach(add_to_queue)

In [49]:
add_to_queue(5)

[3, 4, 5]

Question: where are the values 0-99?

Answer: the queue we see lives in the driver program, but `numbers.foreach(add_to_queue)` was executed on the executors, outside the driver.

Each executor got its own copy of the `add_to_queue` function (including its default `queue` list). Once the executors finished adding to their copies, those copies were discarded. By default, Spark doesn't push such changes back to the driver program.

To combine data from the executors we can use either a `reduce` or an `accumulator`.

In [50]:
# Driver-side accumulator starting at 0; executors add to it and the driver
# reads the combined total through `accum.value`.
accum = sc.accumulator(0)


def add_to_queue_accum(x, acc=accum):
    """Add ``x`` to the accumulator ``acc`` and return the accumulator.

    Despite the name, nothing is queued: the values are summed. ``acc``
    defaults to the module-level ``accum`` created just above.
    """
    acc += x  # Accumulator's in-place add; same effect as acc.add(x)
    return acc

In [51]:
numbers.foreach(add_to_queue_accum)

In [52]:
accum.value

4950

In [18]:
broadc = sc.broadcast([1, 2, 3])

In [22]:
broadc

AttributeError: 'Broadcast' object has no attribute 'append'