In [1]:
import os

import pyspark
from pyspark import SparkContext

# Point JAVA_HOME at the local Java 8 installation before the context is created.
java8_location = '/usr/lib/jvm/java-8-openjdk-amd64'  # Set your own
os.environ['JAVA_HOME'] = java8_location

# getOrCreate() hands back the already-running context when this cell is re-run,
# whereas calling SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# 35

In [2]:
# Four (key, value) pairs requested as four slices (numSlices=4); key 'a' occurs twice.
pairs = sc.parallelize([('a', 1), ('a', 2), ('b', 3), ('c', 4)], numSlices=4)

In [3]:
# Group the values by key; each group comes back as a ResultIterable, not a list.
pairs.groupByKey().collect()

[('b', <pyspark.resultiterable.ResultIterable at 0x7fa151f06940>),
 ('c', <pyspark.resultiterable.ResultIterable at 0x7fa151f06a20>),
 ('a', <pyspark.resultiterable.ResultIterable at 0x7fa151f06a90>)]

In [7]:
# Take the third collected group, which is the one for key 'a' in this run.
# NOTE(review): this relies on the order collect() happened to return - confirm before reusing.
pairs.groupByKey().collect()[2]

('a', <pyspark.resultiterable.ResultIterable at 0x7fa1518f9128>)

In [8]:
# Second element of the (key, values) tuple at index 2: the ResultIterable with the values.
pairs.groupByKey().collect()[2][1]

<pyspark.resultiterable.ResultIterable at 0x7fa151935080>

In [9]:
# Iterate over the ResultIterable to get the grouped values as a plain list.
[x for x in pairs.groupByKey().collect()[2][1]]

[1, 2]

In [10]:
# Prefer reduceByKey or aggregateByKey when you only need a calculation over the elements (if possible) -> faster

groupByKey relies on a reduce (shuffle) step under the hood -> data can migrate between executors -> it can be slow

# 36

In [11]:
# Small RDD of state codes with repeats, used below to count occurrences per state.
states = sc.parallelize(['TX', 'TX', 'CA', 'TX', 'CA'])

In [13]:
import operator

In [14]:
# Pair every state with 1, then sum the ones per key with operator.add -> count per state.
states.map(lambda x: (x,1)).reduceByKey(operator.add).collect()

[('TX', 3), ('CA', 2)]

The function passed to the reduce operator should be commutative and associative. If you struggle to come up with such a function, you can use aggregateByKey, which takes two functions, so that you can prepare values in one function and use them in the other.

# 37

Three parameters are required for aggregateByKey:

    1) Zero value -> the initial value that the second argument starts from
    2) A function responsible for adding values from the RDD to our zero value
    3) A function that combines results from different partitions

In [2]:
# (1) Zero value: the empty set each key's aggregation starts from
zero_value = set() # initializing a set

In [3]:
# (2)
def seq_op(x, y):
    """Add a single element ``y`` to the accumulator set ``x`` and return that set.

    Spark explicitly allows the first argument to be modified in place (which
    would otherwise be considered bad programming style). Doing so avoids
    reallocating memory for the whole collection every time a value is added.
    """
    accumulator = x  # same object, no copy is made
    accumulator.add(y)
    return accumulator

In [4]:
# (3)
def comb_op(x, y):
    """Combine two sets into a new set holding the elements of both.

    Works for any pair of sets; neither input is modified.
    """
    merged = x.union(y)
    return merged

In [5]:
# Key each number by its parity ('even' / 'odd'); every pair is a two-element list [key, value].
numbers = sc.parallelize([0,0,1,2,5,4,5,5,5]).map(lambda x: ['even' if (x % 2 == 0) else 'odd', x])

In [6]:
# Show the keyed pairs before aggregating.
numbers.collect()

[['even', 0],
 ['even', 0],
 ['odd', 1],
 ['even', 2],
 ['odd', 5],
 ['even', 4],
 ['odd', 5],
 ['odd', 5],
 ['odd', 5]]

In [7]:
# Distinct values per key: seq_op adds each value to the key's set,
# comb_op merges the sets coming from different partitions.
numbers.aggregateByKey(zero_value, seq_op, comb_op).collect()

[('even', {0, 2, 4}), ('odd', {1, 5})]

Another useful example might be calculating averages. 

We initialize with an empty pair, e.g. (0, 0), and add the values to the first element (-> sum) and their count to the second element.

# 38

In [8]:
# Pairs with mixed-case keys for the sorting examples (rebinds the earlier `pairs`).
# NOTE(review): the third pair is a list (['A', 3]) while the others are tuples - presumably a typo; confirm.
pairs = sc.parallelize([('B', 1), ('a', 2), ['A', 3], ('d', 4)])

In [9]:
# Default sort: ascending by key; uppercase keys come before lowercase ones.
pairs.sortByKey().collect()

[('A', 3), ('B', 1), ('a', 2), ('d', 4)]

In [10]:
# Same sort in descending key order.
pairs.sortByKey(ascending=False).collect()

[('d', 4), ('a', 2), ('B', 1), ('A', 3)]

In [11]:
# Sort into a single partition; glom() shows the contents of each partition as a list.
pairs.sortByKey(numPartitions=1).glom().collect()

[[['A', 3], ('B', 1), ('a', 2), ('d', 4)]]

In [12]:
# Sort into three partitions; glom() shows how the sorted pairs are split across them.
pairs.sortByKey(numPartitions=3).glom().collect()

[[('A', 3), ('B', 1)], [('a', 2)], [('d', 4)]]

In [13]:
pairs.sortByKey(keyfunc=lambda x: x.lower()).collect()
# Each key is passed through keyfunc and the result is used for ordering.
# The keys themselves are not changed; only the sorting uses the modified keys.

[('a', 2), ('A', 3), ('B', 1), ('d', 4)]

# 39

Join operates on two pair (key-value) RDDs.

In [14]:
# Left-hand pair RDD for the join examples (keys 1 and 2).
a = sc.parallelize([(1, 'a'), (2, 'a')])

In [15]:
# Right-hand pair RDD (keys 2 and 3); only key 2 is shared with a.
b = sc.parallelize([(2, 'b'), (3, 'b')])

In [16]:
# Inner join: only the key present in both RDDs appears; values are paired as (left, right).
a.join(b).collect()

[(2, ('a', 'b'))]

In [17]:
# Same pairs as b plus a second value for key 2.
c = sc.parallelize([(2, 'b'), (3, 'b'), (2, 'c')])

In [18]:
a.join(c).collect()
# One output row per match -> every combination of left and right values for a key.

[(2, ('a', 'b')), (2, ('a', 'c'))]

In [20]:
# Keep every key of a; a key with no match in b gets None on the right.
a.leftOuterJoin(b).collect()

[(1, ('a', None)), (2, ('a', 'b'))]

In [24]:
# Keep every key of b; a key with no match in a gets None on the left.
a.rightOuterJoin(b).collect()

[(2, ('a', 'b')), (3, (None, 'b'))]

In [23]:
# Keep the keys of both RDDs; whichever side is missing is filled with None.
a.fullOuterJoin(b).collect()

[(1, ('a', None)), (2, ('a', 'b')), (3, (None, 'b'))]

In [25]:
# Whenever we make outer joins we need to ensure that we are handling None values

In [26]:
# All of the Join operations take an optional second argument that specifies number of partitions

When making a Join we potentially move a lot of data across the cluster, which can slow down execution.

Try to see if you can make fewer joins, or whether it is possible to filter down the data prior to the join.

# 40 

CoGroup allows you to combine two RDDs. It differs from join in the way it handles repeated keys.

In [32]:
# Left-hand pair RDD for the cogroup examples (keys 1 and 2).
a = sc.parallelize([(1, 'a'), (2, 'a')])

In [33]:
# Right-hand pair RDD in which key 2 occurs twice.
b = sc.parallelize([(2, 'b'), (2, 'c'), (3, 'd')])

In [34]:
# For comparison: join emits one row per matching (left, right) combination.
a.join(b).collect()

[(2, ('a', 'b')), (2, ('a', 'c'))]

In [35]:
a.cogroup(b).collect()
# One row per key holding a pair of ResultIterables (one per input RDD).
# This is great for efficiency, but the iterables do not print their contents.

[(1,
  (<pyspark.resultiterable.ResultIterable at 0x7f43f51c9898>,
   <pyspark.resultiterable.ResultIterable at 0x7f43f51c9780>)),
 (2,
  (<pyspark.resultiterable.ResultIterable at 0x7f43f51c97f0>,
   <pyspark.resultiterable.ResultIterable at 0x7f43f51c96d8>)),
 (3,
  (<pyspark.resultiterable.ResultIterable at 0x7f43f51c97b8>,
   <pyspark.resultiterable.ResultIterable at 0x7f43f51c9630>))]

In [38]:
# Putting the two iterables in a list is not enough: they still print as ResultIterable objects.
a.cogroup(b).mapValues(lambda x: [x[0], x[1]]).collect()

[(1,
  [<pyspark.resultiterable.ResultIterable at 0x7f43f4b18668>,
   <pyspark.resultiterable.ResultIterable at 0x7f43f4b184e0>]),
 (2,
  [<pyspark.resultiterable.ResultIterable at 0x7f43f4b18b70>,
   <pyspark.resultiterable.ResultIterable at 0x7f43f4b18a20>]),
 (3,
  [<pyspark.resultiterable.ResultIterable at 0x7f43f4b185c0>,
   <pyspark.resultiterable.ResultIterable at 0x7f43f4b18400>])]

In [36]:
# Convert each ResultIterable to a list so the grouped values from both RDDs become visible.
a.cogroup(b).mapValues(lambda x: [list(x[0]), list(x[1])]).collect()

[(1, [['a'], []]), (2, [['a'], ['b', 'c']]), (3, [[], ['d']])]