In [3]:
import pyspark
import os
java8_location= '/usr/lib/jvm/java-8-openjdk-amd64' # Set your own
os.environ['JAVA_HOME'] = java8_location

In [4]:
from pyspark import SparkContext
# getOrCreate() returns the already-running context if there is one, so
# re-executing this cell does not fail with
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# 11

In [5]:
numbers = sc.parallelize(range(10))

In [6]:
numbers.map(lambda x: x * 10).collect()

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

In [7]:
# Another approach
def times_ten(x):
    """Return ``x`` multiplied by ten.

    The function deliberately reads nothing except its argument: variables or
    state taken from outside the function can change while the job runs in
    parallel, so not every executor would be guaranteed to see the same values.
    """
    factor = 10
    return x * factor

numbers.map(times_ten).collect()

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

map has a `preservesPartitioning` argument, which can speed up joins

mapPartitions is a way for you to operate on a whole partition at once, which is useful if you want 
to amortize a certain cost across the elements (e.g. you open a database connection 
and test each of them against the database). 

If you just want to see each element once and don't care about sharing stuff across them, use map()

mapPartitions is a way for you to operate on a whole partition at once, which is useful if you want to amortize a certain cost across the elements (e.g. you open a database connection and test each of them against the database). If you just want to see each element once and don't care about sharing stuff across them, use map()

# 12

filter -> sql WHERE clause

In [8]:
numbers = sc.parallelize(range(10))

In [12]:
numbers.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [27]:
def is_even(x):
    """Return True when ``x`` leaves no remainder on division by 2."""
    remainder = x % 2
    return remainder == 0

In [28]:
numbers.filter(is_even).collect()

[0, 2, 4, 6, 8]

Calling coalesce after filtering can be fruitful: it reduces the number of partitions for your data, which minimizes network traffic and reduces overhead time:

numbers = numbers.coalesce(numPartitions=10)

# 13

flatMap: one -> many; map: one -> one

In [32]:
text = sc.textFile('Working Files/04/Audio Standardization Sentences.txt')

In [33]:
text.collect()

['Oak is strong and also gives shade.',
 'Cats and dogs each hate the other.',
 'The pipe began to rust while new.',
 "Open the crate but don't break the glass.",
 'Add the sum to the product of these three.',
 'Thieves who rob friends deserve jail.',
 'The ripe taste of cheese improves with age.',
 'Act on these orders with great speed.',
 'The hog crawled under the high fence.',
 'Move the vat over the hot fire.']

In [35]:
words = text.flatMap(lambda x: x.split(' ')) # one list for all of the sentences

In [36]:
words.collect()

['Oak',
 'is',
 'strong',
 'and',
 'also',
 'gives',
 'shade.',
 'Cats',
 'and',
 'dogs',
 'each',
 'hate',
 'the',
 'other.',
 'The',
 'pipe',
 'began',
 'to',
 'rust',
 'while',
 'new.',
 'Open',
 'the',
 'crate',
 'but',
 "don't",
 'break',
 'the',
 'glass.',
 'Add',
 'the',
 'sum',
 'to',
 'the',
 'product',
 'of',
 'these',
 'three.',
 'Thieves',
 'who',
 'rob',
 'friends',
 'deserve',
 'jail.',
 'The',
 'ripe',
 'taste',
 'of',
 'cheese',
 'improves',
 'with',
 'age.',
 'Act',
 'on',
 'these',
 'orders',
 'with',
 'great',
 'speed.',
 'The',
 'hog',
 'crawled',
 'under',
 'the',
 'high',
 'fence.',
 'Move',
 'the',
 'vat',
 'over',
 'the',
 'hot',
 'fire.']

In [38]:
words.count()

73

In [37]:
text.map(lambda x: x.split(' ')).collect() # one list for each sentence

[['Oak', 'is', 'strong', 'and', 'also', 'gives', 'shade.'],
 ['Cats', 'and', 'dogs', 'each', 'hate', 'the', 'other.'],
 ['The', 'pipe', 'began', 'to', 'rust', 'while', 'new.'],
 ['Open', 'the', 'crate', 'but', "don't", 'break', 'the', 'glass.'],
 ['Add', 'the', 'sum', 'to', 'the', 'product', 'of', 'these', 'three.'],
 ['Thieves', 'who', 'rob', 'friends', 'deserve', 'jail.'],
 ['The', 'ripe', 'taste', 'of', 'cheese', 'improves', 'with', 'age.'],
 ['Act', 'on', 'these', 'orders', 'with', 'great', 'speed.'],
 ['The', 'hog', 'crawled', 'under', 'the', 'high', 'fence.'],
 ['Move', 'the', 'vat', 'over', 'the', 'hot', 'fire.']]

# 14

Partition - chunk of data in RDD

In [39]:
text = sc.textFile('Working Files/04/Audio Standardization Sentences.txt', minPartitions=5)

In [40]:
words = text.flatMap(lambda x: x.split(' '))

In [47]:
def count_words(iterator):
    """Tally the items of one partition.

    This is a generator that yields exactly one dict, mapping each distinct
    item of ``iterator`` to the number of times it occurred. An empty
    iterator yields a single empty dict.
    """
    tally = {}
    for word in iterator:
        # dict.get supplies 0 for a word that has not been seen yet.
        tally[word] = tally.get(word, 0) + 1
    yield tally

The yield statement suspends the function's execution and sends a value back to the caller, but retains enough state to enable the function to resume where it left off. When resumed, the function continues execution immediately after the last yield that ran. This allows its code to produce a series of values over time, rather than computing them all at once and sending them back as a list.

In [48]:
word_counts = words.mapPartitions(count_words)

In [49]:
word_counts.collect()

[{'Oak': 1,
  'is': 1,
  'strong': 1,
  'and': 2,
  'also': 1,
  'gives': 1,
  'shade.': 1,
  'Cats': 1,
  'dogs': 1,
  'each': 1,
  'hate': 1,
  'the': 1,
  'other.': 1,
  'The': 1,
  'pipe': 1,
  'began': 1,
  'to': 1,
  'rust': 1,
  'while': 1,
  'new.': 1},
 {'Open': 1,
  'the': 4,
  'crate': 1,
  'but': 1,
  "don't": 1,
  'break': 1,
  'glass.': 1,
  'Add': 1,
  'sum': 1,
  'to': 1,
  'product': 1,
  'of': 1,
  'these': 1,
  'three.': 1},
 {'Thieves': 1,
  'who': 1,
  'rob': 1,
  'friends': 1,
  'deserve': 1,
  'jail.': 1,
  'The': 1,
  'ripe': 1,
  'taste': 1,
  'of': 1,
  'cheese': 1,
  'improves': 1,
  'with': 1,
  'age.': 1},
 {'Act': 1,
  'on': 1,
  'these': 1,
  'orders': 1,
  'with': 1,
  'great': 1,
  'speed.': 1},
 {'The': 1,
  'hog': 1,
  'crawled': 1,
  'under': 1,
  'the': 3,
  'high': 1,
  'fence.': 1,
  'Move': 1,
  'vat': 1,
  'over': 1,
  'hot': 1,
  'fire.': 1}]

In [None]:
#mapPartitions allows you to perform an expensive operation 
#e.g. connecting to database once for each partition, instead of once for each element:
def faster_lookup(iterator):
    """Yield ``db.lookup(...)`` for every element of ``iterator``.

    The connection is created once, before the loop, and is shared by all
    the lookups made for this iterator.
    """
    # NOTE(review): make_db_connection is not defined in the visible cells;
    # it stands in for whatever opens the real connection.
    connection = make_db_connection()
    for record_id in iterator:
        yield connection.lookup(record_id)
# this iterates through all of the items of iterator, but creating db connection just once per partition

# 15

The only difference between mapPartitionsWithIndex and mapPartitions is that it also provides the partition index (besides the iterator)

In [None]:
def store(index, iterator):
    """Yield ``db.store(..., shard=index)`` for every element of ``iterator``.

    ``index`` is passed through unchanged as the ``shard`` keyword of each
    ``store`` call. The connection is created once, before the loop.
    """
    # NOTE(review): make_db_connection is not defined in the visible cells;
    # it stands in for whatever opens the real connection.
    connection = make_db_connection()
    for record_id in iterator:
        yield connection.store(record_id, shard=index)

# 16

In [50]:
data = sc.parallelize(range(10000))

In [51]:
data.count()

10000

In [53]:
data.sample(withReplacement=False, fraction=0.1, seed=None).count()
# The fraction parameter is the approximate fraction of the dataset that will be returned.

979

# 17

In [54]:
rdd1 = sc.parallelize(range(5))
rdd1.collect()

[0, 1, 2, 3, 4]

In [55]:
rdd2 = sc.parallelize(range(4, 10))
rdd2.collect()

[4, 5, 6, 7, 8, 9]

In [56]:
rdd1.union(rdd2).collect()

[0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9]

# 18

In [57]:
rdd1 = sc.parallelize([1,1,2,3,4,5])
rdd1.collect()

[1, 1, 2, 3, 4, 5]

In [58]:
rdd2 = sc.parallelize([1,1,4,6])
rdd2.collect()

[1, 1, 4, 6]

In [59]:
rdd1.intersection(rdd2).collect()

[1, 4]

In [60]:
# reduce happens under the hood (for duplicate removal) -> can slow down execution

# 19

In [62]:
rdd = sc.parallelize(['a', 'b']).cartesian(sc.parallelize(range(10)))
rdd.collect()

[('a', 0),
 ('a', 1),
 ('a', 2),
 ('a', 3),
 ('a', 4),
 ('a', 5),
 ('a', 6),
 ('a', 7),
 ('a', 8),
 ('a', 9),
 ('b', 0),
 ('b', 1),
 ('b', 2),
 ('b', 3),
 ('b', 4),
 ('b', 5),
 ('b', 6),
 ('b', 7),
 ('b', 8),
 ('b', 9)]

In [63]:
first = rdd.map(lambda x: x[0])
first.collect()

['a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b']

In [66]:
first.distinct().collect()
# distinct has an optional argument "numPartitions"; the larger it is, the greater the parallelism will be
# distinct is a "reduce-type" transformation -> can be slow

['b', 'a']

# 20

In [67]:
#Cartesian product (all possible pairs) in Python:
ice_creams = range(5)
cookies = range(7)
[(a,b) for a in ice_creams for b in cookies]

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (3, 6),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 4),
 (4, 5),
 (4, 6)]

In [None]:
#Cartesian product (all possible pairs) in Spark:

In [68]:
ice_creams = sc.parallelize(range(5))

In [69]:
cookies = sc.parallelize(range(7))

In [70]:
combinations = ice_creams.cartesian(cookies)

In [71]:
combinations.collect()

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (3, 0),
 (4, 0),
 (3, 1),
 (3, 2),
 (4, 1),
 (4, 2),
 (3, 3),
 (3, 4),
 (4, 3),
 (4, 4),
 (3, 5),
 (3, 6),
 (4, 5),
 (4, 6)]

For large data it makes sense to perform cartesian product on data subsets. Consider using join operations or broadcast variables.

# 21

In [72]:
numbers = sc.parallelize(range(11))

In [73]:
# selecting records that contain 1
numbers.pipe('grep 1').collect()

['1', '10']

Pipe passes all of the data as strings

Pipe can be used with any tool, program, language, that can take values from stdIN and write to stdOUT

Input and output happen at the partition level -> no need to worry about CLI overhead, as the command is started just once per partition

In [74]:
rdd = sc.parallelize(['b,b', 'c,c,c', 'a'])
rdd.collect()

['b,b', 'c,c,c', 'a']

In [75]:
# using pipe to capitalize everything
rdd.pipe("tr '[a-z]' '[A-Z]'").collect()

['B,B', 'C,C,C', 'A']

In [76]:
# using pipe as a filter
rdd.pipe('grep a').collect()

['a']

In [77]:
# using pipe as a flatmap: one line in stdIN, multiple lines in stdOUT
rdd.pipe("tr -s ',' '[\n*]'").collect()

['b', 'b', 'c', 'c', 'c', 'a']

# 22

coalesce reduces the number of partitions in an efficient way by combining partitions that are on the same executors

In [78]:
rdd = sc.parallelize(range(10000), numSlices=100)
# rdd with a 100 partitions

In [79]:
rdd2 = rdd.coalesce(10)
# reduced # of partitions from 100 to 10 with minimum of data movement between executors

Number of partitions is an upper bound for parallelism: we can't have 5 processors working on 3 partitions

Spark recommends 2-4 partitions per cpu in the cluster

Too many partitions -> every time we start a computation, there is a lot of task overhead

# 23 

Repartition allows you to set the resulting number of partitions. Similar to coalesce, but we can also grow number of partitions.

In [81]:
numbers = sc.parallelize(range(1000), numSlices=1)

In [82]:
numbers.repartition(100)

MapPartitionsRDD[82] at coalesce at NativeMethodAccessorImpl.java:0

Repartitioning sends data from its current location to a new host executor, which can mean a lot of network traffic. To reduce the number of partitions, use the coalesce method, as it often requires less network traffic.

# 24

Repartitioning and then sorting afterwards is less efficient than repartitionAndSortWithinPartitions

Sorting is applied to key-value pairs, by key

In [86]:
pairs = sc.parallelize([[1,2], [1,1], [2,3], [3,3]])

In [87]:
pairs.repartitionAndSortWithinPartitions(2).glom().collect()

[[(2, 3)], [(1, 2), (1, 1), (3, 3)]]

glom takes all of the elements within a partition and puts them to a list and returns RDD of those lists.

The 2 lists above represent the 2 partitions we requested. Within those partitions, pairs are sorted by key

In [88]:
pairs.repartitionAndSortWithinPartitions(2, partitionFunc=lambda x: x == 1).glom().collect()

[[(2, 3), (3, 3)], [(1, 2), (1, 1)]]

partitionFunc=lambda x: x == 1 -> for each key being partitioned, does it equal 1?

additional params:
    
    -ascending
    
    -keyfunc