# Part III. Low-Level APIs
*by Bill Chambers and Matei Zaharia* 

*Notebook authored by Tarek Allam Jr.*

## Setup environment

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every later cell (getOrCreate returns an
# already-active session when there is one instead of building a new one).
spark = SparkSession.builder.getOrCreate() 
spark

# Chapter 12. Resilient Distributed Datasets (RDDs)

In [2]:
spark.range(10).rdd

MapPartitionsRDD[5] at javaToPython at NativeMethodAccessorImpl.java:0

In [3]:
spark.range(10).toDF("id").rdd.map(lambda row: row[0])

PythonRDD[12] at RDD at PythonRDD.scala:53

In [4]:
spark.range(10).rdd.toDF()

DataFrame[id: bigint]

In [5]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
words = spark.sparkContext.parallelize(myCollection, 2)

In [6]:
words.setName("myWords")
words.name() # myWords

'myWords'

In [7]:
def startsWithS(individual):
  """Return True when the given word begins with an uppercase "S"."""
  hasPrefix = individual.startswith("S")
  return hasPrefix

In [8]:
# startsWithS already takes a single word, so it is passed as the predicate directly.
words.filter(startsWithS).collect()

['Spark', 'Simple']

In [9]:
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))

In [10]:
words2.filter(lambda record: record[2]).take(5)

[('Spark', 'S', True), ('Simple', 'S', True)]

In [11]:
words.flatMap(lambda word: list(word)).take(5)

['S', 'p', 'a', 'r', 'k']

In [12]:
words.sortBy(lambda word: len(word) * -1).take(2)

['Definitive', 'Processing']

In [13]:
fiftyFiftySplit = words.randomSplit([0.5, 0.5])

In [14]:
spark.sparkContext.parallelize(range(1, 21)).reduce(lambda x, y: x + y) # 210

210

In [52]:
spark.sparkContext.parallelize(list(range(1, 21))).reduce(lambda x, y: x + y) # 210

210

In [53]:
!python --version

Python 3.7.6


In [15]:
def wordLengthReducer(leftWord, rightWord):
  """Pick the longer of two words; when the lengths tie, the right word wins."""
  # Strict ">" means equal-length inputs fall through to rightWord.
  return leftWord if len(leftWord) > len(rightWord) else rightWord

words.reduce(wordLengthReducer)

'Processing'

In [16]:
words.getStorageLevel()

StorageLevel(False, False, False, False, 1)

In [17]:
words.mapPartitions(lambda part: [1]).sum() # 2

2

In [18]:
def indexedFunc(partitionIndex, withinPartIterator):
  """Label every element of a partition with that partition's index.

  Returns a list holding one "partition: <index> => <element>" string per
  element, in iteration order.
  """
  template = "partition: {} => {}"
  labelled = []
  for element in withinPartIterator:
    labelled.append(template.format(partitionIndex, element))
  return labelled
words.mapPartitionsWithIndex(indexedFunc).collect()

['partition: 0 => Spark',
 'partition: 0 => The',
 'partition: 0 => Definitive',
 'partition: 0 => Guide',
 'partition: 0 => :',
 'partition: 1 => Big',
 'partition: 1 => Data',
 'partition: 1 => Processing',
 'partition: 1 => Made',
 'partition: 1 => Simple']

In [19]:
spark.sparkContext.parallelize(["Hello", "World"], 2).glom().collect()
# [['Hello'], ['World']]

[['Hello'], ['World']]

# Chapter 13. Advanced RDDs

In [20]:
sc = spark.sparkContext

In [21]:
myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
words = sc.parallelize(myCollection, 2)

In [22]:
words.map(lambda word: (word.lower(), 1))

PythonRDD[44] at RDD at PythonRDD.scala:53

In [23]:
keyword = words.keyBy(lambda word: word.lower()[0])

In [24]:
keyword.mapValues(lambda word: word.upper()).collect()

[('s', 'SPARK'),
 ('t', 'THE'),
 ('d', 'DEFINITIVE'),
 ('g', 'GUIDE'),
 (':', ':'),
 ('b', 'BIG'),
 ('d', 'DATA'),
 ('p', 'PROCESSING'),
 ('m', 'MADE'),
 ('s', 'SIMPLE')]

In [25]:
keyword.flatMapValues(lambda word: word.upper()).collect()

[('s', 'S'),
 ('s', 'P'),
 ('s', 'A'),
 ('s', 'R'),
 ('s', 'K'),
 ('t', 'T'),
 ('t', 'H'),
 ('t', 'E'),
 ('d', 'D'),
 ('d', 'E'),
 ('d', 'F'),
 ('d', 'I'),
 ('d', 'N'),
 ('d', 'I'),
 ('d', 'T'),
 ('d', 'I'),
 ('d', 'V'),
 ('d', 'E'),
 ('g', 'G'),
 ('g', 'U'),
 ('g', 'I'),
 ('g', 'D'),
 ('g', 'E'),
 (':', ':'),
 ('b', 'B'),
 ('b', 'I'),
 ('b', 'G'),
 ('d', 'D'),
 ('d', 'A'),
 ('d', 'T'),
 ('d', 'A'),
 ('p', 'P'),
 ('p', 'R'),
 ('p', 'O'),
 ('p', 'C'),
 ('p', 'E'),
 ('p', 'S'),
 ('p', 'S'),
 ('p', 'I'),
 ('p', 'N'),
 ('p', 'G'),
 ('m', 'M'),
 ('m', 'A'),
 ('m', 'D'),
 ('m', 'E'),
 ('s', 'S'),
 ('s', 'I'),
 ('s', 'M'),
 ('s', 'P'),
 ('s', 'L'),
 ('s', 'E')]

In [26]:
# Only the last expression of a cell is displayed: the keys are collected here
# but never shown, and the output below is the list of values.
keyword.keys().collect()
keyword.values().collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple']

In [27]:
import random
# Every distinct lower-cased character across all words, collected into a list.
distinctChars = (
  words
  .flatMap(lambda word: list(word.lower()))
  .distinct()
  .collect()
)
# One sampling fraction per character, drawn from random.random().
sampleMap = {char: random.random() for char in distinctChars}
# Key each word by its lower-cased first letter, then sample per key
# (arguments: withReplacement=True, the fraction map, seed 6).
(
  words
  .map(lambda word: (word.lower()[0], word))
  .sampleByKey(True, sampleMap, 6)
  .collect()
)

[('t', 'The'), ('g', 'Guide'), ('m', 'Made')]

In [28]:
# The lambda returns a string and flatMap iterates over it, so the resulting
# RDD holds one element per lower-cased character.
chars = words.flatMap(lambda word: word.lower())
# Pair every character with the count 1 so it can be aggregated by key.
KVcharacters = chars.map(lambda letter: (letter, 1))
def maxFunc(left, right):
  """Return the larger of the two values, as decided by the built-in max."""
  larger = max(left, right)
  return larger
def addFunc(left, right):
  """Return left + right."""
  total = left + right
  return total
nums = spark.sparkContext.parallelize(range(1,31), 5)

In [29]:
KVcharacters.countByKey()

defaultdict(int,
            {'s': 4,
             'p': 3,
             'a': 4,
             'r': 2,
             'k': 1,
             't': 3,
             'h': 1,
             'e': 7,
             'd': 4,
             'f': 1,
             'i': 7,
             'n': 2,
             'v': 1,
             'g': 3,
             'u': 1,
             ':': 1,
             'b': 1,
             'o': 1,
             'c': 1,
             'm': 2,
             'l': 1})

In [30]:
# KVcharacters.groupByKey().map(lambda row: (row[0], reduce(addFunc, row[1])))\
#   .collect()
# Note: the snippet above is written for Python 2; in Python 3, reduce must
# first be imported with "from functools import reduce".

In [31]:
# aggregate(zeroValue, seqOp, combOp): maxFunc reduces within each partition and
# addFunc combines the per-partition results. nums holds 1..30 in 5 partitions,
# so the result is the sum of the partition maxima (6 + 12 + 18 + 24 + 30 = 90),
# not the overall maximum.
nums.aggregate(0, maxFunc, addFunc)

90

In [32]:
depth = 3
nums.treeAggregate(0, maxFunc, addFunc, depth)

90

In [33]:
# aggregateByKey(zeroValue, seqFunc, combFunc): addFunc sums the 1s per key
# within each partition, then maxFunc keeps the largest per-partition sum.
# That is why these numbers can be lower than the countByKey totals above.
KVcharacters.aggregateByKey(0, addFunc, maxFunc).collect()

[('s', 3),
 ('p', 2),
 ('r', 1),
 ('h', 1),
 ('d', 2),
 ('i', 4),
 ('g', 2),
 ('b', 1),
 ('c', 1),
 ('l', 1),
 ('a', 3),
 ('k', 1),
 ('t', 2),
 ('e', 4),
 ('f', 1),
 ('n', 1),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('o', 1),
 ('m', 2)]

In [34]:
def valToCombiner(value):
  """Start a new combiner: a one-element list holding the given value."""
  combiner = [value]
  return combiner
def mergeValuesFunc(vals, valToAppend):
  """Append a value to an existing combiner list, in place, and return that same list."""
  vals.append(valToAppend)
  return vals
def mergeCombinerFunc(vals1, vals2):
  """Concatenate two combiner lists into a new list, leaving both inputs untouched."""
  merged = vals1 + vals2
  return merged
outputPartitions = 6
KVcharacters\
  .combineByKey(
    valToCombiner,
    mergeValuesFunc,
    mergeCombinerFunc,
    outputPartitions)\
  .collect()

[('s', [1, 1, 1, 1]),
 ('d', [1, 1, 1, 1]),
 ('l', [1]),
 ('v', [1]),
 (':', [1]),
 ('p', [1, 1, 1]),
 ('r', [1, 1]),
 ('c', [1]),
 ('k', [1]),
 ('t', [1, 1, 1]),
 ('n', [1, 1]),
 ('u', [1]),
 ('o', [1]),
 ('h', [1]),
 ('i', [1, 1, 1, 1, 1, 1, 1]),
 ('g', [1, 1, 1]),
 ('b', [1]),
 ('a', [1, 1, 1, 1]),
 ('e', [1, 1, 1, 1, 1, 1, 1]),
 ('f', [1]),
 ('m', [1, 1])]

In [35]:
KVcharacters.foldByKey(0, addFunc).collect()

[('s', 4),
 ('p', 3),
 ('r', 2),
 ('h', 1),
 ('d', 4),
 ('i', 7),
 ('g', 3),
 ('b', 1),
 ('c', 1),
 ('l', 1),
 ('a', 4),
 ('k', 1),
 ('t', 3),
 ('e', 7),
 ('f', 1),
 ('n', 2),
 ('v', 1),
 ('u', 1),
 (':', 1),
 ('o', 1),
 ('m', 2)]

In [36]:
import random
# Unlike the earlier sampleByKey cell, which collected the distinct characters
# into a list, this rebinds distinctChars to an RDD (there is no collect() here).
distinctChars = words.flatMap(lambda word: word.lower()).distinct()
# Two keyed RDDs over the same characters, each with its own random value per key.
charRDD = distinctChars.map(lambda c: (c, random.random()))
charRDD2 = distinctChars.map(lambda c: (c, random.random()))
# cogroup yields, for each key, one iterable of values per input RDD
# (shown as a pair of ResultIterable objects in the output below).
charRDD.cogroup(charRDD2).take(5)

[('s',
  (<pyspark.resultiterable.ResultIterable at 0x11a57a6d0>,
   <pyspark.resultiterable.ResultIterable at 0x11a57a210>)),
 ('p',
  (<pyspark.resultiterable.ResultIterable at 0x11a57a610>,
   <pyspark.resultiterable.ResultIterable at 0x11a57af50>)),
 ('r',
  (<pyspark.resultiterable.ResultIterable at 0x11a57ab10>,
   <pyspark.resultiterable.ResultIterable at 0x11a57a5d0>)),
 ('i',
  (<pyspark.resultiterable.ResultIterable at 0x11a57a8d0>,
   <pyspark.resultiterable.ResultIterable at 0x11a57ad90>)),
 ('g',
  (<pyspark.resultiterable.ResultIterable at 0x11a583dd0>,
   <pyspark.resultiterable.ResultIterable at 0x11a583890>))]

In [37]:
# Relies on distinctChars (an RDD) and the random module from the previous cell.
keyedChars = distinctChars.map(lambda c: (c, random.random()))
outputPartitions = 10
# This first count is computed but never displayed; only the cell's last
# expression produces the output below.
KVcharacters.join(keyedChars).count()
# The same join, with outputPartitions passed as the second argument.
KVcharacters.join(keyedChars, outputPartitions).count()

51

In [38]:
numRange = sc.parallelize(range(10), 2)
words.zip(numRange).collect()

[('Spark', 0),
 ('The', 1),
 ('Definitive', 2),
 ('Guide', 3),
 (':', 4),
 ('Big', 5),
 ('Data', 6),
 ('Processing', 7),
 ('Made', 8),
 ('Simple', 9)]

In [39]:
words.coalesce(1).getNumPartitions() # 1

1

In [40]:
df = spark.read.option("header", "true").option("inferSchema", "true")\
  .csv("../data/retail-data/all/")
rdd = df.coalesce(10).rdd

In [41]:
def partitionFunc(key, hotKeys=(17850, 12583)):
  """Choose a partition number for a key, isolating the "hot" keys.

  Args:
    key: the record key to place.
    hotKeys: keys that are all routed to partition 0. Defaults to the two
      keys that used to be hard-coded, so partitionFunc(key) behaves exactly
      as before.

  Returns:
    0 for a hot key, otherwise 1 or 2 chosen at random.
  """
  # Function-local import: the function needs nothing from the notebook namespace.
  import random
  if key in hotKeys:
    return 0
  # Not deterministic: the same non-hot key may map to 1 on one call and 2 on the next.
  return random.randint(1, 2)

# Key every row by the value at column index 6.
# NOTE(review): presumably the customer-id column of the retail data; confirm
# against the CSV header.
keyedRDD = rdd.keyBy(lambda row: row[6])
# Repartition into 3 partitions with partitionFunc, keep only the keys, gather
# each partition into a list (glom), and count the distinct keys per partition.
keyedRDD\
  .partitionBy(3, partitionFunc)\
  .map(lambda x: x[0])\
  .glom()\
  .map(lambda x: len(set(x)))\
  .take(5)

[2, 4295, 4308]

# Chapter 14. Distributed Shared Variables

In [42]:
my_collection = "Spark The Definitive Guide : Big Data Processing Made Simple"\
  .split(" ")
words = spark.sparkContext.parallelize(my_collection, 2)

In [43]:
supplementalData = {"Spark":1000, "Definitive":200,
                    "Big":-300, "Simple":100}

In [44]:
suppBroadcast = spark.sparkContext.broadcast(supplementalData)

In [45]:
suppBroadcast.value

{'Spark': 1000, 'Definitive': 200, 'Big': -300, 'Simple': 100}

In [46]:
# Look each word up in the broadcast dict (0 when the word is absent), then
# sort the resulting (word, value) pairs by value.
words.map(lambda word: (word, suppBroadcast.value.get(word, 0)))\
  .sortBy(lambda wordPair: wordPair[1])\
  .collect()


[('Big', -300),
 ('The', 0),
 ('Guide', 0),
 (':', 0),
 ('Data', 0),
 ('Processing', 0),
 ('Made', 0),
 ('Simple', 100),
 ('Definitive', 200),
 ('Spark', 1000)]

In [47]:
flights = spark.read\
  .parquet("../data/flight-data/parquet/2010-summary.parquet")

In [48]:
accChina = spark.sparkContext.accumulator(0)

In [49]:
def accChinaFunc(flight_row):
  """Add a flight row's count to accChina once for each endpoint that is "China".

  Destination and origin are checked separately, so a row with both set to
  "China" contributes its count twice. Rows touching neither leave the
  accumulator alone.
  """
  destination = flight_row["DEST_COUNTRY_NAME"]
  origin = flight_row["ORIGIN_COUNTRY_NAME"]
  # Destination first, then origin: the same order as two separate checks.
  for country in (destination, origin):
    if country == "China":
      accChina.add(flight_row["count"])

In [50]:
# accChinaFunc already takes a single row, so it is handed to foreach directly.
flights.foreach(accChinaFunc)

In [51]:
accChina.value # 953

953