In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Build (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("chapter-14-broadcast-vars")
    .getOrCreate()
)

import os
# Root directory of the book's data; a KeyError is raised if the variable is unset.
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

# SparkContext used by the RDD, broadcast and accumulator cells in this notebook.
sc = spark.sparkContext

In [2]:
# Display the SparkContext obtained from the session (bare last expression shows its repr).
sc

In [3]:
# Split the sentence on single spaces; punctuation stays attached to its word
# (e.g. "Simple," and "Park,").
my_collection = (
    "Spark The Definitive Guide : Big Data Processing Made Simple, Spark in the Park, very powerful"
    .split(" ")
)
# Distribute the word list as an RDD with two partitions.
words = sc.parallelize(my_collection, numSlices=2)

In [4]:
# Confirm what kind of object sc.parallelize returned.
type(words)

pyspark.rdd.RDD

In [5]:
# Bring every element of the RDD back as a Python list for display.
words.collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple,',
 'Spark',
 'in',
 'the',
 'Park,',
 'very',
 'powerful']

### Broadcast

In [6]:
# COMMAND ----------

# Word -> score lookup table; this is the value handed to sc.broadcast.
supplementalData = {
    "Spark": 1000,
    "Definitive": 200,
    "Big": -300,
    "Simple": 100,
    "Data": 99,
}

In [7]:
# COMMAND ----------

# Wrap the lookup dict in a broadcast variable; tasks read it through `.value`.
suppBroadcast = sc.broadcast(supplementalData)

In [8]:
# COMMAND ----------

# Inspect the broadcast payload; it should equal supplementalData.
suppBroadcast.value

{'Spark': 1000, 'Definitive': 200, 'Big': -300, 'Simple': 100, 'Data': 99}

In [9]:
# COMMAND ----------

# Pair each word with its broadcast score (-10001 when the word has no entry),
# then order the pairs by ascending score.
(
    words
    .map(lambda w: (w, suppBroadcast.value.get(w, -10001)))
    .sortBy(lambda scored: scored[1])
    .collect()
)

[('The', -10001),
 ('Guide', -10001),
 (':', -10001),
 ('Processing', -10001),
 ('Made', -10001),
 ('Simple,', -10001),
 ('in', -10001),
 ('the', -10001),
 ('Park,', -10001),
 ('very', -10001),
 ('powerful', -10001),
 ('Big', -300),
 ('Data', 99),
 ('Definitive', 200),
 ('Spark', 1000),
 ('Spark', 1000)]

### Accumulator

In [10]:
# COMMAND ----------
# Parquet file with the 2010 flight summary, located under the book's data root.
file_path = f"{SPARK_BOOK_DATA_PATH}/data/flight-data/parquet/2010-summary.parquet"

# Load the file as a DataFrame.
flights = spark.read.parquet(file_path)

In [11]:
# COMMAND ----------

# Accumulator starting at 0; accChinaFunc adds flight counts to it.
accChina = sc.accumulator(0)

In [12]:
# Confirm what kind of object sc.accumulator returned.
type(accChina)

pyspark.accumulators.Accumulator

In [13]:
# COMMAND ----------

def accChinaFunc(flight_row):
    """Add a flight row's count to ``accChina`` when the row involves China.

    A row qualifies when its ``DEST_COUNTRY_NAME`` or its
    ``ORIGIN_COUNTRY_NAME`` equals ``"China"``. The count is added exactly
    once per qualifying row, so a row whose destination and origin are both
    China is not counted twice. This matches the
    ``DEST_COUNTRY_NAME='China' or ORIGIN_COUNTRY_NAME='China'`` SQL query
    this notebook uses to verify the accumulator.

    Args:
        flight_row: row-like object exposing "DEST_COUNTRY_NAME",
            "ORIGIN_COUNTRY_NAME" and "count" through ``[]`` lookup.

    Returns:
        None. The only effect is the call to ``accChina.add``.
    """
    to_china = flight_row["DEST_COUNTRY_NAME"] == "China"
    from_china = flight_row["ORIGIN_COUNTRY_NAME"] == "China"
    # A single combined check guarantees one add per row, even when both match.
    if to_china or from_china:
        accChina.add(flight_row["count"])

In [14]:
# COMMAND ----------

# Apply accChinaFunc to every row of flights.
# NOTE: accChina is created in a separate cell, so executing this cell again
# without re-creating the accumulator adds the counts a second time.
flights.foreach(accChinaFunc)

In [15]:
# COMMAND ----------

# Read the total gathered by the foreach pass.
accChina.value  # expected: 953

953

#### Verify the accumulator via SQL

In [16]:
# Peek at the first three rows to see the column names and value types.
flights.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=264),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=69)]

In [17]:
# Confirm what kind of object spark.read.parquet returned.
type(flights)

pyspark.sql.dataframe.DataFrame

In [18]:
# Number of rows in the flight summary.
flights.count()

255

In [19]:
# Total count over rows whose destination is China.
(
    flights
    .where("DEST_COUNTRY_NAME='China'")
    .selectExpr("sum(count)")
    .show()
)

+----------+
|sum(count)|
+----------+
|       448|
+----------+



In [20]:
# Total count over rows whose origin is China.
(
    flights
    .where("ORIGIN_COUNTRY_NAME='China'")
    .selectExpr("sum(count)")
    .show()
)

+----------+
|sum(count)|
+----------+
|       505|
+----------+



In [21]:
# Total count over rows that have China as destination or origin; this is the
# figure to compare against accChina.value.
(
    flights
    .where("DEST_COUNTRY_NAME='China' or ORIGIN_COUNTRY_NAME='China'")
    .selectExpr("sum(count)")
    .show()
)

+----------+
|sum(count)|
+----------+
|       953|
+----------+



### RDD.glom()

Return an RDD created by coalescing all elements within each partition
into a list.

Use `glom()` to see how the collected results break down by partition.

In [22]:
# The integers 0-14 distributed over four partitions.
rdd = sc.parallelize(range(15), numSlices=4)

In [23]:
# Flat view: all elements in one list, with no partition boundaries.
rdd.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [24]:
# Partitioned view: glom() turns each partition into its own list.
rdd.glom().collect()

[[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10], [11, 12, 13, 14]]

In [25]:
# Five elements across five partitions: one element per partition.
sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
# expected: [[0], [2], [3], [4], [6]]

[[0], [2], [3], [4], [6]]

In [26]:
# Three elements (0, 2, 4) across five partitions: two partitions stay empty.
sc.parallelize(range(0, 6, 2), 5).glom().collect()
# expected: [[], [0], [], [2], [4]]

[[], [0], [], [2], [4]]