In [1]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the notebook's SparkSession: getOrCreate() reuses an existing
# session when there is one, otherwise it builds one with this app name.
spark = (
    SparkSession.builder
    .appName("spark-guide-book")
    .getOrCreate()
)

In [2]:
# Sample sentence tokenised on single spaces. Punctuation is not stripped,
# so tokens such as "Simple," and "Park," keep their trailing comma.
my_collection = (
    "Spark The Definitive Guide : Big Data Processing Made Simple, Spark in the Park, very powerful"
).split(" ")
# Turn the local word list into an RDD; the second argument asks for 2 partitions.
words = spark.sparkContext.parallelize(my_collection, 2)

In [4]:
# Pull every element back to the driver for display. This is only reasonable
# because the RDD holds the 16 short tokens built above.
words.collect()

['Spark',
 'The',
 'Definitive',
 'Guide',
 ':',
 'Big',
 'Data',
 'Processing',
 'Made',
 'Simple,',
 'Spark',
 'in',
 'the',
 'Park,',
 'very',
 'powerful']

In [5]:
# COMMAND ----------

# Lookup table of per-word values, keyed by exact token text. Lookups are
# exact-match, so the key "Simple" does not match the token "Simple,"
# produced by the space split of the sample sentence.
supplementalData = {
    "Spark": 1000,
    "Definitive": 200,
    "Big": -300,
    "Simple": 100,
}


# COMMAND ----------

# Wrap the lookup dict in a broadcast variable; code running inside RDD
# operations reads it through suppBroadcast.value.
suppBroadcast = spark.sparkContext.broadcast(supplementalData)


# COMMAND ----------

# Display the broadcast payload on the driver (this is the cell's output).
suppBroadcast.value

{'Spark': 1000, 'Definitive': 200, 'Big': -300, 'Simple': 100}

In [6]:
# COMMAND ----------

# Pair each token with its broadcast lookup value (0 when the token is not a
# key), order the pairs by that value, and return them to the driver.
(
    words
    .map(lambda token: (token, suppBroadcast.value.get(token, 0)))
    .sortBy(lambda pair: pair[1])
    .collect()
)

[('Big', -300),
 ('The', 0),
 ('Guide', 0),
 (':', 0),
 ('Data', 0),
 ('Processing', 0),
 ('Made', 0),
 ('Simple,', 0),
 ('in', 0),
 ('the', 0),
 ('Park,', 0),
 ('very', 0),
 ('powerful', 0),
 ('Definitive', 200),
 ('Spark', 1000),
 ('Spark', 1000)]

In [7]:
# COMMAND ----------

# Load the 2010 flight summary as a DataFrame from a relative Parquet path.
flights = spark.read.parquet(
    "../data/flight-data/parquet/2010-summary.parquet"
)

In [8]:
# COMMAND ----------

# Accumulator with initial value 0; accChinaFunc adds flight counts to it and
# the total is read back later through accChina.value.
accChina = spark.sparkContext.accumulator(0)

In [9]:
# COMMAND ----------

def accChinaFunc(flight_row):
  """Add a flight row's count to ``accChina`` for each endpoint that is China.

  The row is indexed by "DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME" and
  "count". The destination and the origin are checked independently, so a
  row whose destination and origin are both "China" adds its count twice.
  Rows touching China on neither side leave the accumulator unchanged.
  Nothing is returned; the only effect is on the module-level ``accChina``.
  """
  # Read both endpoints up front (destination first, then origin).
  endpoints = (flight_row["DEST_COUNTRY_NAME"], flight_row["ORIGIN_COUNTRY_NAME"])
  for country in endpoints:
    if country == "China":
      accChina.add(flight_row["count"])


# COMMAND ----------

# Run accChinaFunc on every row of `flights` for its side effect on accChina.
# The function is passed directly; a forwarding lambda around it adds nothing.
# Note: the accumulator is created in an earlier cell, so running this line
# again without recreating accChina adds the same counts a second time.
flights.foreach(accChinaFunc)


# COMMAND ----------

# Read the accumulated total (this is the cell's displayed result).
accChina.value # 953 after a single pass over the 2010 summary data

953

In [None]:
# COMMAND ----------