In [1]:
# Bootstrap step: findspark.init() is run before the first `import pyspark`
# (next cell). NOTE(review): per the findspark docs this locates the local Spark
# installation and adds pyspark to sys.path -- confirm for this environment.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Obtain the SparkContext through getOrCreate() so this cell can be re-run in
# the same kernel: calling SparkContext(...) a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
# When no context exists yet, one is created with master "local" and
# application name "BroadCast" (same settings as before). When a context is
# already active, it is returned as-is and the conf below is not applied.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("BroadCast")
)

22/05/06 01:32:45 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/06 01:32:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/06 01:32:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/06 01:32:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/06 01:32:57 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/06 01:32:57 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/06 01:32:57 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [3]:
# Plain Python objects; they live in the memory of the driver (notebook) program.

# Lookup table: ticker symbol -> sector name.
sector_dict = dict(MSFT="TECH", TSLA="AUTO", EMR="INDUSTRIAL")

# Sample records as (ticker, numeric value) tuples -- the value is presumably a price.
stocks = [("EMR", 52.0), ("TSLA", 300.0), ("MSFT", 100.0)]

In [4]:
# Turn the driver-side `stocks` list into an RDD so that its records can be
# processed by Spark tasks.
stocksRdd = sc.parallelize(stocks)

In [5]:
# The function below is executed on the executors, yet it refers to sector_dict.
# How does sector_dict, which lives in the driver, become available on an executor?
def enrichStockWithSectorWithoutBroadCast(stock):
    """Return ``stock`` with its sector appended as one extra trailing element.

    ``stock`` is a tuple whose first element is the ticker symbol. The sector is
    looked up directly in the module-level ``sector_dict`` (no broadcast
    variable involved); a ticker that is not a key of ``sector_dict`` raises
    ``KeyError``.
    """
    sector = sector_dict[stock[0]]
    return stock + (sector,)

# Code marshalling: Python copies the lambda's code to the executor system/process.
# enrichStockWithSectorWithoutBroadCast is shipped to the executor as well, with every task,
# and sector_dict is copied to the executor along with every task.
# With 100 partitions there are 100 tasks, so sector_dict is copied to the executor 100 times.
enrichedRdd = stocksRdd.map(lambda record: enrichStockWithSectorWithoutBroadCast(record))
# Fetch up to 5 enriched records back to the driver for display.
enrichedRdd.take(5)

                                                                                

[('EMR', 52.0, 'INDUSTRIAL'), ('TSLA', 300.0, 'AUTO'), ('MSFT', 100.0, 'TECH')]

In [6]:
# Create a broadcast variable using the SparkContext.
# This ensures that sector_dict is kept on every executor
# where a task will be running.
# Lazy evaluation: the data is copied to the executors when we run the job.
broadCastSectorDict = sc.broadcast(sector_dict)

# PySpark sees that this code references broadCastSectorDict, which is
# broadcast data, so it places the broadcast value on every executor only
# once instead of once per task.
# Without the broadcast, sector_dict would be copied to the executor for every task.
# The sector is added to the stock at executor level, while the task is running.
def enrichStockWithSector(stock):
    """Return ``stock`` with its sector appended as one extra trailing element.

    ``stock`` is a tuple whose first element is the ticker symbol. The sector is
    looked up in ``broadCastSectorDict.value``; a ticker that is not a key of
    that mapping raises ``KeyError``.
    """
    sectors = broadCastSectorDict.value
    sector = sectors[stock[0]]
    return stock + (sector,)

# Code marshalling: Python copies the lambda's code to the executor system/process.
# enrichStockWithSector itself is still shipped to the executor with every task.
enrichedRdd = stocksRdd.map(lambda record: enrichStockWithSector(record))

# Fetch up to 5 enriched records back to the driver for display.
enrichedRdd.take(5)

                                                                                

[('EMR', 52.0, 'INDUSTRIAL'), ('TSLA', 300.0, 'AUTO'), ('MSFT', 100.0, 'TECH')]