In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Obtain the session with getOrCreate() and take the context from it, so this
# cell can be re-run safely. Constructing SparkContext("local") directly fails
# on a second run because a context is already active in the kernel.
# master("local") keeps the same single-threaded local master as before.
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# Join Operation in Spark

In [None]:
# Build the four base RDDs: two hold values, two hold the integer keys
# that will be paired with those values in the next cell.
valueRDDA = sc.parallelize(list('abcdef'))
valueRDDB = sc.parallelize(['AA', 'BB', 'DD'])

rddB = sc.parallelize([1, 3, 2, 2, 3, 1])
rddC = sc.parallelize([1, 2, 4])

# Show the contents of each RDD, one per line.
print(
    valueRDDA.collect(),
    valueRDDB.collect(),
    rddB.collect(),
    rddC.collect(),
    sep="\n",
)


['a', 'b', 'c', 'd', 'e', 'f']
['AA', 'BB', 'DD']
[1, 3, 2, 2, 3, 1]
[1, 2, 4]


In [3]:
# Pair each key with the value at the same position to get (key, value) RDDs:
# rdd1 takes its keys from rddB, rdd2 takes its keys from rddC.
rdd1, rdd2 = rddB.zip(valueRDDA), rddC.zip(valueRDDB)

print("RDD 1 : ", rdd1.collect())
print("RDD 2 : ", rdd2.collect())

RDD 1 :  [(1, 'a'), (3, 'b'), (2, 'c'), (2, 'd'), (3, 'e'), (1, 'f')]
RDD 2 :  [(1, 'AA'), (2, 'BB'), (4, 'DD')]


## Normal Join operation

In [4]:
# Regular join on the key: only keys that occur in both rdd1 and rdd2 appear
# in the result, each as (key, (value_from_rdd1, value_from_rdd2)).
rdd3 = rdd1.join(rdd2)

rdd3.collect()

[(2, ('c', 'BB')), (2, ('d', 'BB')), (1, ('a', 'AA')), (1, ('f', 'AA'))]

# Map-Side Join Operation

When we join a large RDD with a small RDD, and the small RDD fits into the main memory of a single executor, we can replace the join with a map operation over the large RDD. This is called a map-side join.

1. Collect the small RDD as a map (a dict in Python).
2. Broadcast the small dictionary so that a copy of it is available on each worker.
3. Run a map over the large RDD that looks up each key in the broadcast dictionary, instead of running an actual join operation.

In [5]:
# Step 1: collect the small RDD on the driver as a dict (key -> value).
# A dict holds one value per key, so this assumes the keys of rdd2 are unique.
my_small_Data = rdd2.collectAsMap()

print(my_small_Data)

# Step 2: broadcast the dict and keep the returned handle. The handle is what
# the tasks must read from; referencing my_small_Data directly inside the
# lambdas would capture the plain dict in the closure and bypass the broadcast.
small_data_bc = sc.broadcast(my_small_Data)

# Step 3: emulate the join with narrow operations on the large RDD:
# keep only records whose key exists in the small dict, then attach the
# matching value to build (key, (large_value, small_value)).
rdd3 = (
    rdd1
    .filter(lambda kv: kv[0] in small_data_bc.value)
    .map(lambda kv: (kv[0], (kv[1], small_data_bc.value[kv[0]])))
)

rdd3.collect()

{1: 'AA', 2: 'BB', 4: 'DD'}


[(1, ('a', 'AA')), (2, ('c', 'BB')), (2, ('d', 'BB')), (1, ('f', 'AA'))]

# Joining a large and a medium size RDD

If the medium-size RDD is too large to fit into memory, its keys alone may still fit. In that case we can keep just the keys of the medium-size RDD in memory, use them to filter the large RDD down to the records that can actually match, and then run the join operation on the reduced RDD.

1. Collect the Keys of the Medium size RDD into a set of keys 
2. Use the keys to filter the large RDD and reduce the size of it
3. Then run the join on the smaller RDD


In [7]:
# Step 1: collect only the keys of the medium-size RDD into a set on the driver.
keys = set(rdd2.map(lambda x: x[0]).collect())

print(keys)

# Step 2: broadcast the key set and keep the returned handle, so the tasks
# read the broadcast copy rather than a set captured in the lambda's closure.
keys_bc = sc.broadcast(keys)

# Step 3: drop every record of the large RDD whose key cannot match.
# Filtering on the key (not on the value) keeps records whose value is None.
rdd3 = rdd1.filter(lambda kv: kv[0] in keys_bc.value)

# Step 4: run the actual join on the reduced RDD.
rdd4 = rdd3.join(rdd2)

rdd4.collect()

{1, 2, 4}


[(2, ('c', 'BB')), (2, ('d', 'BB')), (1, ('a', 'AA')), (1, ('f', 'AA'))]