In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Build (or reuse, via getOrCreate) the Hive-enabled Spark session on YARN.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark join strategies")
    .enableHiveSupport()
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare expression so the session object is shown as the cell output.
spark

In [3]:
# Explicit DDL-style schema applied when reading the orders CSV.
order_schema = "order_id  long, order_date string, customer_id long, order_status string "

# Orders dataset, read as CSV with the schema above.
orders_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [4]:
# Explicit DDL-style schema for the customers data (the literal spans two lines).
customers_schema = """customerid long, customer_fname string, customer_lname string, username string, password string,
address string, city string, state string, pincode long"""

# Customers dataset, read as CSV with the schema above.
customers_df = (
    spark.read
    .format("csv")
    .schema(customers_schema)
    .load("/public/trendytech/retail_db/customers")
)

In [5]:
# Show the current auto-broadcast join threshold for this session.
spark.conf.get("spark.sql.autoBroadcastJoinThreshold")

'10485760b'

### Broadcast Hash Join

In [6]:
# Inner join of orders to customers on the customer key, written to the
# "noop" format so the join is executed without saving a result anywhere.
(
    orders_df
    .join(
        customers_df,
        on=orders_df.customer_id == customers_df.customerid,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

### Sort Merge Join

In [7]:
# Set the auto-broadcast threshold to -1 (Spark treats -1 as "broadcasting disabled").
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [8]:
# Read the threshold back to confirm the session now holds the new value.
spark.conf.get("spark.sql.autoBroadcastJoinThreshold")

'-1'

In [9]:
# Run the orders/customers inner join again through the "noop" format,
# this time with the auto-broadcast threshold set to -1.
(
    orders_df
    .join(
        customers_df,
        on=orders_df.customer_id == customers_df.customerid,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [10]:
# Print the physical plan Spark picks for the orders/customers inner join.
orders_df.join(
    customers_df,
    on=orders_df.customer_id == customers_df.customerid,
    how="inner",
).explain()

== Physical Plan ==
*(5) SortMergeJoin [customer_id#10L], [customerid#16L], Inner
:- *(2) Sort [customer_id#10L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(customer_id#10L, 200), ENSURE_REQUIREMENTS, [id=#118]
:     +- *(1) Filter isnotnull(customer_id#10L)
:        +- FileScan csv [order_id#8L,order_date#9,customer_id#10L,order_status#11] Batched: false, DataFilters: [isnotnull(customer_id#10L)], Format: CSV, Location: InMemoryFileIndex[hdfs://m01.itversity.com:9000/public/trendytech/orders/orders_1gb.csv], PartitionFilters: [], PushedFilters: [IsNotNull(customer_id)], ReadSchema: struct<order_id:bigint,order_date:string,customer_id:bigint,order_status:string>
+- *(4) Sort [customerid#16L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(customerid#16L, 200), ENSURE_REQUIREMENTS, [id=#126]
      +- *(3) Filter isnotnull(customerid#16L)
         +- FileScan csv [customerid#16L,customer_fname#17,customer_lname#18,username#19,password#20,address#21,city#22,state#2

In [11]:
# Print the physical plan when the customers side carries a "shuffle_hash" join hint.
orders_df.join(
    customers_df.hint("shuffle_hash"),
    on=orders_df.customer_id == customers_df.customerid,
    how="inner",
).explain()

== Physical Plan ==
*(3) ShuffledHashJoin [customer_id#10L], [customerid#16L], Inner, BuildRight
:- Exchange hashpartitioning(customer_id#10L, 200), ENSURE_REQUIREMENTS, [id=#160]
:  +- *(1) Filter isnotnull(customer_id#10L)
:     +- FileScan csv [order_id#8L,order_date#9,customer_id#10L,order_status#11] Batched: false, DataFilters: [isnotnull(customer_id#10L)], Format: CSV, Location: InMemoryFileIndex[hdfs://m01.itversity.com:9000/public/trendytech/orders/orders_1gb.csv], PartitionFilters: [], PushedFilters: [IsNotNull(customer_id)], ReadSchema: struct<order_id:bigint,order_date:string,customer_id:bigint,order_status:string>
+- Exchange hashpartitioning(customerid#16L, 200), ENSURE_REQUIREMENTS, [id=#165]
   +- *(2) Filter isnotnull(customerid#16L)
      +- FileScan csv [customerid#16L,customer_fname#17,customer_lname#18,username#19,password#20,address#21,city#22,state#23,pincode#24L] Batched: false, DataFilters: [isnotnull(customerid#16L)], Format: CSV, Location: InMemoryFileIndex[hd

In [12]:
# Execute the join with the "shuffle_hash" hint on customers, writing to the
# "noop" format so nothing is saved.
(
    orders_df
    .join(
        customers_df.hint("shuffle_hash"),
        on=orders_df.customer_id == customers_df.customerid,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [13]:
# Execute the join with the "shuffle_merge" hint on customers, writing to the
# "noop" format so nothing is saved.
(
    orders_df
    .join(
        customers_df.hint("shuffle_merge"),
        on=orders_df.customer_id == customers_df.customerid,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)