**Turn off AQE and DPP**

In [0]:
spark.conf.set('spark.sql.adaptive.enabled','false')
spark.conf.set('spark.sql.optimizer.dynamicPartitionPruning.enabled','false')
spark.conf.set('spark.sql.autoBroadcastJoinThreshold',1)

**Import the necessary functions**

In [0]:
from pyspark.sql.functions import *

**Read the data**

In [0]:
path = 'dbfs:/FileStore/BigMart_Sales.csv'
df = spark.read.format('csv').\
                option('header', True).\
                option('inferschema', True).\
                load(path)

In [0]:
df = df.limit(8)
df.display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278216,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
FDO10,13.65,Regular,0.012741089,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
FDP10,,Low Fat,0.127469857,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636


**Preparing the partitioned data**

In [0]:
df.write.format('delta').\
        mode('append').\
        partitionBy('Outlet_Type').\
        save('dbfs:/FileStore/samplePartition/BigMart.CSV')

**Preparing the non partitioned data**

In [0]:
df.write.format('delta').\
        mode('append').\
        save('dbfs:/FileStore/sampleNonPartition/BigMart.CSV')

**Dataframes**

In [0]:
df1 = spark.read.format('delta').load('dbfs:/FileStore/samplePartition/BigMart.CSV')

In [0]:
df2 = spark.read.format('delta').load('dbfs:/FileStore/sampleNonPartition/BigMart.CSV')

In [0]:
#Apply filter on df2 and join with df1
df_join = df1.join(df2.filter(col('Outlet_Type') == 'Grocery Store'), on = ['Item_Identifier'], how = 'left')
df_join.display()


Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Weight.1,Item_Fat_Content.1,Item_Visibility.1,Item_Type.1,Item_MRP.1,Outlet_Identifier.1,Outlet_Establishment_Year.1,Outlet_Size.1,Outlet_Location_Type.1,Outlet_Type.1,Item_Outlet_Sales.1
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,,,,,,,,,,,
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,,,,,,,,,,,
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,,,,,,,,,,,
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,,,,,,,,,,,
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,,,,,,,,,,,
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,,,,,,,,,,,
FDP10,,Low Fat,0.127469857,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636,,,,,,,,,,,
FDP10,,Low Fat,0.127469857,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636,,,,,,,,,,,
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998.0,,Tier 3,Grocery Store,732.38
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998.0,,Tier 3,Grocery Store,732.38


**Enable partition pruning**

In [0]:
spark.conf.set('spark.sql.optimizer.dynamicPartitionPruning.enabled','true')

In [0]:
df_join_opt = df1.join(df2.filter(col('Outlet_Type') == 'Grocery Store'), on = ['Item_Identifier'], how = 'left')
df_join_opt.display()

Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Weight.1,Item_Fat_Content.1,Item_Visibility.1,Item_Type.1,Item_MRP.1,Outlet_Identifier.1,Outlet_Establishment_Year.1,Outlet_Size.1,Outlet_Location_Type.1,Outlet_Type.1,Item_Outlet_Sales.1
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,,,,,,,,,,,
FDN15,17.5,Low Fat,0.016760075,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,,,,,,,,,,,
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,,,,,,,,,,,
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,,,,,,,,,,,
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,,,,,,,,,,,
FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,,,,,,,,,,,
FDP10,,Low Fat,0.127469857,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636,,,,,,,,,,,
FDP10,,Low Fat,0.127469857,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636,,,,,,,,,,,
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998.0,,Tier 3,Grocery Store,732.38
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998.0,,Tier 3,Grocery Store,732.38


While before DPP all the partitions were read, after DPP only the partition containing the 'grocery store' outlet type is read.
Not the case. All the 4 partitioned files were read since the join condition is different.

For DPP to work, the partitioning column and the join condition both must be the same.

So instead of on 'Outlet_Type' if we partition data on 'Item_identifier', DPP works.