In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window as WN

In [2]:
# Start (or reuse) a local Spark session that runs on two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Test")
    .getOrCreate()
)

In [3]:
# Left-hand frame: a single nullable int column `a` holding a duplicate (1) and a NULL.
data = [{'a': value} for value in (1, 2, 1, 3, None)]
schema = "a int"
df1 = spark.createDataFrame(data=data, schema=schema)
df1.show()

+----+
|   a|
+----+
|   1|
|   2|
|   1|
|   3|
|NULL|
+----+



In [4]:
# Right-hand frame: a single nullable int column `b` holding a duplicate (2) and a NULL.
data2 = [{'b': value} for value in (1, 2, 2, 4, None)]
schema2 = "b int"
df2 = spark.createDataFrame(data=data2, schema=schema2)
df2.show()

+----+
|   b|
+----+
|   1|
|   2|
|   2|
|   4|
|NULL|
+----+



### Inner Join Records

In [5]:
# Inner join: keep only the row pairs where a == b; the NULL keys on either side never match.
inner_joined = df1.join(df2, on=df1["a"] == df2["b"], how="inner")
print("Rows_count:", inner_joined.count())
inner_joined.show()

Rows_count: 4
+---+---+
|  a|  b|
+---+---+
|  1|  1|
|  1|  1|
|  2|  2|
|  2|  2|
+---+---+



### Left Join Records

In [6]:
# Left join: every df1 row is kept; rows without a match in df2 get NULL for b.
left_joined = df1.join(df2, on=df1["a"] == df2["b"], how="left")
print("Rows_count:", left_joined.count())
left_joined.show()

Rows_count: 6
+----+----+
|   a|   b|
+----+----+
|   1|   1|
|   2|   2|
|   2|   2|
|NULL|NULL|
|   1|   1|
|   3|NULL|
+----+----+



### Right Join Records

In [7]:
# Right join: every df2 row is kept; rows without a match in df1 get NULL for a.
right_joined = df1.join(df2, on=df1["a"] == df2["b"], how="right")
print("Rows_count:", right_joined.count())
right_joined.show()

Rows_count: 6
+----+----+
|   a|   b|
+----+----+
|   1|   1|
|   1|   1|
|   2|   2|
|NULL|NULL|
|NULL|   4|
|   2|   2|
+----+----+



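#### Left Semi Join (semi, leftsemi, left_semi)
<br>A left semi join is similar to an inner join; the difference is that it returns only the columns of the left dataset and ignores every column of the right dataset. Each matching left row is returned once, even when it matches several right rows (hence 3 rows here versus 4 for the inner join).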

In [8]:
# Left semi join: df1 rows that have at least one match in df2, left columns only.
semi_joined = df1.join(df2, on=df1["a"] == df2["b"], how="semi")
print("Rows_count:", semi_joined.count())
semi_joined.show()

Rows_count: 3
+---+
|  a|
+---+
|  1|
|  1|
|  2|
+---+



### Left Anti Join
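(anti, leftanti, left_anti)<br>
A left anti join does the exact opposite of a left semi join: it returns only the columns of the left dataset, and only for the rows that have no match in the right dataset. A NULL key never equals anything, so the NULL row is returned as unmatched.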

In [9]:
# Left anti join: df1 rows that have no match in df2 (including the NULL key).
anti_joined = df1.join(df2, on=df1["a"] == df2["b"], how="anti")
print("Rows_count:", anti_joined.count())
anti_joined.show()

Rows_count: 2
+----+
|   a|
+----+
|NULL|
|   3|
+----+



### Full Outer Join (outer, full, fullouter, full_outer)

In [10]:
# Full outer join: the matched pairs plus the unmatched rows from both sides.
full_joined = df1.join(df2, on=df1["a"] == df2["b"], how="full")
print("Rows_count:", full_joined.count())
full_joined.show()

Rows_count: 8
+----+----+
|   a|   b|
+----+----+
|NULL|NULL|
|NULL|NULL|
|   1|   1|
|   1|   1|
|   2|   2|
|   2|   2|
|   3|NULL|
|NULL|   4|
+----+----+



## Scenario 2

In [11]:
# Scenario 2, left-hand frame: column `a` holds two 1s, one 0 and two NULLs.
data = [{'a': value} for value in (1, 1, 0, None, None)]
schema = "a int"
df1 = spark.createDataFrame(data=data, schema=schema)
df1.show()

+----+
|   a|
+----+
|   1|
|   1|
|   0|
|NULL|
|NULL|
+----+



In [12]:
# Scenario 2, right-hand frame: column `b` holds one 1 and two 0s, with no NULLs.
data = [{'b': value} for value in (1, 0, 0)]
schema = "b int"
df2 = spark.createDataFrame(data=data, schema=schema)
df2.show()

+---+
|  b|
+---+
|  1|
|  0|
|  0|
+---+



In [13]:
# Inner Join Records (Scenario 2) - heading only; the join itself runs in the following cell.

In [14]:
# Inner join on a == b; the two NULL rows of df1 have no partner and drop out.
inner_joined_s2 = df1.join(df2, on=df1["a"] == df2["b"], how="inner")
print("Rows_count:", inner_joined_s2.count())
inner_joined_s2.show()

Rows_count: 4
+---+---+
|  a|  b|
+---+---+
|  0|  0|
|  0|  0|
|  1|  1|
|  1|  1|
+---+---+



In [15]:
# Left join records: every df1 row is kept; the NULL-keyed rows come back with b = NULL.
left_joined_s2 = df1.join(df2, on=df1["a"] == df2["b"], how="left")
print("Rows_count:", left_joined_s2.count())
left_joined_s2.show()

Rows_count: 6
+----+----+
|   a|   b|
+----+----+
|   1|   1|
|   1|   1|
|NULL|NULL|
|NULL|NULL|
|   0|   0|
|   0|   0|
+----+----+



In [16]:
# Right join records: every df2 row is kept; here each of them finds a match in df1.
right_joined_s2 = df1.join(df2, on=df1["a"] == df2["b"], how="right")
print("Rows_count:", right_joined_s2.count())
right_joined_s2.show()

Rows_count: 4
+---+---+
|  a|  b|
+---+---+
|  1|  1|
|  1|  1|
|  0|  0|
|  0|  0|
+---+---+



In [17]:
# Left semi join (semi, leftsemi, left_semi): df1 rows with at least one match, left columns only.
semi_joined_s2 = df1.join(df2, on=df1["a"] == df2["b"], how="semi")
print("Rows_count:", semi_joined_s2.count())
semi_joined_s2.show()

Rows_count: 3
+---+
|  a|
+---+
|  0|
|  1|
|  1|
+---+



In [18]:
# Left anti join: df1 rows with no match in df2 - only the two NULL-keyed rows remain.
anti_joined_s2 = df1.join(df2, on=df1["a"] == df2["b"], how="anti")
print("Rows_count:", anti_joined_s2.count())
anti_joined_s2.show()

Rows_count: 2
+----+
|   a|
+----+
|NULL|
|NULL|
+----+



In [19]:
# Full outer join (outer, full, fullouter, full_outer): matched pairs plus unmatched rows from both sides.
full_joined_s2 = df1.join(df2, on=df1["a"] == df2["b"], how="full")
print("Rows_count:", full_joined_s2.count())
full_joined_s2.show()

Rows_count: 6
+----+----+
|   a|   b|
+----+----+
|NULL|NULL|
|NULL|NULL|
|   0|   0|
|   0|   0|
|   1|   1|
|   1|   1|
+----+----+

