
# PySpark Joins

In [1]:

# PySpark Joins Demonstration

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("PySparkJoins")
    .getOrCreate()
)

# Column names for the two sample tables; "ID" is the key shared by both
columns1 = ["ID", "Name"]
columns2 = ["ID", "Department"]

# Sample rows: IDs 1 and 2 occur in both tables, 3 only in data1, 4 only in data2
data1 = [
    (1, "Alice"),
    (2, "Bob"),
    (3, "Charlie"),
]
data2 = [
    (1, "HR"),
    (2, "IT"),
    (4, "Finance"),
]

df1 = spark.createDataFrame(data1, schema=columns1)
df2 = spark.createDataFrame(data2, schema=columns2)

# Print both input tables so the join results below can be compared against them
df1.show()
df2.show()


StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 3, Finished, Available, Finished)

+---+-------+
| ID|   Name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+

+---+----------+
| ID|Department|
+---+----------+
|  1|        HR|
|  2|        IT|
|  4|   Finance|
+---+----------+




# 1. Inner Join 
Returns only the records whose `ID` appears in both DataFrames

In [2]:
# Joining on the column name (rather than an expression) keeps a single "ID" column in the result
inner_join_df = df1.join(df2, on="ID", how="inner")
inner_join_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 4, Finished, Available, Finished)

+---+-----+----------+
| ID| Name|Department|
+---+-----+----------+
|  1|Alice|        HR|
|  2|  Bob|        IT|
+---+-----+----------+



# 2. Left Join 
Keeps all records from df1; columns from df2 are NULL where there is no match

In [3]:
# Left join on "ID": df1 is the left side, df2 the right side
left_join_df = df1.join(df2, on="ID", how="left")
left_join_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 5, Finished, Available, Finished)

+---+-------+----------+
| ID|   Name|Department|
+---+-------+----------+
|  1|  Alice|        HR|
|  2|    Bob|        IT|
|  3|Charlie|      NULL|
+---+-------+----------+




# 3. Right Join  
Keeps all records from df2; columns from df1 are NULL where there is no match

In [4]:
# Right join on "ID": df1 is the left side, df2 the right side
right_join_df = df1.join(df2, on="ID", how="right")
right_join_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 6, Finished, Available, Finished)

+---+-----+----------+
| ID| Name|Department|
+---+-----+----------+
|  1|Alice|        HR|
|  2|  Bob|        IT|
|  4| NULL|   Finance|
+---+-----+----------+




# 4. Full Outer Join
Keep all records from both DataFrames

In [5]:
# Full outer join on "ID" (requested with the join type string "outer")
full_outer_join_df = df1.join(df2, on="ID", how="outer")
full_outer_join_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 7, Finished, Available, Finished)

+---+-------+----------+
| ID|   Name|Department|
+---+-------+----------+
|  1|  Alice|        HR|
|  2|    Bob|        IT|
|  3|Charlie|      NULL|
|  4|   NULL|   Finance|
+---+-------+----------+




# 5. Left Semi Join 
Returns only the records from df1 that have a match in df2; no df2 columns are included

In [6]:
# Left semi join on "ID": df2 is used only to filter df1
left_semi_df = df1.join(df2, on="ID", how="left_semi")
left_semi_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 8, Finished, Available, Finished)

+---+-----+
| ID| Name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+




# 6. Left Anti Join 
Returns only the records from df1 that have no match in df2; no df2 columns are included

In [7]:
# Left anti join on "ID": the complement of the left semi join
left_anti_df = df1.join(df2, on="ID", how="left_anti")
left_anti_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 9, Finished, Available, Finished)

+---+-------+
| ID|   Name|
+---+-------+
|  3|Charlie|
+---+-------+




# 7. Cross Join 
Cartesian product of both DataFrames

In [8]:
# No join key is used here, so the result carries the "ID" column of each input side by side
cross_join_df = (
    df1
    .crossJoin(df2)
)
cross_join_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 10, Finished, Available, Finished)

+---+-------+---+----------+
| ID|   Name| ID|Department|
+---+-------+---+----------+
|  1|  Alice|  1|        HR|
|  1|  Alice|  2|        IT|
|  1|  Alice|  4|   Finance|
|  2|    Bob|  1|        HR|
|  2|    Bob|  2|        IT|
|  2|    Bob|  4|   Finance|
|  3|Charlie|  1|        HR|
|  3|Charlie|  2|        IT|
|  3|Charlie|  4|   Finance|
+---+-------+---+----------+



# 8. Join with Explicit Conditions (Different Column Names)

In [9]:
# Put the renamed frames in NEW variables instead of reassigning df1/df2.
# Overwriting df1/df2 would remove their "ID" column, so every earlier cell
# that joins on "ID" would fail if re-run after this one.
df1_renamed = df1.withColumnRenamed("ID", "ID1")
df2_renamed = df2.withColumnRenamed("ID", "ID2")

# The key columns now have different names, so the join condition is an
# explicit column expression rather than a shared column name.
explicit_join_df = (
    df1_renamed
    .join(df2_renamed, df1_renamed.ID1 == df2_renamed.ID2, "inner")
    .select("ID1", "Name", "Department")
)
explicit_join_df.show()

StatementMeta(, c3f7bd08-72e4-4dff-96fa-ca124c4a4b4a, 11, Finished, Available, Finished)

+---+-----+----------+
|ID1| Name|Department|
+---+-----+----------+
|  1|Alice|        HR|
|  2|  Bob|        IT|
+---+-----+----------+

