In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Build (or reuse, if one already exists) the SparkSession used by every
# cell in this notebook.
spark = (
    SparkSession.builder
    .appName("chapter-08-join")
    .getOrCreate()
)

In [2]:
# Show the active SparkSession as the cell's output.
spark

In [14]:
# Sample data for the join examples. Column names are supplied directly to
# createDataFrame; column types are inferred from the Python values.

# One row per person; `graduate_program` points at graduateProgram.id_program
# and `spark_status` is an array of sparkStatus.id_status values.
person = spark.createDataFrame(
    [
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ],
    ["id", "name", "graduate_program", "spark_status"],
)

# One row per graduate program. No person references id_program 2, which is
# what makes the outer / semi / anti join results differ.
graduateProgram = spark.createDataFrame(
    [
        (0, "Masters", "School of Information", "UC Berkeley"),
        (2, "Masters", "EECS", "UC Berkeley"),
        (1, "Ph.D.", "EECS", "UC Berkeley"),
    ],
    ["id_program", "degree", "department", "school"],
)

# Lookup table from status id to a human-readable status label.
sparkStatus = spark.createDataFrame(
    [
        (500, "Vice President"),
        (250, "PMC Member"),
        (100, "Contributor"),
    ],
    ["id_status", "status"],
)

In [15]:
# Print the rows of the person DataFrame.
person.show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



In [16]:
# Print the rows of the graduateProgram DataFrame.
graduateProgram.show()

+----------+-------+--------------------+-----------+
|id_program| degree|          department|     school|
+----------+-------+--------------------+-----------+
|         0|Masters|School of Informa...|UC Berkeley|
|         2|Masters|                EECS|UC Berkeley|
|         1|  Ph.D.|                EECS|UC Berkeley|
+----------+-------+--------------------+-----------+



In [17]:
# Print the rows of the sparkStatus DataFrame.
sparkStatus.show()

+---------+--------------+
|id_status|        status|
+---------+--------------+
|      500|Vice President|
|      250|    PMC Member|
|      100|   Contributor|
+---------+--------------+



In [18]:
# Register `person` as a temp view so the SQL examples can query it by name,
# then read it back through SQL to confirm the view is in place.
person.createOrReplaceTempView("person")
person_from_sql = spark.sql("select * from person")
person_from_sql.show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



In [19]:
# Register `graduateProgram` as a temp view for the SQL examples and read it
# back through SQL to confirm the view is in place.
graduateProgram.createOrReplaceTempView("graduateProgram")
graduate_program_from_sql = spark.sql("select * from graduateProgram")
graduate_program_from_sql.show()

+----------+-------+--------------------+-----------+
|id_program| degree|          department|     school|
+----------+-------+--------------------+-----------+
|         0|Masters|School of Informa...|UC Berkeley|
|         2|Masters|                EECS|UC Berkeley|
|         1|  Ph.D.|                EECS|UC Berkeley|
+----------+-------+--------------------+-----------+



In [20]:
# Register `sparkStatus` as a temp view for the SQL examples and read it back
# through SQL to confirm the view is in place.
sparkStatus.createOrReplaceTempView("sparkStatus")
spark_status_from_sql = spark.sql("select * from sparkStatus")
spark_status_from_sql.show()

+---------+--------------+
|id_status|        status|
+---------+--------------+
|      500|Vice President|
|      250|    PMC Member|
|      100|   Contributor|
+---------+--------------+



### Inner join

In [22]:
# Join condition reused by the DataFrame-API examples below: a person's
# graduate_program must equal a program's id_program.
person_program_key = person["graduate_program"]
program_key = graduateProgram["id_program"]
joinExpression = person_program_key == program_key

# Join type for the next example; later cells reassign it.
joinType = "inner"

In [53]:
# Join each person to their graduate program using the shared condition and
# the current joinType.
joined_people = person.join(graduateProgram, on=joinExpression, how=joinType)

# The two key columns carry the same value after the join, so drop them
# before displaying the result.
joined_people.drop("graduate_program", "id_program").show()

+---+----------------+---------------+-------+--------------------+-----------+
| id|            name|   spark_status| degree|          department|     school|
+---+----------------+---------------+-------+--------------------+-----------+
|  0|   Bill Chambers|          [100]|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|[500, 250, 100]|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|     [250, 100]|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+---------------+-------+--------------------+-----------+



In [26]:
# SQL version of the inner join. Unlike the DataFrame example it selects
# p.*, so graduate_program stays in the output; id_program is not selected.
inner_join_sql = """
    select 
        p.*,
        g.degree,
        g.department,
        g.school
    from person p 
    inner join graduateProgram g
    on p.graduate_program = g.id_program
"""
spark.sql(inner_join_sql).show()

+---+----------------+----------------+---------------+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| degree|          department|     school|
+---+----------------+----------------+---------------+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+-------+--------------------+-----------+



### Outer join

In [28]:
# Switch the shared join type to a full outer join for the next cell
# (the SQL equivalent further down is `full outer join`).
joinType = "outer"

In [29]:
# Join with the current joinType and keep the key columns visible so the
# nulls produced for unmatched rows can be seen.
outer_joined = person.join(graduateProgram, on=joinExpression, how=joinType)
outer_joined.show()

+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status|id_program| degree|          department|     school|
+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|         0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|         2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+



In [31]:
# SQL version of the full outer join between person and graduateProgram.
full_outer_join_sql = """
    select 
        *
    from person p 
    full outer join graduateProgram g
    on p.graduate_program = g.id_program
"""
spark.sql(full_outer_join_sql).show()

+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status|id_program| degree|          department|     school|
+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|         0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|         2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+



### Left Outer join

In [32]:
# Left outer join with person on the left: every person row is kept.
joinType = "left_outer"
person_left_outer = person.join(graduateProgram, on=joinExpression, how=joinType)
person_left_outer.show()

+---+----------------+----------------+---------------+----------+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status|id_program| degree|          department|     school|
+---+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|         0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+----------+-------+--------------------+-----------+



In [34]:
# SQL version of the left outer join with person on the left.
person_left_outer_sql = """
    select 
        *
    from person p 
    left outer join graduateProgram g
    on p.graduate_program = g.id_program
"""
spark.sql(person_left_outer_sql).show()

+---+----------------+----------------+---------------+----------+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status|id_program| degree|          department|     school|
+---+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|         0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+----------+-------+--------------------+-----------+



In [33]:
# Same left outer join with the tables swapped: graduateProgram is now on the
# left, so the program nobody attends (id_program 2) is kept with nulls in
# the person columns.
joinType = "left_outer"
program_left_outer = graduateProgram.join(person, on=joinExpression, how=joinType)
program_left_outer.show()

+----------+-------+--------------------+-----------+----+----------------+----------------+---------------+
|id_program| degree|          department|     school|  id|            name|graduate_program|   spark_status|
+----------+-------+--------------------+-----------+----+----------------+----------------+---------------+
|         0|Masters|School of Informa...|UC Berkeley|   0|   Bill Chambers|               0|          [100]|
|         1|  Ph.D.|                EECS|UC Berkeley|   1|   Matei Zaharia|               1|[500, 250, 100]|
|         1|  Ph.D.|                EECS|UC Berkeley|   2|Michael Armbrust|               1|     [250, 100]|
|         2|Masters|                EECS|UC Berkeley|null|            null|            null|           null|
+----------+-------+--------------------+-----------+----+----------------+----------------+---------------+



In [35]:
# SQL version of the left outer join with graduateProgram on the left.
program_left_outer_sql = """
    select 
        *
    from graduateProgram g 
    left outer join person p 
    on p.graduate_program = g.id_program
"""
spark.sql(program_left_outer_sql).show()

+----------+-------+--------------------+-----------+----+----------------+----------------+---------------+
|id_program| degree|          department|     school|  id|            name|graduate_program|   spark_status|
+----------+-------+--------------------+-----------+----+----------------+----------------+---------------+
|         0|Masters|School of Informa...|UC Berkeley|   0|   Bill Chambers|               0|          [100]|
|         1|  Ph.D.|                EECS|UC Berkeley|   1|   Matei Zaharia|               1|[500, 250, 100]|
|         1|  Ph.D.|                EECS|UC Berkeley|   2|Michael Armbrust|               1|     [250, 100]|
|         2|Masters|                EECS|UC Berkeley|null|            null|            null|           null|
+----------+-------+--------------------+-----------+----+----------------+----------------+---------------+



### Right Outer join

In [36]:
# Right outer join: every graduateProgram row (the right side) is kept, with
# nulls in the person columns where no person matches.
joinType = "right_outer"
person_right_outer = person.join(graduateProgram, on=joinExpression, how=joinType)
person_right_outer.show()

+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status|id_program| degree|          department|     school|
+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|         0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|         1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|         2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+----------+-------+--------------------+-----------+



### Left semi join

Think of it as a filter: keep only the rows of the left table that have a match in the right table.

In [37]:
# Left semi join: person is used only to decide which graduateProgram rows
# to keep; no person columns appear in the result.
joinType = "left_semi"
programs_with_people = graduateProgram.join(person, on=joinExpression, how=joinType)
programs_with_people.show()

+----------+-------+--------------------+-----------+
|id_program| degree|          department|     school|
+----------+-------+--------------------+-----------+
|         0|Masters|School of Informa...|UC Berkeley|
|         1|  Ph.D.|                EECS|UC Berkeley|
+----------+-------+--------------------+-----------+



In [38]:
# SQL version of the left semi join (graduateProgram filtered by person).
left_semi_join_sql = """
    select 
        *
    from graduateProgram g 
    left semi join person p 
    on p.graduate_program = g.id_program
"""
spark.sql(left_semi_join_sql).show()

+----------+-------+--------------------+-----------+
|id_program| degree|          department|     school|
+----------+-------+--------------------+-----------+
|         0|Masters|School of Informa...|UC Berkeley|
|         1|  Ph.D.|                EECS|UC Berkeley|
+----------+-------+--------------------+-----------+



#### union

Rows in the left table with duplicate keys are all kept by a left-semi join.

In [40]:
# Extra row that repeats id_program 0, so the left table of the next semi
# join contains a duplicate key.
duplicate_program_rows = [(0, "Masters", "Duplicated Row", "Duplicated School")]

# The extra DataFrame is created without column names; the unioned result
# carries graduateProgram's column names.
gradProgram2 = graduateProgram.union(spark.createDataFrame(duplicate_program_rows))

# Expose the result to the SQL examples.
gradProgram2.createOrReplaceTempView("gradProgram2")

In [10]:
# Read the gradProgram2 view back through SQL to inspect the added duplicate row.
spark.sql("select * from gradProgram2").show()

+----------+-------+--------------------+-----------------+
|id_program| degree|          department|           school|
+----------+-------+--------------------+-----------------+
|         0|Masters|School of Informa...|      UC Berkeley|
|         2|Masters|                EECS|      UC Berkeley|
|         1|  Ph.D.|                EECS|      UC Berkeley|
|         0|Masters|      Duplicated Row|Duplicated School|
+----------+-------+--------------------+-----------------+



In [41]:
# Left semi join with gradProgram2 on the left: both rows with id_program 0
# (the original and the duplicate) are kept.
joinType = "left_semi"
duplicated_programs_with_people = gradProgram2.join(person, on=joinExpression, how=joinType)
duplicated_programs_with_people.show()

+----------+-------+--------------------+-----------------+
|id_program| degree|          department|           school|
+----------+-------+--------------------+-----------------+
|         0|Masters|School of Informa...|      UC Berkeley|
|         1|  Ph.D.|                EECS|      UC Berkeley|
|         0|Masters|      Duplicated Row|Duplicated School|
+----------+-------+--------------------+-----------------+



In [42]:
# SQL version of the left semi join with gradProgram2 on the left.
left_semi_duplicates_sql = """
    select 
        *
    from gradProgram2 g 
    left semi join person p 
    on p.graduate_program = g.id_program
"""
spark.sql(left_semi_duplicates_sql).show()

+----------+-------+--------------------+-----------------+
|id_program| degree|          department|           school|
+----------+-------+--------------------+-----------------+
|         0|Masters|School of Informa...|      UC Berkeley|
|         1|  Ph.D.|                EECS|      UC Berkeley|
|         0|Masters|      Duplicated Row|Duplicated School|
+----------+-------+--------------------+-----------------+



### Left anti join

Think of it as a filter: keep only the rows of the left table that have no match in the right table.

In [43]:
# Left anti join: keep only the graduateProgram rows that have no matching
# person (here the program with id_program 2).
joinType = "left_anti"
programs_without_people = graduateProgram.join(person, on=joinExpression, how=joinType)
programs_without_people.show()

+----------+-------+----------+-----------+
|id_program| degree|department|     school|
+----------+-------+----------+-----------+
|         2|Masters|      EECS|UC Berkeley|
+----------+-------+----------+-----------+



In [44]:
# SQL version of the left anti join (graduateProgram rows with no person).
left_anti_join_sql = """
    select 
        *
    from graduateProgram g 
    left anti join person p 
    on p.graduate_program = g.id_program
"""
spark.sql(left_anti_join_sql).show()

+----------+-------+----------+-----------+
|id_program| degree|department|     school|
+----------+-------+----------+-----------+
|         2|Masters|      EECS|UC Berkeley|
+----------+-------+----------+-----------+



### Cross (Cartesian) join

In [46]:
# NOTE: the join type is "cross", but a join condition is also supplied, so
# the result holds only the matching rows (the same three person/program
# pairs the inner join returns), not every person/program combination.
# For a full Cartesian product use a join without a condition, e.g.
# graduateProgram.crossJoin(person).
joinType = "cross"
cross_joined = graduateProgram.join(person, on=joinExpression, how=joinType)
cross_joined.show()

+----------+-------+--------------------+-----------+---+----------------+----------------+---------------+
|id_program| degree|          department|     school| id|            name|graduate_program|   spark_status|
+----------+-------+--------------------+-----------+---+----------------+----------------+---------------+
|         0|Masters|School of Informa...|UC Berkeley|  0|   Bill Chambers|               0|          [100]|
|         1|  Ph.D.|                EECS|UC Berkeley|  1|   Matei Zaharia|               1|[500, 250, 100]|
|         1|  Ph.D.|                EECS|UC Berkeley|  2|Michael Armbrust|               1|     [250, 100]|
+----------+-------+--------------------+-----------+---+----------------+----------------+---------------+



In [47]:
# SQL version of the cross join. NOTE: because of the `on` clause the result
# contains only the matching rows (the same three pairs as the inner join),
# not the full Cartesian product.
cross_join_sql = """
    select 
        *
    from graduateProgram g 
    cross join person p 
    on p.graduate_program = g.id_program
"""
spark.sql(cross_join_sql).show()

+----------+-------+--------------------+-----------+---+----------------+----------------+---------------+
|id_program| degree|          department|     school| id|            name|graduate_program|   spark_status|
+----------+-------+--------------------+-----------+---+----------------+----------------+---------------+
|         0|Masters|School of Informa...|UC Berkeley|  0|   Bill Chambers|               0|          [100]|
|         1|  Ph.D.|                EECS|UC Berkeley|  1|   Matei Zaharia|               1|[500, 250, 100]|
|         1|  Ph.D.|                EECS|UC Berkeley|  2|Michael Armbrust|               1|     [250, 100]|
+----------+-------+--------------------+-----------+---+----------------+----------------+---------------+



### join on complex type

In [50]:
# Join condition on an array column: a person matches a status row when the
# person's spark_status array contains that row's id_status.
status_in_array = F.expr("array_contains(spark_status, id_status)")

# Rename person.id to id_person before joining; no join type is passed, so
# the default is used.
person_with_renamed_id = person.withColumnRenamed("id", "id_person")
person_with_renamed_id.join(sparkStatus, status_in_array).show()

+---------+----------------+----------------+---------------+---------+--------------+
|id_person|            name|graduate_program|   spark_status|id_status|        status|
+---------+----------------+----------------+---------------+---------+--------------+
|        0|   Bill Chambers|               0|          [100]|      100|   Contributor|
|        1|   Matei Zaharia|               1|[500, 250, 100]|      500|Vice President|
|        1|   Matei Zaharia|               1|[500, 250, 100]|      250|    PMC Member|
|        1|   Matei Zaharia|               1|[500, 250, 100]|      100|   Contributor|
|        2|Michael Armbrust|               1|     [250, 100]|      250|    PMC Member|
|        2|Michael Armbrust|               1|     [250, 100]|      100|   Contributor|
+---------+----------------+----------------+---------------+---------+--------------+



In [52]:
# SQL version of the array_contains join; produces one output row per
# (person, status) pair where the status id is in the person's array.
array_contains_join_sql = """
    select 
        *
    from person p
    inner join sparkStatus s
    on array_contains(p.spark_status, s.id_status)
"""
spark.sql(array_contains_join_sql).show()

+---+----------------+----------------+---------------+---------+--------------+
| id|            name|graduate_program|   spark_status|id_status|        status|
+---+----------------+----------------+---------------+---------+--------------+
|  0|   Bill Chambers|               0|          [100]|      100|   Contributor|
|  1|   Matei Zaharia|               1|[500, 250, 100]|      500|Vice President|
|  1|   Matei Zaharia|               1|[500, 250, 100]|      250|    PMC Member|
|  1|   Matei Zaharia|               1|[500, 250, 100]|      100|   Contributor|
|  2|Michael Armbrust|               1|     [250, 100]|      250|    PMC Member|
|  2|Michael Armbrust|               1|     [250, 100]|      100|   Contributor|
+---+----------------+----------------+---------------+---------+--------------+

