In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the SparkSession shared by every cell below; the app name is "joins".
spark = (
    SparkSession.builder
    .appName("joins")
    .getOrCreate()
)

2021-09-30 07:43:33,473 WARN util.Utils: Your hostname, tb-LinuxBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
2021-09-30 07:43:33,475 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
2021-09-30 07:43:35,993 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,ArrayType

In [4]:
# Explicit schema for the person DataFrame: three scalar columns plus an
# array-of-int column holding status ids. Every field is nullable.
person_schema = StructType([
    StructField("id", IntegerType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("graduate_program", IntegerType(), nullable=True),
    StructField("spark_status", ArrayType(IntegerType()), nullable=True),
])

In [5]:
# Sample people: (id, name, graduate_program id, list of spark_status ids),
# built against the explicit person_schema.
person = spark.createDataFrame(
    [
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ],
    schema=person_schema,
)

In [6]:
# Print the person rows without truncating long cell values.
person.show(truncate=False)

                                                                                

+---+----------------+----------------+---------------+
|id |name            |graduate_program|spark_status   |
+---+----------------+----------------+---------------+
|0  |Bill Chambers   |0               |[100]          |
|1  |Matei Zaharia   |1               |[500, 250, 100]|
|2  |Michael Armbrust|1               |[250, 100]     |
+---+----------------+----------------+---------------+



In [7]:
# Print the schema to confirm the column types declared in person_schema were applied.
person.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- graduate_program: integer (nullable = true)
 |-- spark_status: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [8]:
# Graduate programs: (id, degree, department, school). No schema is passed,
# so column types are inferred from the tuples; toDF() then names the columns.
graduate_program = (
    spark.createDataFrame(
        [
            (0, "Masters", "School of Information", "UC Berkeley"),
            (2, "Masters", "EECS", "UC Berkeley"),
            (1, "Ph.D.", "EECS", "UC Berkeley"),
        ]
    )
    .toDF("id", "degree", "department", "school")
)

In [9]:
# Inspect the inferred schema; `id` is inferred as long because no explicit schema was given.
graduate_program.printSchema()

root
 |-- id: long (nullable = true)
 |-- degree: string (nullable = true)
 |-- department: string (nullable = true)
 |-- school: string (nullable = true)



In [10]:
# Print the graduate_program rows without truncating long cell values.
graduate_program.show(truncate=False)

+---+-------+---------------------+-----------+
|id |degree |department           |school     |
+---+-------+---------------------+-----------+
|0  |Masters|School of Information|UC Berkeley|
|2  |Masters|EECS                 |UC Berkeley|
|1  |Ph.D.  |EECS                 |UC Berkeley|
+---+-------+---------------------+-----------+



In [11]:
# Lookup table of status ids and their labels. Types are inferred from the
# tuples; toDF() then names the two columns.
spark_status = (
    spark.createDataFrame(
        [
            (500, "Vice President"),
            (250, "PMC Member"),
            (100, "Contributor"),
        ]
    )
    .toDF("id", "status")
)

In [12]:
# Inspect the inferred schema; `id` is inferred as long because no explicit schema was given.
spark_status.printSchema()

root
 |-- id: long (nullable = true)
 |-- status: string (nullable = true)



In [13]:
# Print the spark_status lookup rows using show()'s default formatting.
spark_status.show()

+---+--------------+
| id|        status|
+---+--------------+
|500|Vice President|
|250|    PMC Member|
|100|   Contributor|
+---+--------------+



In [14]:
# Register the DataFrames as temp views so they can be queried with spark.sql

In [15]:
# Expose each DataFrame to Spark SQL under a view name matching its Python variable.
# createOrReplaceTempView overwrites any existing view of the same name, so
# re-running this cell is safe.
person.createOrReplaceTempView("person")
graduate_program.createOrReplaceTempView("graduate_program")
spark_status.createOrReplaceTempView("spark_status")

In [16]:
# Display the three input tables side by side for reference.
# show() prints the table itself and returns None, so it is called directly:
# wrapping it in print() only adds a stray "None" line after each table.
person.show(truncate=False)
graduate_program.show(truncate=False)
spark_status.show(truncate=False)

+---+----------------+----------------+---------------+
|id |name            |graduate_program|spark_status   |
+---+----------------+----------------+---------------+
|0  |Bill Chambers   |0               |[100]          |
|1  |Matei Zaharia   |1               |[500, 250, 100]|
|2  |Michael Armbrust|1               |[250, 100]     |
+---+----------------+----------------+---------------+

None
+---+-------+---------------------+-----------+
|id |degree |department           |school     |
+---+-------+---------------------+-----------+
|0  |Masters|School of Information|UC Berkeley|
|2  |Masters|EECS                 |UC Berkeley|
|1  |Ph.D.  |EECS                 |UC Berkeley|
+---+-------+---------------------+-----------+

None
+---+--------------+
|id |status        |
+---+--------------+
|500|Vice President|
|250|PMC Member    |
|100|Contributor   |
+---+--------------+

None


In [17]:
# Inner Join

In [18]:
# Inner join (the default join type): keep only people whose graduate_program
# matches a graduate_program.id.
person.join(
    graduate_program,
    on=person.graduate_program == graduate_program.id,
    how="inner",
).show()



+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+





In [19]:
# Same inner join expressed in Spark SQL against the registered temp views.
spark.sql('''
        select * from 
            person 
            join graduate_program
            on person.graduate_program=graduate_program.id
''').show()

                                                                                

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+





In [20]:
# Full outer join

In [21]:
# Full outer join: keep rows from both sides, filling the unmatched side with nulls.
person.join(
    graduate_program,
    on=person.graduate_program == graduate_program.id,
    how="fullouter",
).show()

                                                                                

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+





In [22]:
# Left outer join

In [23]:
# Left outer join with person on the left: every person row is kept.
person.join(
    graduate_program,
    on=person.graduate_program == graduate_program.id,
    how="leftouter",
).show()

                                                                                

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+





In [24]:
# Same left join in Spark SQL, with person as the left (preserved) table.
spark.sql('''
            select * from person
            left join graduate_program
            on person.graduate_program = graduate_program.id
            ''').show()

                                                                                

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [25]:
# Left join with the tables swapped: graduate_program is now the preserved side,
# so a program with no matching person is kept with null person columns.
spark.sql('''
            select * from graduate_program
            left join person
            on person.graduate_program = graduate_program.id
            ''').show()



+---+-------+--------------------+-----------+----+----------------+----------------+---------------+
| id| degree|          department|     school|  id|            name|graduate_program|   spark_status|
+---+-------+--------------------+-----------+----+----------------+----------------+---------------+
|  0|Masters|School of Informa...|UC Berkeley|   0|   Bill Chambers|               0|          [100]|
|  1|  Ph.D.|                EECS|UC Berkeley|   1|   Matei Zaharia|               1|[500, 250, 100]|
|  1|  Ph.D.|                EECS|UC Berkeley|   2|Michael Armbrust|               1|     [250, 100]|
|  2|Masters|                EECS|UC Berkeley|null|            null|            null|           null|
+---+-------+--------------------+-----------+----+----------------+----------------+---------------+



In [26]:
# Left semi join: return only graduate_program columns, and only for programs
# that have at least one matching person.
graduate_program.join(
    person,
    on=person.graduate_program == graduate_program.id,
    how="leftsemi",
).show(truncate=False)



+---+-------+---------------------+-----------+
|id |degree |department           |school     |
+---+-------+---------------------+-----------+
|0  |Masters|School of Information|UC Berkeley|
|1  |Ph.D.  |EECS                 |UC Berkeley|
+---+-------+---------------------+-----------+





In [27]:
# Join on complex types

In [28]:
from pyspark.sql.functions import array_contains

In [30]:
# Rebind `person` to a new DataFrame whose `id` column is renamed to `person_id`.
# NOTE(review): the "person" temp view was registered before this rename and is
# not re-registered here, so it presumably still exposes `id` — confirm if SQL
# cells are added below.
person = person.withColumnRenamed("id","person_id")

In [31]:
# Confirm the rename: the first column should now be person_id.
person.show()

+---------+----------------+----------------+---------------+
|person_id|            name|graduate_program|   spark_status|
+---------+----------------+----------------+---------------+
|        0|   Bill Chambers|               0|          [100]|
|        1|   Matei Zaharia|               1|[500, 250, 100]|
|        2|Michael Armbrust|               1|     [250, 100]|
+---------+----------------+----------------+---------------+



In [33]:
# Rebind `spark_status` to a new DataFrame whose `id` column is renamed to `status_id`.
spark_status = spark_status.withColumnRenamed("id","status_id")

In [34]:
# Confirm the rename: the first column should now be status_id.
spark_status.show()

+---------+--------------+
|status_id|        status|
+---------+--------------+
|      500|Vice President|
|      250|    PMC Member|
|      100|   Contributor|
+---------+--------------+



In [37]:
# Join on an array column: a person row matches every status whose status_id
# appears in that person's spark_status array, giving one output row per match.
person.join(
    spark_status,
    on=array_contains(person.spark_status, spark_status.status_id),
).show()

+---------+----------------+----------------+---------------+---------+--------------+
|person_id|            name|graduate_program|   spark_status|status_id|        status|
+---------+----------------+----------------+---------------+---------+--------------+
|        0|   Bill Chambers|               0|          [100]|      100|   Contributor|
|        1|   Matei Zaharia|               1|[500, 250, 100]|      500|Vice President|
|        1|   Matei Zaharia|               1|[500, 250, 100]|      250|    PMC Member|
|        1|   Matei Zaharia|               1|[500, 250, 100]|      100|   Contributor|
|        2|Michael Armbrust|               1|     [250, 100]|      250|    PMC Member|
|        2|Michael Armbrust|               1|     [250, 100]|      100|   Contributor|
+---------+----------------+----------------+---------------+---------+--------------+



In [38]:
# Re-display graduate_program for reference before deriving a renamed copy of it.
graduate_program.show()

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  2|Masters|                EECS|UC Berkeley|
|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+



In [39]:
# Re-display person (with the renamed person_id column) for reference.
person.show()

+---------+----------------+----------------+---------------+
|person_id|            name|graduate_program|   spark_status|
+---------+----------------+----------------+---------------+
|        0|   Bill Chambers|               0|          [100]|
|        1|   Matei Zaharia|               1|[500, 250, 100]|
|        2|Michael Armbrust|               1|     [250, 100]|
+---------+----------------+----------------+---------------+



In [40]:
# Copy of graduate_program whose key column is renamed to `graduate_program`,
# so that it shares a column name with person.graduate_program.
graduate_programDup = graduate_program.withColumnRenamed("id","graduate_program")

In [41]:
# Display both join inputs; each has a column named graduate_program.
# show() already prints the table and returns None, so it is called directly
# to avoid the extra "None" line that print() would emit.
person.show(truncate=False)
graduate_programDup.show(truncate=False)

+---------+----------------+----------------+---------------+
|person_id|name            |graduate_program|spark_status   |
+---------+----------------+----------------+---------------+
|0        |Bill Chambers   |0               |[100]          |
|1        |Matei Zaharia   |1               |[500, 250, 100]|
|2        |Michael Armbrust|1               |[250, 100]     |
+---------+----------------+----------------+---------------+

None
+----------------+-------+---------------------+-----------+
|graduate_program|degree |department           |school     |
+----------------+-------+---------------------+-----------+
|0               |Masters|School of Information|UC Berkeley|
|2               |Masters|EECS                 |UC Berkeley|
|1               |Ph.D.  |EECS                 |UC Berkeley|
+----------------+-------+---------------------+-----------+

None


In [42]:
# Joining with a column-expression condition keeps both key columns, so the
# result contains two columns that are both named graduate_program.
person.join(
    graduate_programDup,
    on=person.graduate_program == graduate_programDup.graduate_program,
).show()



+---------+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
|person_id|            name|graduate_program|   spark_status|graduate_program| degree|          department|     school|
+---------+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
|        0|   Bill Chambers|               0|          [100]|               0|Masters|School of Informa...|UC Berkeley|
|        1|   Matei Zaharia|               1|[500, 250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
|        2|Michael Armbrust|               1|     [250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
+---------+----------------+----------------+---------------+----------------+-------+--------------------+-----------+



In [44]:
# Joining on the column *name* instead of an expression lets the shared key be
# selected by name afterwards.
(
    person
    .join(graduate_programDup, on="graduate_program")
    .select("graduate_program")
    .show()
)



+----------------+
|graduate_program|
+----------------+
|               0|
|               1|
|               1|
+----------------+

