http://localhost:8888/edit/Mastering-Big-Data-Analytics-with-PySpark/Section%203%20-%20Preparing%20Data%20using%20SparkSQL/3.4/join_types.py

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, StructType, StructField


# Obtain the Spark session for this notebook (getOrCreate reuses an active
# session when one already exists).
spark = (
    SparkSession.builder
    .appName("join_tests")
    .getOrCreate()
)

# Column layout shared by both demo DataFrames: an integer `id` (the join key)
# and a string `value`.
id_field = StructField("id", IntegerType())
value_field = StructField("value", StringType())
schema = StructType([id_field, value_field])

In [3]:
# Left-hand rows: ids 1-5 plus one row whose id is missing (None).
rows_a = [
    (1, "A"),
    (2, "B"),
    (3, "C"),
    (4, "D"),
    (5, "E"),
    (None, "Z"),
]

# Right-hand rows: ids 3-7, so only ids 3, 4 and 5 also occur on the left.
rows_b = [
    (3, "C"),
    (4, "D"),
    (5, "E"),
    (6, "F"),
    (7, "G"),
]

# Both DataFrames use the same two-column schema so they can be joined on `id`.
A = spark.createDataFrame(rows_a, schema=schema)
B = spark.createDataFrame(rows_b, schema=schema)

In [4]:
# Print the left-hand DataFrame; the None id is rendered as `null`.
A.show()

+----+-----+
|  id|value|
+----+-----+
|   1|    A|
|   2|    B|
|   3|    C|
|   4|    D|
|   5|    E|
|null|    Z|
+----+-----+



In [5]:
# INNER JOIN
# Keeps only the ids present in both A and B (3, 4 and 5). Joining on a list of
# column names yields a single `id` column, followed by `value` from A and
# `value` from B.
inner_joined = A.join(B, on=["id"], how="inner")
inner_joined.show()

+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  3|    C|    C|
|  5|    E|    E|
|  4|    D|    D|
+---+-----+-----+



In [6]:
# CROSS JOIN (CARTESIAN PRODUCT)
# Pairs every row of A with every row of B, with no join key involved.
# A has 6 rows and B has 5, so the product has 30 rows; show() prints only the
# first 20 by default.
cartesian = A.crossJoin(B)
cartesian.show()

+---+-----+---+-----+
| id|value| id|value|
+---+-----+---+-----+
|  1|    A|  3|    C|
|  1|    A|  4|    D|
|  1|    A|  5|    E|
|  1|    A|  6|    F|
|  1|    A|  7|    G|
|  2|    B|  3|    C|
|  3|    C|  3|    C|
|  2|    B|  4|    D|
|  3|    C|  4|    D|
|  2|    B|  5|    E|
|  3|    C|  5|    E|
|  2|    B|  6|    F|
|  2|    B|  7|    G|
|  3|    C|  6|    F|
|  3|    C|  7|    G|
|  4|    D|  3|    C|
|  4|    D|  4|    D|
|  4|    D|  5|    E|
|  4|    D|  6|    F|
|  4|    D|  7|    G|
+---+-----+---+-----+
only showing top 20 rows



In [7]:
# FULL OUTER JOIN
# Keeps every row from both sides; where one side has no matching id, its
# `value` column is null. "outer", "full" and "full_outer" are alternative
# names for this join type.
full_joined = A.join(B, on=["id"], how="outer")
full_joined.show()

+----+-----+-----+
|  id|value|value|
+----+-----+-----+
|null|    Z| null|
|   1|    A| null|
|   6| null|    F|
|   3|    C|    C|
|   5|    E|    E|
|   4|    D|    D|
|   7| null|    G|
|   2|    B| null|
+----+-----+-----+



In [8]:
# Full outer join again, spelled with the "full" join-type name.
A.join(B, on=["id"], how="full").show()

+----+-----+-----+
|  id|value|value|
+----+-----+-----+
|null|    Z| null|
|   1|    A| null|
|   6| null|    F|
|   3|    C|    C|
|   5|    E|    E|
|   4|    D|    D|
|   7| null|    G|
|   2|    B| null|
+----+-----+-----+



In [9]:
# Full outer join again, spelled with the "full_outer" join-type name.
A.join(B, on=["id"], how="full_outer").show()

+----+-----+-----+
|  id|value|value|
+----+-----+-----+
|null|    Z| null|
|   1|    A| null|
|   6| null|    F|
|   3|    C|    C|
|   5|    E|    E|
|   4|    D|    D|
|   7| null|    G|
|   2|    B| null|
+----+-----+-----+



In [10]:
# LEFT OUTER JOIN
# Keeps every row of A (including the row with a null id); B's `value` is null
# where B has no matching id. "left" and "left_outer" are alternative names for
# this join type.
left_joined = A.join(B, on=["id"], how="left")
left_joined.show()

+----+-----+-----+
|  id|value|value|
+----+-----+-----+
|null|    Z| null|
|   1|    A| null|
|   3|    C|    C|
|   5|    E|    E|
|   4|    D|    D|
|   2|    B| null|
+----+-----+-----+



In [11]:
# Left outer join again, spelled with the "left_outer" join-type name.
A.join(B, on=["id"], how="left_outer").show()

+----+-----+-----+
|  id|value|value|
+----+-----+-----+
|null|    Z| null|
|   1|    A| null|
|   3|    C|    C|
|   5|    E|    E|
|   4|    D|    D|
|   2|    B| null|
+----+-----+-----+



In [12]:
# RIGHT OUTER JOIN
# Keeps every row of B; A's `value` is null where A has no matching id (6 and
# 7). "right" and "right_outer" are alternative names for this join type.
right_joined = A.join(B, on=["id"], how="right")
right_joined.show()

+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  6| null|    F|
|  3|    C|    C|
|  5|    E|    E|
|  4|    D|    D|
|  7| null|    G|
+---+-----+-----+



In [13]:
# Right outer join again, spelled with the "right_outer" join-type name.
A.join(B, on=["id"], how="right_outer").show()

+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  6| null|    F|
|  3|    C|    C|
|  5|    E|    E|
|  4|    D|    D|
|  7| null|    G|
+---+-----+-----+



In [14]:
# LEFT SEMI JOIN
# Acts as a filter on A: keeps the rows of A whose id also occurs in B, and
# returns A's columns only (no columns from B appear in the result).
semi_joined = A.join(B, on=["id"], how="left_semi")
semi_joined.show()

+---+-----+
| id|value|
+---+-----+
|  3|    C|
|  5|    E|
|  4|    D|
+---+-----+



In [15]:
# LEFT ANTI JOIN
# The complement of the left semi join: keeps the rows of A whose id has no
# match in B (ids 1 and 2, plus the row with a null id), returning A's columns
# only.
anti_joined = A.join(B, on=["id"], how="left_anti")
anti_joined.show()

+----+-----+
|  id|value|
+----+-----+
|null|    Z|
|   1|    A|
|   2|    B|
+----+-----+



In [16]:
# IPython shell escape: print the current working directory.
# NOTE(review): the recorded output is an absolute, machine-specific path;
# consider clearing this cell's output before sharing the notebook.
!pwd

/home/wengong/projects/py4kids/lesson-70-apache
