In [1]:
# Bootstrap step: findspark.init() runs before any pyspark import in the cells below.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf

# Base configuration: the cluster master URL plus the application name,
# set through one fluent chain on a fresh SparkConf.
config = (
    SparkConf()
    .setMaster("spark://192.168.11.77:7077")
    .setAppName("JOINs:CLUSTER")
)


In [3]:
# Executor / driver resource settings applied on top of `config`.
# The pairs are applied in the order listed.
resource_settings = [
    ("spark.executor.memory", "2g"),
    ("spark.executor.cores", 4),
    ("spark.cores.max", 4),
    ("spark.driver.memory", "2g"),
]
conf = config.setAll(resource_settings)

In [4]:
from pyspark.sql import SparkSession

# Feed the prepared SparkConf into the session builder, then obtain the
# session that the remaining cells refer to as `ss`.
session_builder = SparkSession.builder.config(conf=conf)
ss = session_builder.getOrCreate()

In [5]:
# Sample data for the join demos. Product 3 points at a brand_id that is not
# present in `brands`, and brand 500 is referenced by no product, so the
# different join types produce visibly different results.
products = (
    # (product_id, product_name, brand_id)
    (1, 'iPhone', 100),
    (2, 'Galaxy', 200),
    (3, 'RedMi', 300),  # orphan record, no matching brand
    (4, 'Pixel', 400),
)
brands = (
    # (brand_id, brand_name)
    (100, "Apple"),
    (200, "Samsung"),
    (400, "Google"),
    (500, "Sony"),  # no matching product
)

df_product = ss.createDataFrame(data=products, schema=("product_id", "product_name", "brand_id"))
df_brand = ss.createDataFrame(data=brands, schema=("brand_id", "brand_name"))

# Show each frame with its own statement. Joining the two calls with a comma
# builds a tuple of their return values, which the notebook then echoes as
# `(None, None)` underneath the tables.
df_product.show()
df_brand.show()

+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         1|      iPhone|     100|
|         2|      Galaxy|     200|
|         3|       RedMi|     300|
|         4|       Pixel|     400|
+----------+------------+--------+

+--------+----------+
|brand_id|brand_name|
+--------+----------+
|     100|     Apple|
|     200|   Samsung|
|     400|    Google|
|     500|      Sony|
+--------+----------+



(None, None)

---

In [6]:
"""
INNER JOIN
df_product (left side of the join) JOIN df_brand (right side of the join)
"""

df_product.join(df_brand, df_product["brand_id"] ==  df_brand["brand_id"], how="inner").show()

+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     200|   Samsung|
|         4|       Pixel|     400|     400|    Google|
+----------+------------+--------+--------+----------+



In [7]:
"""
Same as above.
Common join column ca be specified once.
"""

df_product.join(df_brand, on="brand_id", how="inner").show()

+--------+----------+------------+----------+
|brand_id|product_id|product_name|brand_name|
+--------+----------+------------+----------+
|     100|         1|      iPhone|     Apple|
|     200|         2|      Galaxy|   Samsung|
|     400|         4|       Pixel|    Google|
+--------+----------+------------+----------+



---

In [8]:
"""
OUTER JOIN / FULL OUTER JOIN
Records from both the left + right DFs (whether matches or not)
"""

df_product.join(df_brand, on="brand_id", how="outer").show()

+--------+----------+------------+----------+
|brand_id|product_id|product_name|brand_name|
+--------+----------+------------+----------+
|     500|      null|        null|      Sony|
|     100|         1|      iPhone|     Apple|
|     200|         2|      Galaxy|   Samsung|
|     400|         4|       Pixel|    Google|
|     300|         3|       RedMi|      null|
+--------+----------+------------+----------+



In [9]:
"""
LEFT / LEFT OUTER JOIN
All records from left DF.
Unmatched right DF entries appear as null.
"""

df_product.join(df_brand, on="brand_id", how="left").show()

+--------+----------+------------+----------+
|brand_id|product_id|product_name|brand_name|
+--------+----------+------------+----------+
|     100|         1|      iPhone|     Apple|
|     200|         2|      Galaxy|   Samsung|
|     400|         4|       Pixel|    Google|
|     300|         3|       RedMi|      null|
+--------+----------+------------+----------+



In [10]:
"""
LEFT / LEFT OUTER JOIN
All records from left DF.
Unmatched right DF entries appear as null.
"""

df_product.join(df_brand, on="brand_id", how="leftouter").show()

+--------+----------+------------+----------+
|brand_id|product_id|product_name|brand_name|
+--------+----------+------------+----------+
|     100|         1|      iPhone|     Apple|
|     200|         2|      Galaxy|   Samsung|
|     400|         4|       Pixel|    Google|
|     300|         3|       RedMi|      null|
+--------+----------+------------+----------+



In [11]:
"""
RIGHT / RIGHT OUTER JOIN
All records from right DF.
Unmatched left DF entries appear as null.
"""

df_product.join(df_brand, on="brand_id", how="rightouter").show()

+--------+----------+------------+----------+
|brand_id|product_id|product_name|brand_name|
+--------+----------+------------+----------+
|     500|      null|        null|      Sony|
|     100|         1|      iPhone|     Apple|
|     200|         2|      Galaxy|   Samsung|
|     400|         4|       Pixel|    Google|
+--------+----------+------------+----------+



In [12]:
"""
LEFT-SEMI JOIN
LEFT join with only left-columns retained.
"""

print("-- join(on) --")
df_product.join(df_brand, on="brand_id", how="leftsemi").show()

print("-- join(==) --")
df_product.join(df_brand, df_product["brand_id"] ==  df_brand["brand_id"], "leftsemi").show()

-- join(on) --
+--------+----------+------------+
|brand_id|product_id|product_name|
+--------+----------+------------+
|     100|         1|      iPhone|
|     200|         2|      Galaxy|
|     400|         4|       Pixel|
+--------+----------+------------+

-- join(==) --
+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         1|      iPhone|     100|
|         2|      Galaxy|     200|
|         4|       Pixel|     400|
+----------+------------+--------+



In [13]:
"""
LEFT-ANTI JOIN
Exact Opposite of LEFT-SEMI.
LEFT join returns left df records that don't have a right-df match!
"""

df_product.join(df_brand, on="brand_id", how="leftanti").show()
df_product.join(df_brand, df_product["brand_id"] ==  df_brand["brand_id"], "leftanti").show()

+--------+----------+------------+
|brand_id|product_id|product_name|
+--------+----------+------------+
|     300|         3|       RedMi|
+--------+----------+------------+

+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         3|       RedMi|     300|
+----------+------------+--------+



In [22]:
"""
CROSS JOIN
Exact Opposite of LEFT-SEMI.
LEFT join returns left df records that don't have a right-df match!
"""

# df_product.join(df_brand, on="brand_id", how="cross").show()
# df_product.join(df_brand, df_product["brand_id"] ==  df_brand["brand_id"], "cross").show()
df_product.crossJoin(df_brand).show()

+----------+------------+--------+--------+----------+
|product_id|product_name|brand_id|brand_id|brand_name|
+----------+------------+--------+--------+----------+
|         1|      iPhone|     100|     100|     Apple|
|         2|      Galaxy|     200|     100|     Apple|
|         1|      iPhone|     100|     200|   Samsung|
|         2|      Galaxy|     200|     200|   Samsung|
|         1|      iPhone|     100|     400|    Google|
|         2|      Galaxy|     200|     400|    Google|
|         1|      iPhone|     100|     500|      Sony|
|         2|      Galaxy|     200|     500|      Sony|
|         3|       RedMi|     300|     100|     Apple|
|         4|       Pixel|     400|     100|     Apple|
|         3|       RedMi|     300|     200|   Samsung|
|         4|       Pixel|     400|     200|   Samsung|
|         3|       RedMi|     300|     400|    Google|
|         4|       Pixel|     400|     400|    Google|
|         3|       RedMi|     300|     500|      Sony|
|         

In [14]:
# Print the built-in help text for DataFrame.join (its parameters and the accepted `how` values).
help(df_product.join)

Help on method join in module pyspark.sql.dataframe:

join(other, on=None, how=None) method of pyspark.sql.dataframe.DataFrame instance
    Joins with another :class:`DataFrame`, using the given join expression.
    
    :param other: Right side of the join
    :param on: a string for the join column name, a list of column names,
        a join expression (Column), or a list of Columns.
        If `on` is a string or a list of strings indicating the name of the join column(s),
        the column(s) must exist on both sides, and this performs an equi-join.
    :param how: str, default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
        ``full``, ``full_outer``, ``left``, ``left_outer``, ``right``, ``right_outer``,
        ``left_semi``, and ``left_anti``.
    
    The following performs a full outer join between ``df1`` and ``df2``.
    
    >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect()
    [Row(name=None, height=80), Row(name='Bob'