In [1]:
# findspark.init() runs before any pyspark import in this notebook.
# Presumably it makes the local Spark/pyspark installation importable -- confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
"""
Build the SparkConf first, then create the SparkSession from it.
"""

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Plain key/value settings, applied in this order through SparkConf.setAll().
spark_settings = [
    ("spark.executor.memory", "2g"),
    ("spark.executor.cores", 4),
    ("spark.cores.max", 4),
    ("spark.driver.memory", "2g"),
    ("spark.local.dir", "C:/spark-temp"),     # TEMPORARY HARDCODING: Spark's temp
    ("spark.sql.warehouse.dir", "C:/spark"),  # TEMPORARY HARDCODING: Spark's warehouse
]

config = SparkConf()
config.setMaster("spark://192.168.11.77:7077")
config.setAppName("SparkDB")
# The setters return the SparkConf itself, so `conf` and `config` are the same object.
conf = config.setAll(spark_settings)

# Hive support is enabled so that tables can be persisted through the metastore.
session_builder = SparkSession.builder.config(conf=conf).enableHiveSupport()
ss = session_builder.getOrCreate()

In [3]:
# Sample rows for the product table.
product_columns = ("product_id", "product_name", "brand_id")
products = (
    (1, "iPhone", 100),
    (2, "Galaxy", 200),
    (3, "RedMi", 300),  # orphan record: brand_id 300 has no entry in `brands`
    (4, "Pixel", 400),
)

# Sample rows for the brand table.
brand_columns = ("brand_id", "brand_name")
brands = (
    (100, "Apple"),
    (200, "Samsung"),
    (400, "Google"),
    (500, "Sony"),  # no product references brand_id 500
)

df_product = ss.createDataFrame(products, schema=product_columns)
df_brand = ss.createDataFrame(brands, schema=brand_columns)

# Both tables are printed; the tuple of the two show() results is what the
# notebook echoes as (None, None) after the tables.
(df_product.show(), df_brand.show())

+----------+------------+--------+
|product_id|product_name|brand_id|
+----------+------------+--------+
|         1|      iPhone|     100|
|         2|      Galaxy|     200|
|         3|       RedMi|     300|
|         4|       Pixel|     400|
+----------+------------+--------+

+--------+----------+
|brand_id|brand_name|
+--------+----------+
|     100|     Apple|
|     200|   Samsung|
|     400|    Google|
|     500|      Sony|
+--------+----------+



(None, None)

---

In [5]:
"""
WRITE DF into DB table

Persists df_brand twice with saveAsTable(). mode("overwrite") replaces any
existing data, so the cell can be re-run safely.
"""

# 1) Unqualified table name -> `default` DB in the metastore_db.
#    Files go to C:/spark/brand_test/ (.parquet files), i.e. directly under
#    the configured spark.sql.warehouse.dir.
df_brand.write.mode("overwrite").saveAsTable("brand_test")

# 2) Qualified table name -> table `brand` in database `product_db`,
#    registered in Hive's metastore_db.
#    saveAsTable() does not create the database, so create it first; without
#    this the cell fails on a fresh metastore (Restart & Run All).
#    IF NOT EXISTS keeps the statement idempotent and leaves an existing
#    database untouched.
#    NOTE(review): the original note gave C:/spark/product_db/brand/ as the
#    output path; the actual directory depends on the database's location --
#    confirm on the target environment.
ss.sql("CREATE DATABASE IF NOT EXISTS product_db")
df_brand.write.mode("overwrite").saveAsTable("product_db.brand")

In [6]:
# Read the persisted table back through Spark SQL.
# The query has no ORDER BY, so the row order shown is not the insertion order.
brand_table = ss.sql("SELECT * FROM product_db.brand")
brand_table.show()

+--------+----------+
|brand_id|brand_name|
+--------+----------+
|     200|   Samsung|
|     400|    Google|
|     100|     Apple|
|     500|      Sony|
+--------+----------+



In [7]:
# Interactive reference: prints the docstring of DataFrameWriter.mode, which
# lists the accepted save modes (append, overwrite, error/errorifexists, ignore).
help(df_brand.write.mode)

Help on method mode in module pyspark.sql.readwriter:

mode(saveMode) method of pyspark.sql.readwriter.DataFrameWriter instance
    Specifies the behavior when data or table already exists.
    
    Options include:
    
    * `append`: Append contents of this :class:`DataFrame` to existing data.
    * `overwrite`: Overwrite existing data.
    * `error` or `errorifexists`: Throw an exception if data already exists.
    * `ignore`: Silently ignore this operation if data already exists.
    
    >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
    
    .. versionadded:: 1.4



---

DELETE BELOW

In [None]:
"""
INNER JOIN (explicit join condition)
df_product is the left side of the join, df_brand the right side.
"""

# Name the join condition so the join call itself stays short.
brand_id_matches = df_product["brand_id"] == df_brand["brand_id"]
df_product.join(df_brand, on=brand_id_matches, how="inner").show()

In [None]:
"""
INNER JOIN (shorthand)
When both DataFrames share the join column name, it can be specified once.
"""

inner_by_name = df_product.join(df_brand, "brand_id", "inner")
inner_by_name.show()

---

In [None]:
"""
OUTER JOIN / FULL OUTER JOIN
Returns records from both the left and the right DataFrame,
whether or not they have a match on the other side.
"""

full_outer = df_product.join(df_brand, "brand_id", how="outer")
full_outer.show()

In [None]:
"""
LEFT / LEFT OUTER JOIN (how="left")
Every record of the left DataFrame is kept.
Where the right DataFrame has no match, its columns are null.
"""

left_joined = df_product.join(df_brand, "brand_id", how="left")
left_joined.show()

In [None]:
"""
LEFT / LEFT OUTER JOIN (how="leftouter" spelling)
Every record of the left DataFrame is kept.
Where the right DataFrame has no match, its columns are null.
"""

left_outer_joined = df_product.join(df_brand, "brand_id", how="leftouter")
left_outer_joined.show()

In [None]:
"""
RIGHT / RIGHT OUTER JOIN
Every record of the right DataFrame is kept.
Where the left DataFrame has no match, its columns are null.
"""

right_outer_joined = df_product.join(df_brand, "brand_id", how="rightouter")
right_outer_joined.show()

In [None]:
"""
LEFT-SEMI JOIN
Returns only the left-DataFrame records that have a match in the right
DataFrame, and only the left DataFrame's columns are retained.
"""

# Same semi join expressed two ways: by shared column name and by explicit condition.
semi_join_variants = (
    ("-- join(on) --", "brand_id"),
    ("-- join(==) --", df_product["brand_id"] == df_brand["brand_id"]),
)
for label, join_on in semi_join_variants:
    print(label)
    df_product.join(df_brand, on=join_on, how="leftsemi").show()

In [None]:
"""
LEFT-ANTI JOIN
The exact opposite of LEFT-SEMI: returns the left-DataFrame records
that do NOT have a match in the right DataFrame.
"""

# Same anti join twice: first by shared column name, then by explicit condition.
for join_on in ("brand_id", df_product["brand_id"] == df_brand["brand_id"]):
    df_product.join(df_brand, on=join_on, how="leftanti").show()

In [None]:
"""
CROSS JOIN
Cartesian product: every df_product record is paired with every df_brand record.
crossJoin() takes no join condition.
"""

# Disabled join(..., "cross") variants kept from the original cell:
# df_product.join(df_brand, on="brand_id", how="cross").show()
# df_product.join(df_brand, df_product["brand_id"] ==  df_brand["brand_id"], "cross").show()
cartesian = df_product.crossJoin(df_brand)
cartesian.show()

In [None]:
# Interactive reference: prints the docstring of df_product.join.
# NOTE(review): exploratory helper -- consider removing it from the finished notebook.
help(df_product.join)