In [None]:
# Bootstrap cell: run this before any cell that imports `pyspark`.
# NOTE(review): findspark.init() presumably locates the local Spark installation
# (e.g. via SPARK_HOME) and puts its Python bindings on sys.path - confirm against
# the findspark documentation.
import findspark
findspark.init()

In [None]:
"""
Configure before creating SparkSession
"""

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Cluster endpoints, hoisted into named constants so they can be changed in one place.
SPARK_MASTER_URL = "spark://192.168.11.77:7077"
HIVE_METASTORE_URI = "thrift://192.168.93.128:9083"
HIVE_WAREHOUSE_DIR = "hdfs://192.168.93.128:9000/user/hive/warehouse"

# `config` and `conf` end up referring to the same SparkConf object: every setter
# returns the instance it was called on, which is what makes the chaining work.
config = SparkConf()
conf = (
    config
    .setMaster(SPARK_MASTER_URL)
    .setAppName("SparkDB")
    # Values are passed as strings, matching SparkConf.set(key: str, value: str).
    .set("spark.executor.memory", "2g")
    .set("spark.executor.cores", "4")
    .set("spark.cores.max", "4")
    .set("spark.driver.memory", "2g")
    .set("hive.metastore.uris", HIVE_METASTORE_URI)
    # `hive.metastore.warehouse.dir` is deprecated since Spark 2.0; the warehouse
    # location has to be given through `spark.sql.warehouse.dir` instead.
    .set("spark.sql.warehouse.dir", HIVE_WAREHOUSE_DIR)
    # Local (Windows) alternatives that were used while experimenting:
    # .set("spark.local.dir", "C:/spark-temp")     # TEMPORARY HARDCODING: Spark's temp
    # .set("spark.sql.warehouse.dir", "C:/spark")  # TEMPORARY HARDCODING: Spark's warehouse
)

# Ensure $HIVE_HOME/hive-site.xml is configured with
#   hive.metastore.uris
#   +
#   hive.metastore.warehouse.dir

# Create (or reuse) the session with Hive support so saveAsTable()/sql() go through
# the Hive metastore configured above.
ss = (
    SparkSession
    .builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Sample rows for the join demos. The two tables are deliberately mismatched:
# one product has no brand and one brand has no product.
products = (
    # (product_id, product_name, brand_id)
    (1, 'iPhone', 100),
    (2, 'Galaxy', 200),
    (3, 'RedMi', 300),  # orphan record, no matching brand
    (4, 'Pixel', 400),
)
brands = (
    # (brand_id, brand_name)
    (100, "Apple"),
    (200, "Samsung"),
    (400, "Google"),
    (500, "Sony"),  # no matching product
)

df_product = ss.createDataFrame(data=products, schema=("product_id", "product_name", "brand_id"))
df_brand = ss.createDataFrame(data=brands, schema=("brand_id", "brand_name"))

# show() prints the table and returns None. Calling the two as separate statements
# (instead of `a.show(), b.show()`) avoids building a tuple that the notebook would
# echo as a meaningless `(None, None)` cell output.
df_product.show()
df_brand.show()

---

In [None]:
"""
WRITE DF into DB table
"""


"""
Writes to `default` DB in the metastore_db --> (Writes to C:/spark/brand_test/ (.parquet files))
"""
# Unqualified table name: the table is registered in the `default` database.
df_brand.write.mode("overwrite").saveAsTable("brand_test")

# saveAsTable() does not create a missing database, so make sure `product_db`
# exists first; otherwise this cell fails on a fresh metastore.
ss.sql("CREATE DATABASE IF NOT EXISTS product_db")

# Qualified table name: the table is created in `product_db` in Hive's metastore and
# its data files are written under that database's directory in the configured
# warehouse location (<warehouse>/product_db.db/brand/ by default).
df_brand.write.mode("overwrite").saveAsTable("product_db.brand")

In [None]:
# Read the saved table back through Spark SQL to verify the write.
brand_rows = ss.sql("SELECT * FROM product_db.brand")
brand_rows.show()

In [None]:
# Print the built-in documentation of the DataFrame writer's `mode` method.
writer_mode = df_brand.write.mode
help(writer_mode)

In [None]:
# A PySpark DataFrame has no getNumPartitions() method (calling it raises
# AttributeError); the partition count is exposed by the underlying RDD.
df_brand.rdd.getNumPartitions()

---