In [1]:
import os

from pyspark.sql import SparkSession

# Jars needed for S3A (MinIO) access and Delta Lake support. They are kept as a
# list and joined with "," below so that no empty entry (from a stray trailing
# comma) ends up in `spark.jars`.
SPARK_JARS = [
    "/opt/spark/jars/hadoop-aws-3.3.4.jar",
    "/opt/spark/jars/s3-2.18.41.jar",
    "/opt/spark/jars/aws-java-sdk-1.12.367.jar",
    "/opt/spark/jars/delta-core_2.12-2.4.0.jar",
    "/opt/spark/jars/delta-storage-2.4.0.jar",
    "/opt/spark/jars/aws-java-sdk-bundle-1.12.367.jar",
]

# MinIO credentials are taken from the environment when present so real secrets
# never have to be committed with the notebook. The fallbacks are the values
# this notebook previously hardcoded, so an unconfigured environment behaves
# exactly as before.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio123")

# Create a Spark session wired to the standalone master, MinIO (through S3A),
# Delta Lake and the Hive metastore.
spark = (
    SparkSession.builder.appName("PySpark with MinIO, Delta, and Hive")
    .master("spark://spark-master:7077")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.jars", ",".join(SPARK_JARS))
    # S3A settings: plain-HTTP endpoint with path-style bucket addressing.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Delta Lake: session catalog implementation plus SQL extensions.
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.warehouse.dir", "s3a://warehouse/")
    .config("spark.eventLog.enabled", "false")
    # A builder-created session uses Spark's in-memory catalog unless Hive
    # support is enabled; without this call `hive.metastore.uris` above has no
    # effect and table definitions are not persisted in the metastore.
    .enableHiveSupport()
    .getOrCreate()
)


/opt/spark/bin/load-spark-env.sh: line 68: ps: command not found
24/10/13 14:24:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Delta table locations, one directory per dataset, all under the same
# "deltalake" bucket addressed through S3A.
DELTA_ROOT = "s3a://deltalake"
batch_business = f"{DELTA_ROOT}/business"
batch_user = f"{DELTA_ROOT}/user"
batch_check_in = f"{DELTA_ROOT}/check_in"
batch_tip = f"{DELTA_ROOT}/tip"

In [4]:
def _json_to_delta(source_uri, target_path):
    """Read a JSON dataset and overwrite the Delta table at ``target_path``.

    Args:
        source_uri: Location of the JSON input passed to ``spark.read.json``.
        target_path: Destination path handed to the Delta writer's ``save``.

    Returns:
        The DataFrame that was read from ``source_uri``.
    """
    frame = spark.read.json(source_uri)
    frame.write.format("delta").mode("overwrite").save(target_path)
    return frame


# Load each Yelp JSON file and persist it as a Delta table, one dataset at a
# time (read, then write, before moving on to the next dataset).
businessDF = _json_to_delta(
    "s3a://data/yelp_academic_dataset_business.json", batch_business
)
userDF = _json_to_delta("s3a://data/yelp_academic_dataset_user.json", batch_user)
check_inDF = _json_to_delta(
    "s3a://data/yelp_academic_dataset_checkin.json", batch_check_in
)
tipDF = _json_to_delta("s3a://data/yelp_academic_dataset_tip.json", batch_tip)

24/10/13 15:16:25 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/10/13 15:16:53 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
def _register_delta_table(table_name, table_location):
    """Register an existing Delta directory as a named table if it is absent.

    Args:
        table_name: Name to give the table in the session's catalog.
        table_location: Path of the Delta table the name should point at.

    Returns:
        The DataFrame returned by ``spark.sql`` for the DDL statement.
    """
    ddl = f"""
    CREATE TABLE IF NOT EXISTS {table_name}
    USING DELTA
    LOCATION '{table_location}'
"""
    return spark.sql(ddl)


# Expose each Delta location written earlier as a SQL table. The final call is
# left as a bare expression so the cell still displays its result.
_register_delta_table("business", batch_business)
_register_delta_table("user", batch_user)
_register_delta_table("checkin", batch_check_in)
_register_delta_table("tip", batch_tip)

DataFrame[]

In [6]:
# List the tables visible in the current namespace to verify the registrations.
registered_tables = spark.sql("SHOW TABLES")
registered_tables.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default| business|      false|
|  default|  checkin|      false|
|  default|      tip|      false|
|  default|     user|      false|
+---------+---------+-----------+



In [8]:
# Stop the Spark session created at the top of the notebook.
spark.stop()