In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, current_date, col

# Build (or reuse) the Spark session used by every cell below, configured for Hudi.
spark = (
    SparkSession.builder
    .appName("HudiTest")
    # Hudi bundle jar to put on the application classpath.
    .config("spark.jars", "/opt/spark/jars/hudi-spark3-bundle_2.12-1.0.0.jar")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    # Use the CORRECTED datetime rebase mode for both Parquet writes and reads.
    .config("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
    # Standalone cluster master; each executor is given two cores.
    .master("spark://spark-master:7077")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

# Echo the session so the notebook renders it as this cell's output.
spark


In [2]:
# Sample frame: ids 0..15, each row stamped with the current date and timestamp.
df = (
    spark.range(16)
    .withColumn("date", current_date())
    .withColumn("timestamp", current_timestamp())
)
df.show()

+---+----------+--------------------+
| id|      date|           timestamp|
+---+----------+--------------------+
|  0|2025-05-22|2025-05-22 17:19:...|
|  1|2025-05-22|2025-05-22 17:19:...|
|  2|2025-05-22|2025-05-22 17:19:...|
|  3|2025-05-22|2025-05-22 17:19:...|
|  4|2025-05-22|2025-05-22 17:19:...|
|  5|2025-05-22|2025-05-22 17:19:...|
|  6|2025-05-22|2025-05-22 17:19:...|
|  7|2025-05-22|2025-05-22 17:19:...|
|  8|2025-05-22|2025-05-22 17:19:...|
|  9|2025-05-22|2025-05-22 17:19:...|
| 10|2025-05-22|2025-05-22 17:19:...|
| 11|2025-05-22|2025-05-22 17:19:...|
| 12|2025-05-22|2025-05-22 17:19:...|
| 13|2025-05-22|2025-05-22 17:19:...|
| 14|2025-05-22|2025-05-22 17:19:...|
| 15|2025-05-22|2025-05-22 17:19:...|
+---+----------+--------------------+



In [3]:
# Writer options passed to the Hudi datasource when saving a DataFrame.
hudi_options = {
    "hoodie.table.name": "my_table",
    # Unique identifier for records.
    "hoodie.datasource.write.recordkey.field": "id",
    # Partition column (must exist in the DataFrame being written).
    "hoodie.datasource.write.partitionpath.field": "date",
    # Used to deduplicate in favour of newer rows.
    "hoodie.datasource.write.precombine.field": "timestamp",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.SimpleKeyGenerator",
    # "upsert" here; "insert" is an alternative for a first-time load.
    "hoodie.datasource.write.operation": "upsert",
    # Alternative table type: MERGE_ON_READ.
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
    # Hive sync is turned off for now.
    "hoodie.datasource.hive_sync.enable": "false",
    # Optional: lay partitions out as date=2024-05-21 style directories.
    "hoodie.datasource.write.hive_style_partitioning": "true",
}


In [4]:

# Save the sample frame to HDFS in Hudi format, using the options in
# hudi_options and Spark's "append" save mode.
(
    df.write
    .format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save("hdfs://namenode:9000/data/hudi/my_table")
)

In [5]:
# Load the my_table Hudi table back from HDFS and preview its rows.
df_read = (
    spark.read
    .format("hudi")
    .load("hdfs://namenode:9000/data/hudi/my_table")
)
df_read.show()

+-------------------+--------------------+------------------+----------------------+--------------------+---+----------+--------------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| id|      date|           timestamp|
+-------------------+--------------------+------------------+----------------------+--------------------+---+----------+--------------------+
|  20250522171915221|20250522171915221...|                 7|       date=2025-05-22|179eb18e-41c1-438...|  7|2025-05-22|2025-05-22 17:19:...|
|  20250522171915221|20250522171915221...|                 3|       date=2025-05-22|179eb18e-41c1-438...|  3|2025-05-22|2025-05-22 17:19:...|
|  20250522171915221|20250522171915221...|                14|       date=2025-05-22|179eb18e-41c1-438...| 14|2025-05-22|2025-05-22 17:19:...|
|  20250522171915221|20250522171915221...|                10|       date=2025-05-22|179eb18e-41c1-438...| 10|2025-05-22|2025-05-22 17:19:...|
|  202

In [6]:
# Rebind df_read to the ev_data_cleaned Hudi table (it no longer refers to my_table).
df_read = (
    spark.read
    .format("hudi")
    .load("hdfs://namenode:9000/data/hudi/ev_data_cleaned")
)

In [7]:
# Echo the DataFrame; its repr lists the column names and types.
df_read

DataFrame[_hoodie_commit_time: string, _hoodie_commit_seqno: string, _hoodie_record_key: string, _hoodie_partition_path: string, _hoodie_file_name: string, vehical_number: string, county: string, city: string, state: string, postal_code: int, model_year: int, make: string, model: string, vehical_type: string, cavf_eligibility: string, electric_range: int, base_msrp: int, legislative_district: int, electric_utility: string, longitude: double, latitude: double, event_time: timestamp]