In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

from datetime import datetime


## Create Dataframe and define Schema

In [46]:
# Seed rows for the initial load, one tuple per customer in schema order:
# (cust_id, cust_name, cust_age, cust_loc, create_date, last_updated_time).
#
# The load timestamp is taken once and truncated to whole seconds, so every
# row of this batch carries the same last_updated_time. This replaces a
# per-row strftime/strptime round trip (with a day/month-swapped "%Y-%d-%m"
# pattern) whose only effect was to drop microseconds, and which could stamp
# rows of a single batch with different seconds.
initial_load_time = datetime.now().replace(microsecond=0)

InputData = [
    (1, 'Prasad Nadig', 25, 'NJ', '2022-01-01', initial_load_time),
    (2, 'Ethereum', 80, 'NY', '2022-01-02', initial_load_time),
    (3, 'Cosmos', 25, 'PA', '2022-01-03', initial_load_time),
    (4, 'Solana', 55, 'MD', '2022-01-04', initial_load_time),
    (5, 'Carnado', 15, 'TX', '2022-01-05', initial_load_time),
    (6, 'Link', 45, 'NJ', '2022-01-06', initial_load_time),
]

# Schema of the customer source records. Every column is declared nullable;
# create_date stays a plain string while last_updated_time is a timestamp.
schema = StructType([
    StructField(name="cust_id", dataType=IntegerType(), nullable=True),
    StructField(name="cust_name", dataType=StringType(), nullable=True),
    StructField(name="cust_age", dataType=IntegerType(), nullable=True),
    StructField(name="cust_loc", dataType=StringType(), nullable=True),
    StructField(name="create_date", dataType=StringType(), nullable=True),
    StructField(name="last_updated_time", dataType=TimestampType(), nullable=True),
])

# Build the Spark DataFrame for the initial load from the rows and schema above.
inputDF = spark.createDataFrame(InputData, schema)

In [47]:
# Sanity check: display the rows of the input DataFrame before writing it out.
inputDF.show()

+-------+------------+--------+--------+-----------+-------------------+
|cust_id|   cust_name|cust_age|cust_loc|create_date|  last_updated_time|
+-------+------------+--------+--------+-----------+-------------------+
|      1|Prasad Nadig|      25|      NJ| 2022-01-01|2022-11-10 12:42:11|
|      2|    Ethereum|      80|      NY| 2022-01-02|2022-11-10 12:42:11|
|      3|      Cosmos|      25|      PA| 2022-01-03|2022-11-10 12:42:11|
|      4|      Solana|      55|      MD| 2022-01-04|2022-11-10 12:42:11|
|      5|     Carnado|      15|      TX| 2022-01-05|2022-11-10 12:42:11|
|      6|        Link|      45|      NJ| 2022-01-06|2022-11-10 12:42:11|
+-------+------------+--------+--------+-----------+-------------------+



## Define HUDI options, write data to S3 as HUDI dataset

In [7]:
# Hudi write and Hive-sync settings shared by every write in this notebook.
hudiOptions = {
    # Table identity and record layout: cust_id is the record key, the table
    # is partitioned by create_date, and last_updated_time is the precombine
    # field.
    'hoodie.table.name': 'customer',
    'hoodie.datasource.write.recordkey.field': 'cust_id',
    'hoodie.datasource.write.partitionpath.field': 'create_date',
    'hoodie.datasource.write.precombine.field': 'last_updated_time',
    # Hive metastore sync (HMS mode, no JDBC).
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.table': 'customer',
    # The Hive partition column must be the same column the data is
    # partitioned by on write (create_date). Pointing it at last_updated_time
    # would register a partition column that does not match the storage
    # layout and that clashes with a regular data column.
    'hoodie.datasource.hive_sync.partition_fields': 'create_date',
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
}

In [49]:
# Initial load: write inputDF in Hudi format using the 'insert' operation.
# 'overwrite' mode replaces whatever already exists at the target path.
(
    inputDF.write
    .format('org.apache.hudi')
    .option('hoodie.datasource.write.operation', 'insert')
    .options(**hudiOptions)
    .mode('overwrite')
    .save('s3://emr-studio-emr-on-eks/hudi-tables/')
)

In [None]:
#datetime.strptime(datetime.now().strftime("%Y-%d-%m %H:%M:%S"), "%Y-%d-%m %H:%M:%S")

## Read data from HUDI Dataset we just created

In [3]:
# Hudi performs a snapshot query by default, so a plain read is enough here.
snapshotQueryDF = (
    spark.read
    .format('org.apache.hudi')
    .load('s3://emr-studio-emr-on-eks/hudi-tables' + '/*/*')
)

# Show only the business columns, ordered by customer id.
(
    snapshotQueryDF
    .select(
        "cust_id",
        "cust_name",
        "cust_age",
        "cust_loc",
        "create_date",
        "last_updated_time",
    )
    .orderBy("cust_id")
    .show()
)

+-------+------------+--------+--------+-----------+-------------------+
|cust_id|   cust_name|cust_age|cust_loc|create_date|  last_updated_time|
+-------+------------+--------+--------+-----------+-------------------+
|      2|    Ethereum|      80|      NY| 2022-01-02|2022-11-10 12:42:11|
|      1|Prasad Nadig|      25|      NJ| 2022-01-01|2022-11-10 12:42:11|
|      3|      Cosmos|      25|      PA| 2022-01-03|2022-11-10 12:42:11|
|      4|      Solana|      55|      MD| 2022-01-04|2022-11-10 12:42:11|
|      5|     Carnado|      15|      TX| 2022-01-05|2022-11-10 12:42:11|
|      6|        Link|      45|      NJ| 2022-01-06|2022-11-10 12:42:11|
+-------+------------+--------+--------+-----------+-------------------+



# DML Operations

## UPSERT
###  - HUDI's write operation offers three options: Upsert, Insert, and Bulk Insert. We used Insert in the previous steps; now let's try the Upsert operation.

In [6]:
# Upsert batch: one change to an existing customer and one brand-new customer.
# A Hudi upsert looks each record up by its record key; when the key exists
# the stored record is updated, otherwise the record is inserted.
#
# The batch timestamp is taken once and truncated to whole seconds, so both
# rows carry the same last_updated_time. This replaces a per-row
# strftime/strptime round trip (with a day/month-swapped "%Y-%d-%m" pattern)
# whose only effect was to drop microseconds.
upsert_time = datetime.now().replace(microsecond=0)

InputData = [
    (1, 'Prasad S Nadig', 30, 'NJ', '2022-01-01', upsert_time),  # Update
    (7, 'Compound', 20, 'NJ', '2022-01-07', upsert_time),  # Insert
]

# Schema of the upsert records: the same six nullable columns used for the
# initial load, repeated here so this cell can be read on its own.
schema = StructType([
    StructField(name="cust_id", dataType=IntegerType(), nullable=True),
    StructField(name="cust_name", dataType=StringType(), nullable=True),
    StructField(name="cust_age", dataType=IntegerType(), nullable=True),
    StructField(name="cust_loc", dataType=StringType(), nullable=True),
    StructField(name="create_date", dataType=StringType(), nullable=True),
    StructField(name="last_updated_time", dataType=TimestampType(), nullable=True),
])

# Build the Spark DataFrame holding the upsert batch.
updateDF = spark.createDataFrame(InputData, schema)

In [8]:
# Apply the update/insert batch to the Hudi dataset on S3. Compared with the
# initial load, the write operation is "upsert" instead of "insert" and the
# save mode is "append" instead of "overwrite".
(
    updateDF.write
    .format('org.apache.hudi')
    .option('hoodie.datasource.write.operation', 'upsert')
    .options(**hudiOptions)
    .mode('append')
    .save('s3://emr-studio-emr-on-eks/hudi-tables/')
)

In [9]:
# Re-read the Hudi dataset to verify the upsert.
# Expect cust_name and cust_age to be updated for cust_id=1, and a new record
# with cust_id=7 to be present.
# The last_updated_time of cust_id=1 should have changed as well.
snapshotQueryDF = (
    spark.read
    .format('org.apache.hudi')
    .load('s3://emr-studio-emr-on-eks/hudi-tables' + '/*/*')
)

(
    snapshotQueryDF
    .select(
        "cust_id",
        "cust_name",
        "cust_age",
        "cust_loc",
        "create_date",
        "last_updated_time",
    )
    .orderBy("cust_id")
    .show()
)

+-------+--------------+--------+--------+-----------+-------------------+
|cust_id|     cust_name|cust_age|cust_loc|create_date|  last_updated_time|
+-------+--------------+--------+--------+-----------+-------------------+
|      1|Prasad S Nadig|      30|      NJ| 2022-01-01|2022-11-10 15:34:46|
|      2|      Ethereum|      80|      NY| 2022-01-02|2022-11-10 12:42:11|
|      3|        Cosmos|      25|      PA| 2022-01-03|2022-11-10 12:42:11|
|      4|        Solana|      55|      MD| 2022-01-04|2022-11-10 12:42:11|
|      5|       Carnado|      15|      TX| 2022-01-05|2022-11-10 12:42:11|
|      6|          Link|      45|      NJ| 2022-01-06|2022-11-10 12:42:11|
|      7|      Compound|      20|      NJ| 2022-01-07|2022-11-10 15:34:46|
+-------+--------------+--------+--------+-----------+-------------------+



## DELETE

In [10]:
# Hudi allows records to be deleted, much like a traditional RDBMS.
# Select the record to delete (cust_id 6) from the latest snapshot.
deleteDF = snapshotQueryDF.filter("cust_id==6")

In [11]:
# Apply the delete by writing the selected records back with the
# EmptyHoodieRecordPayload payload class, appending to the existing dataset.
(
    deleteDF.write
    .format('org.apache.hudi')
    .option("hoodie.datasource.write.payload.class", "org.apache.hudi.common.model.EmptyHoodieRecordPayload")
    .options(**hudiOptions)
    .mode('append')
    .save('s3://emr-studio-emr-on-eks/hudi-tables/')
)

In [1]:
# Re-read the dataset: cust_id=6 should no longer appear in the result.
deleteReadDF = (
    spark.read
    .format('org.apache.hudi')
    .load('s3://emr-studio-emr-on-eks/hudi-tables' + '/*/*')
)

(
    deleteReadDF
    .select(
        "cust_id",
        "cust_name",
        "cust_age",
        "cust_loc",
        "create_date",
        "last_updated_time",
    )
    .orderBy("cust_id")
    .show()
)


+-------+--------------+--------+--------+-----------+-------------------+
|cust_id|     cust_name|cust_age|cust_loc|create_date|  last_updated_time|
+-------+--------------+--------+--------+-----------+-------------------+
|      1|Prasad S Nadig|      30|      NJ| 2022-01-01|2022-11-10 15:34:46|
|      2|      Ethereum|      80|      NY| 2022-01-02|2022-11-10 12:42:11|
|      3|        Cosmos|      25|      PA| 2022-01-03|2022-11-10 12:42:11|
|      4|        Solana|      55|      MD| 2022-01-04|2022-11-10 12:42:11|
|      5|       Carnado|      15|      TX| 2022-01-05|2022-11-10 12:42:11|
|      7|      Compound|      20|      NJ| 2022-01-07|2022-11-10 15:34:46|
+-------+--------------+--------+--------+-----------+-------------------+

