In [45]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

from datetime import datetime


## Create the DataFrame and define its schema

In [46]:
# One load timestamp shared by every row, truncated to whole seconds.
# Taking it once keeps all rows of this batch on the same last_updated_time
# even if the cell happens to run across a second boundary.
loadTimestamp = datetime.now().replace(microsecond=0)

# Sample customer records, in schema column order:
# (cust_id, cust_name, cust_age, cust_loc, create_date, last_updated_time)
InputData = [
    (1, 'Prasad Nadig', 25, 'NJ', '2022-01-01', loadTimestamp),
    (2, 'Ethereum', 80, 'NY', '2022-01-02', loadTimestamp),
    (3, 'Cosmos', 25, 'PA', '2022-01-03', loadTimestamp),
    (4, 'Solana', 55, 'MD', '2022-01-04', loadTimestamp),
    (5, 'Carnado', 15, 'TX', '2022-01-05', loadTimestamp),
    (6, 'Link', 45, 'NJ', '2022-01-06', loadTimestamp),
]

# Schema for the source data: every column is nullable, and create_date is
# kept as a plain string while last_updated_time is a real timestamp.
schema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in [
        ("cust_id", IntegerType()),
        ("cust_name", StringType()),
        ("cust_age", IntegerType()),
        ("cust_loc", StringType()),
        ("create_date", StringType()),
        ("last_updated_time", TimestampType()),
    ]
])

# Build the DataFrame from the in-memory rows using the explicit schema above
inputDF = spark.createDataFrame(InputData, schema=schema)

In [47]:
# Preview the DataFrame: up to 20 rows with long values truncated (the show() defaults)
inputDF.show(n=20, truncate=True)

+-------+------------+--------+--------+-----------+-------------------+
|cust_id|   cust_name|cust_age|cust_loc|create_date|  last_updated_time|
+-------+------------+--------+--------+-----------+-------------------+
|      1|Prasad Nadig|      25|      NJ| 2022-01-01|2022-11-10 12:42:11|
|      2|    Ethereum|      80|      NY| 2022-01-02|2022-11-10 12:42:11|
|      3|      Cosmos|      25|      PA| 2022-01-03|2022-11-10 12:42:11|
|      4|      Solana|      55|      MD| 2022-01-04|2022-11-10 12:42:11|
|      5|     Carnado|      15|      TX| 2022-01-05|2022-11-10 12:42:11|
|      6|        Link|      45|      NJ| 2022-01-06|2022-11-10 12:42:11|
+-------+------------+--------+--------+-----------+-------------------+



## Define the Hudi options and write the data to S3 as a Hudi dataset

In [48]:
# Shared values, declared once so the Hudi write settings and the Hive sync
# settings cannot drift apart.
tableName = 'customer'
partitionField = 'create_date'

# Options passed to the Hudi writer.
hudiOptions = {
    'hoodie.table.name': tableName,
    # Record key, partition path column and precombine column
    'hoodie.datasource.write.recordkey.field': 'cust_id',
    'hoodie.datasource.write.partitionpath.field': partitionField,
    'hoodie.datasource.write.precombine.field': 'last_updated_time',
    # Hive sync in 'hms' mode with JDBC disabled
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.table': tableName,
    # Must name the same column as the write partition path; it was previously
    # 'last_updated_time', which is a regular data column and not the
    # partition column of this table.
    'hoodie.datasource.hive_sync.partition_fields': partitionField,
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
}

In [49]:
# Write the DataFrame to S3 in Hudi format using the 'insert' operation and
# the options in hudiOptions; save mode is 'overwrite'.
(
    inputDF.write
    .format('org.apache.hudi')
    .option('hoodie.datasource.write.operation', 'insert')
    .options(**hudiOptions)
    .mode('overwrite')
    .save('s3://emr-studio-emr-on-eks/hudi-tables/')
)

In [None]:
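# Scratch note: formatting now() and parsing it back with the same format only drops the microseconds.
#datetime.strptime(datetime.now().strftime("%Y-%d-%m %H:%M:%S"), "%Y-%d-%m %H:%M:%S")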