# Hudi Demo with DML

In [None]:
%%configure -f
{
    "conf":  { 
             "spark.jars":"hdfs:///user/hadoop/aws-java-sdk-bundle-1.12.31.jar,hdfs:///user/hadoop/httpcore-4.4.11.jar,hdfs:///user/hadoop/httpclient-4.5.9.jar,hdfs:///user/hadoop/hudi-spark-bundle.jar,hdfs:///user/hadoop/spark-avro.jar",
             "spark.sql.hive.convertMetastoreParquet":"false", 
             "spark.serializer":"org.apache.spark.serializer.KryoSerializer",
             "spark.sql.extensions":"org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
           } 
}

### Create Tables

In [None]:
%%sql

-- Create the Hudi table amazon_customer_review_hudi.
-- Its schema mirrors a selected set of columns from the Amazon customer
-- reviews table, plus a timestamp column and the year/month/day columns
-- that the table is partitioned by.
--
-- Table options:
--   type            = 'cow'       (the COW table type referred to in the merge notes)
--   primaryKey      = 'review_id' (the key the later MERGE joins on)
--   preCombineField = 'timestamp'
--   hive-style partitioning is switched on
--
-- NOTE(review): the S3 location contains the placeholder "youraccountID";
-- presumably it must be replaced with your own value before running.
CREATE TABLE IF NOT EXISTS amazon_customer_review_hudi (
    marketplace   STRING,
    review_id     STRING,
    customer_id   STRING,
    product_title STRING,
    star_rating   INT,
    timestamp     LONG,
    review_date   DATE,
    year          STRING,
    month         STRING,
    day           STRING
)
USING hudi
LOCATION 's3://mrworkshop-youraccountID-dayone/my-hudi-dataset/'
OPTIONS (
    type = 'cow',
    primaryKey = 'review_id',
    preCombineField = 'timestamp',
    hoodie.datasource.write.hive_style_partitioning = 'true'
)
PARTITIONED BY (year, month, day);

In [None]:
%%sql

-- Create amazon_customer_review_parquet_merge_source, the Parquet table
-- used as the source side when merging into amazon_customer_review_hudi.
-- The deleteRecord column tracks whether the matching record needs to be
-- *deleted* by the merge.
--
-- NOTE(review): the S3 location contains the placeholder "youraccountID";
-- presumably it must be replaced with your own value before running.
CREATE TABLE IF NOT EXISTS amazon_customer_review_parquet_merge_source (
    marketplace   STRING,
    review_id     STRING,
    customer_id   STRING,
    product_title STRING,
    star_rating   INT,
    review_date   DATE,
    deleteRecord  STRING
)
STORED AS PARQUET
LOCATION 's3://mrworkshop-youraccountID-dayone/toBeMergeData/'

### Insert Data into Hudi tables

In [None]:
%%sql

-- Stage the *delete* case in amazon_customer_review_parquet_merge_source.
-- deleteRecord is set to 'yes', so the record with review_id '11' will be
-- deleted from amazon_customer_review_hudi by the merge.
--
-- Values are positional and follow the table definition:
-- marketplace, review_id, customer_id, product_title, star_rating,
-- review_date, deleteRecord.
INSERT INTO amazon_customer_review_parquet_merge_source
SELECT
    'italy',
    '11',
    '1111',
    'table',
    5,
    TO_DATE(CAST(UNIX_TIMESTAMP('2015/05/02', 'yyyy/MM/dd') AS TIMESTAMP)) AS review_date,
    'yes'

In [None]:
%%sql

-- Stage the *update* case in amazon_customer_review_parquet_merge_source.
-- deleteRecord is 'no', so the merge updates review_id '22' in
-- amazon_customer_review_hudi with the star rating and product_title
-- given here.
--
-- Values are positional and follow the table definition:
-- marketplace, review_id, customer_id, product_title, star_rating,
-- review_date, deleteRecord.
INSERT INTO amazon_customer_review_parquet_merge_source
SELECT
    'spain',
    '22',
    '2222',
    'Relaxing chair',
    4,
    TO_DATE(CAST(UNIX_TIMESTAMP('2015/05/02', 'yyyy/MM/dd') AS TIMESTAMP)) AS review_date,
    'no'

In [None]:
%%sql

-- Stage the *insert* case in amazon_customer_review_parquet_merge_source.
-- No record with review_id '33' is seeded into amazon_customer_review_hudi
-- by this notebook, so the merge inserts this record as a new row.
--
-- Values are positional and follow the table definition:
-- marketplace, review_id, customer_id, product_title, star_rating,
-- review_date, deleteRecord.
INSERT INTO amazon_customer_review_parquet_merge_source
SELECT
    'uk',
    '33',
    '3333',
    'hanger',
    3,
    TO_DATE(CAST(UNIX_TIMESTAMP('2015/05/02', 'yyyy/MM/dd') AS TIMESTAMP)) AS review_date,
    'no'

In [None]:
%%sql

-- Display every staged row of the merge source table.
SELECT *
FROM amazon_customer_review_parquet_merge_source

Let's insert records into our Hudi table.

In [None]:
%%sql

-- Seed amazon_customer_review_hudi with the record that the merge will
-- *delete*: review_id '11' is the id staged with deleteRecord = 'yes' in
-- the merge source table.
--
-- The timestamp column takes the current time in epoch seconds; the
-- year, month and day partition values are formatted from the review date.
--
-- Spark SQL date time functions https://spark.apache.org/docs/latest/api/sql/index.html#date_add
INSERT INTO amazon_customer_review_hudi
SELECT
    'italy',
    '11',
    '1111',
    'table',
    5,
    UNIX_TIMESTAMP(CURRENT_TIMESTAMP()) AS timestamp,
    TO_DATE(CAST(UNIX_TIMESTAMP('2015/05/02', 'yyyy/MM/dd') AS TIMESTAMP)) AS review_date,
    DATE_FORMAT(DATE '2015-05-02', "yyyy") AS year,
    DATE_FORMAT(DATE '2015-05-02', "MM") AS month,
    DATE_FORMAT(DATE '2015-05-02', "dd") AS day

In [None]:
%%sql

-- Seed amazon_customer_review_hudi with the record that the merge will
-- *update*: review_id '22' is staged in the merge source table with a
-- different star rating and product_title.
--
-- The timestamp column takes the current time in epoch seconds; the
-- year, month and day partition values are formatted from the review date.
INSERT INTO amazon_customer_review_hudi
SELECT
    'spain',
    '22',
    '2222',
    'chair ',
    5,
    UNIX_TIMESTAMP(CURRENT_TIMESTAMP()) AS timestamp,
    TO_DATE(CAST(UNIX_TIMESTAMP('2015/05/02', 'yyyy/MM/dd') AS TIMESTAMP)) AS review_date,
    DATE_FORMAT(DATE '2015-05-02', "yyyy") AS year,
    DATE_FORMAT(DATE '2015-05-02', "MM") AS month,
    DATE_FORMAT(DATE '2015-05-02', "dd") AS day

In [None]:
%%sql

-- Display the Hudi table contents before the merge is run.
SELECT *
FROM amazon_customer_review_hudi

### Merge into 

Considerations:

* The merge condition can only be expressed on the table's primary key fields, for example:

```
on target.review_id = source.review_id 
```

* Partial updates are supported for Copy-On-Write (COW) tables but not for Merge-On-Read (MOR) tables.
* For Merge-On-Read tables, the target table's fields *cannot* appear on the right-hand side of an update expression. For example, the following update results in an error because a target column is present on the right-hand side of the expression:

```
update set target.star_rating =  target.star_rating +1 
```



In [None]:
%%sql 

/*************************************
MergeInto : Merge Source Into Target

Rows are matched on review_id and handled as follows:
  - matched and deleteRecord != 'yes' : update timestamp, star_rating
                                        and product_title on the target row
  - matched and deleteRecord = 'yes'  : delete the target row
  - not matched                       : insert the source row

The source subquery derives year, month and day from review_date so that
inserted rows carry values for the partition columns of the target table.
The timestamp column is set to the current time in epoch seconds on both
update and insert.
**************************************/

-- Source amazon_customer_review_parquet_merge_source 
-- Target amazon_customer_review_hudi

merge into amazon_customer_review_hudi as target
using ( 
        select
        marketplace, 
        review_id, 
        customer_id,
        product_title,
        star_rating,
        review_date,
        deleteRecord,
        date_format(review_date, "yyyy") as year,
        date_format(review_date, "MM") as month,
        date_format(review_date, "dd") as day
        from amazon_customer_review_parquet_merge_source ) source
on target.review_id = source.review_id 
when matched and deleteRecord != 'yes' then 

update set target.timestamp = unix_timestamp(current_timestamp()),  
target.star_rating = source.star_rating, 
target.product_title = source.product_title

when matched and deleteRecord = 'yes' then delete

when not matched then insert 
      ( target.marketplace, 
        target.review_id, 
        target.customer_id,
        target.product_title,
        target.star_rating,
        target.timestamp ,
        target.review_date,
        target.year ,
        target.month  ,
        target.day
      ) 
      values
      (
        source.marketplace,
        source.review_id, 
        source.customer_id,
        source.product_title,
        source.star_rating,
        unix_timestamp(current_timestamp()),
        source.review_date,
        source.year , 
        source.month ,
        source.day 
       )

In [None]:
%%sql

-- Display the Hudi table contents after the merge.
SELECT *
FROM amazon_customer_review_hudi

### Schema Evolution

Hudi supports common schema evolution (https://hudi.apache.org/docs/0.9.0/schema_evolution) scenarios, such as adding a nullable field or promoting the datatype of a field, out of the box. Let's add a new column *ssid* (type int) to the amazon_customer_review_hudi table and then promote its type from int to long.

In [None]:
%%sql

-- Schema evolution, step 1: add a new column named ssid of type int to
-- the amazon_customer_review_hudi table.
ALTER TABLE amazon_customer_review_hudi
    ADD COLUMNS (ssid int)

In [None]:
%%sql

-- Display the Hudi table after the ssid column has been added.
SELECT *
FROM amazon_customer_review_hudi

In [None]:
%%sql

-- Schema evolution, step 2: change the type of the ssid column in the
-- amazon_customer_review_hudi table from int to long. The column name
-- is repeated because it stays the same.
ALTER TABLE amazon_customer_review_hudi
    CHANGE COLUMN ssid ssid long;

In [None]:
%%sql

-- Display the Hudi table after the ssid column type has been changed.
SELECT *
FROM amazon_customer_review_hudi