In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Source dataset settings for this load.
# NOTE(review): file_type is not referenced by any cell visible in this notebook, and the
# reader cell loads file_location with format 'delta' — confirm which format is intended.
file_type = "csv"
file_location = "/FileStore/tables/customers_refined.csv"

In [0]:
%sql
-- Make sure the `sales` database exists before any table is created in it.
CREATE DATABASE IF NOT EXISTS sales;

In [0]:
%sql
-- Customer loyalty history table, stored as Delta at an explicit path.
-- NOTE(review): CREATE OR REPLACE redefines the table each time this cell runs;
-- presumably that discards previously loaded rows — confirm this is intended.
CREATE OR REPLACE TABLE sales.customer_loyalty_hist_details (
    row_id             INT,     -- sequential id assigned by the load cells below
    customer_id        STRING,  -- key used by the MERGE condition
    customer_name      STRING,
    region             STRING,
    province           STRING,
    customer_since     INT,
    total_years        INT,
    total_orders_count INT,
    loyalty_points     INT,
    loyalty_type       STRING,
    start_date         DATE,    -- set to the load date by the load cells below
    end_date           DATE     -- set to 9999-12-31 by the load cells below
)
USING DELTA
LOCATION '/FileStore/tables/delta-table-merge/customer_loyalty_hist_details';

In [0]:
%sql
-- Remove all rows from the history table so the load starts from an empty target.
TRUNCATE TABLE sales.customer_loyalty_hist_details;

In [0]:
from delta.tables import *

# DeltaTable handle on the same path that sales.customer_loyalty_hist_details uses as
# its LOCATION; the MERGE cell later writes through this handle.
customer_loyalty_details_hist_instance = DeltaTable.forPath(
    spark,
    "/FileStore/tables/delta-table-merge/customer_loyalty_hist_details",
)
# Confirm that a DeltaTable object was obtained.
print(type(customer_loyalty_details_hist_instance))

<class 'delta.tables.DeltaTable'>


In [0]:
# Show up to three rows of the target table without truncating cell values.
customer_loyalty_details_hist_instance.toDF().show(n=3, truncate=False)

+------+-----------+-------------+------+--------+--------------+-----------+------------------+--------------+------------+----------+--------+
|row_id|customer_id|customer_name|region|province|customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|start_date|end_date|
+------+-----------+-------------+------+--------+--------------+-----------+------------------+--------------+------------+----------+--------+
+------+-----------+-------------+------+--------+--------------+-----------+------------------+--------------+------------+----------+--------+



In [0]:
# Read the source dataset at file_location using the Delta reader.
# NOTE(review): file_location ends in ".csv" but is loaded as 'delta' — presumably the
# path is a Delta directory despite its name; confirm.
df = spark.read.load(file_location, format="delta")

In [0]:
# Number each customer's rows from the newest order_date to the oldest (1 = newest).
# NOTE(review): rows of one customer that share the same order_date are numbered in an
# unspecified relative order — confirm that is acceptable.
df = df.withColumn(
    "row_number",
    row_number().over(
        Window.partitionBy("customer_id").orderBy(col("order_date").desc())
    ),
)

In [0]:
# Keep only the row numbered 1 for each customer, then discard the helper column.
df = df.where(col("row_number") == 1).drop("row_number")

In [0]:
# Show up to five of the remaining rows without truncating cell values.
df.show(n=5, truncate=False)

+------+--------+-------------------+--------------+--------------+-------+--------+--------------+--------+----------+-------------+----------------+----------------+------+----------------+----------------+--------------------+-----------------------------------------+-----------------+----------+--------------+------------+----------------------------------------------------------------+------------------+--------------+-----------+
|row_id|order_id|order_date         |order_priority|order_quantity|sales  |discount|ship_mode     |profit  |unit_price|shipping_cost|customer_name   |province        |region|customer_segment|product_category|product_sub_category|product_name                             |product_container|ship_date |loyalty_points|loyalty_type|customer_id                                                     |total_orders_count|customer_since|total_years|
+------+--------+-------------------+--------------+--------------+-------+--------+--------------+--------+----------+-

In [0]:
# Project the customer and loyalty attributes that go into the history table;
# row_id, start_date and end_date are added by later cells.
customers_loyalty_hist_df = df.select(
    [
        "customer_id",
        "customer_name",
        "region",
        "province",
        "customer_since",
        "total_years",
        "total_orders_count",
        "loyalty_points",
        "loyalty_type",
    ]
)

In [0]:
# Stamp every record with its validity window: start_date is the current date and
# end_date is the far-future value 9999-12-31 (presumably the "still current" marker).
# end_date is cast to DATE explicitly so the column type matches the DATE column of
# sales.customer_loyalty_hist_details instead of relying on an implicit
# string-to-date conversion when the rows are merged into the table.
customers_loyalty_hist_df = (
    customers_loyalty_hist_df
    .withColumn("start_date", current_date())
    .withColumn("end_date", lit("9999-12-31").cast("date"))
)

In [0]:
# Continue numbering after the highest row_id already stored in the target table so
# newly assigned ids never collide with existing ones. The aggregate yields None when
# the table is empty, in which case numbering starts from 0 (first new id is 1).
max_row_id = (
    customer_loyalty_details_hist_instance.toDF().agg({"row_id": "max"}).first()[0]
    or 0
)

# Add incremental row_id to new records, ordered by customer_id.
# NOTE: a window with orderBy but no partitionBy makes Spark process all rows in a
# single partition; acceptable for a small customer dimension, costly for large inputs.
customers_loyalty_hist_df = customers_loyalty_hist_df.withColumn(
    "row_id",
    row_number().over(Window.orderBy("customer_id")) + max_row_id,
)

In [0]:
# Sanity check: list every customer_id that occurs more than once in the source frame.
# An empty result means there is exactly one source row per customer.
(
    customers_loyalty_hist_df
    .groupBy("customer_id")
    .count()
    .where(col("count") > 1)
    .show()
)

+-----------+-----+
|customer_id|count|
+-----------+-----+
+-----------+-----+



In [0]:
# Insert-only MERGE into the history table: source rows whose customer_id has no match
# in the target are inserted with all columns; no action is defined for matched rows.
(
    customer_loyalty_details_hist_instance.alias("Target")
    .merge(
        customers_loyalty_hist_df.alias("Source"),
        "Target.customer_id == Source.customer_id",
    )
    .whenNotMatchedInsertAll()
    .execute()
)

In [0]:
%sql
-- Preview the five rows with the lowest row_id after the load.
SELECT *
FROM sales.customer_loyalty_hist_details
ORDER BY row_id
LIMIT 5

row_id,customer_id,customer_name,region,province,customer_since,total_years,total_orders_count,loyalty_points,loyalty_type,start_date,end_date
1,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,Nick Crebassa,Prarie,Saskachewan,2009,15,432,650,Classic,2024-05-26,9999-12-31
2,00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9,Barry Pond,Yukon,Yukon,2010,14,227,350,Classic,2024-05-26,9999-12-31
3,0105946ed08fb4ccf5f3efcec88a83121bb5804c698b14f2ad20cf7dd2541f7f,Giulietta Weimer,West,British Columbia,2009,15,492,800,Classic,2024-05-26,9999-12-31
4,018fc79fefdcb46768279300fb6e32f4e1e9f627ae99e3a672244d3052462e9b,Ashley Jarboe,Yukon,Yukon,2009,15,495,950,Classic,2024-05-26,9999-12-31
5,01a041b0a5fbe79fb524be166ed14fc9f58054716ce63e9db05a67d35f767527,Christy Brittain,Yukon,Yukon,2009,15,604,1500,Daimond,2024-05-26,9999-12-31


In [0]:
# Run the five-row preview query through spark.sql and print it untruncated.
spark.sql(
    '''select * from sales.customer_loyalty_hist_details order by row_id limit 5'''
).show(n=5, truncate=False)

+------+----------------------------------------------------------------+----------------+------+----------------+--------------+-----------+------------------+--------------+------------+----------+----------+
|row_id|customer_id                                                     |customer_name   |region|province        |customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|start_date|end_date  |
+------+----------------------------------------------------------------+----------------+------+----------------+--------------+-----------+------------------+--------------+------------+----------+----------+
|1     |001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa   |Prarie|Saskachewan     |2009          |15         |432               |650           |Classic     |2024-05-26|9999-12-31|
|2     |00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9|Barry Pond      |Yukon |Yukon           |2010          |14         |227            