In [0]:
file_location = "/FileStore/tables/customers_refined.csv"
file_type = "csv"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
df = spark.read.format('delta') \
  .load(file_location)

In [0]:
df.show(5,truncate=False)

+------+--------+-------------------+--------------+--------------+--------+--------+-----------+-------+----------+-------------+-------------+-----------+------+----------------+----------------+-----------------------------+-------------------------------------------------------+-----------------+----------+--------------+------------+----------------------------------------------------------------+------------------+--------------+-----------+
|row_id|order_id|order_date         |order_priority|order_quantity|sales   |discount|ship_mode  |profit |unit_price|shipping_cost|customer_name|province   |region|customer_segment|product_category|product_sub_category         |product_name                                           |product_container|ship_date |loyalty_points|loyalty_type|customer_id                                                     |total_orders_count|customer_since|total_years|
+------+--------+-------------------+--------------+--------------+--------+--------+-----------

In [0]:
df.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_priority: string (nullable = true)
 |-- order_quantity: long (nullable = true)
 |-- sales: float (nullable = true)
 |-- discount: float (nullable = true)
 |-- ship_mode: string (nullable = true)
 |-- profit: float (nullable = true)
 |-- unit_price: float (nullable = true)
 |-- shipping_cost: float (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)
 |-- ship_date: date (nullable = true)
 |-- loyalty_points: long (nullable = true)
 |-- loyalty_type: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- total_or

In [0]:
customers_loyalty_df = df.select('customer_id', 'customer_name', 'customer_since', 'total_years', 'total_orders_count','loyalty_points','loyalty_type','order_date')

In [0]:
customers_loyalty_df.show(5,truncate=False)

+----------------------------------------------------------------+-------------+--------------+-----------+------------------+--------------+------------+-------------------+
|customer_id                                                     |customer_name|customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|order_date         |
+----------------------------------------------------------------+-------------+--------------+-----------+------------------+--------------+------------+-------------------+
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|2009          |15         |432               |650           |Classic     |2012-09-30 00:00:00|
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|2009          |15         |432               |650           |Classic     |2012-02-24 00:00:00|
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|2009          |15         |432               

In [0]:
customers_loyalty_df.count()

Out[8]: 8399

In [0]:
customers_loyalty_df.show(5,truncate=False)

+----------------------------------------------------------------+-------------+--------------+-----------+------------------+--------------+------------+-------------------+
|customer_id                                                     |customer_name|customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|order_date         |
+----------------------------------------------------------------+-------------+--------------+-----------+------------------+--------------+------------+-------------------+
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|2009          |15         |432               |650           |Classic     |2012-09-30 00:00:00|
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|2009          |15         |432               |650           |Classic     |2012-02-24 00:00:00|
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|2009          |15         |432               

In [0]:
customers_loyalty_df.groupBy("customer_id").count().filter("count>1").show()

+--------------------+-----+
|         customer_id|count|
+--------------------+-----+
|0105946ed08fb4ccf...|   16|
|487e1c84ea398291f...|    5|
|dfff7e512bc9808c8...|   14|
|360d850c3d7945f54...|    2|
|3c2220b146abbc800...|    5|
|71171fca026b1e5ed...|   10|
|751f994a3d3b5b290...|    6|
|88e780296873c748a...|   10|
|b0b4faa3da57c2038...|    3|
|beacc4d65042807d4...|    9|
|d39dd257708e4e664...|    3|
|5aff1f6aac0864967...|    8|
|6992e80e67de3e022...|   20|
|7fea1e195dc74b43b...|   21|
|94fa84a56e1b77cf6...|    2|
|dd70309d73910b98d...|    7|
|ff79734b8b635c06e...|    9|
|1193e8010b51cd44c...|    3|
|075609fdf9c07ddef...|   12|
|41e6dec82a3754747...|   17|
+--------------------+-----+
only showing top 20 rows



In [0]:
# Need latest loyalty details against each customer not the history of loyalty assigned
customers_loyalty_df = customers_loyalty_df.withColumn("row_number", row_number().over(
    Window.partitionBy("customer_id").orderBy(desc("order_date"))
    )
                                   )

In [0]:
latest_customers_df = customers_loyalty_df.filter("row_number = 1").drop("row_number","order_date")

In [0]:
latest_customers_df.filter("customer_id = '07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a' ").show(truncate=False)

+----------------------------------------------------------------+-------------+--------------+-----------+------------------+--------------+------------+
|customer_id                                                     |customer_name|customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|
+----------------------------------------------------------------+-------------+--------------+-----------+------------------+--------------+------------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp    |2009          |15         |408               |700           |Classic     |
+----------------------------------------------------------------+-------------+--------------+-----------+------------------+--------------+------------+



In [0]:
latest_customers_df.count()

Out[14]: 795

In [0]:
%sql
CREATE DATABASE IF NOT EXISTS sales;

In [0]:
%sql
CREATE OR REPLACE TABLE sales.customer_loyalty_details
(
    customer_id STRING,
    customer_name STRING,
    customer_since INT,
    total_years INT,
    total_orders_count INT,
    loyalty_points INT,
    loyalty_type STRING
)
USING DELTA
LOCATION '/FileStore/tables/delta-table-merge/customer_loyalty_details';

In [0]:
%sql
select * from sales.customer_loyalty_details

customer_id,customer_name,customer_since,total_years,total_orders_count,loyalty_points,loyalty_type
001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,Nick Crebassa,2009,15,432,650,Classic
00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9,Barry Pond,2010,14,227,350,Classic
0105946ed08fb4ccf5f3efcec88a83121bb5804c698b14f2ad20cf7dd2541f7f,Giulietta Weimer,2009,15,492,800,Classic
018fc79fefdcb46768279300fb6e32f4e1e9f627ae99e3a672244d3052462e9b,Ashley Jarboe,2009,15,495,950,Classic
01a041b0a5fbe79fb524be166ed14fc9f58054716ce63e9db05a67d35f767527,Christy Brittain,2009,15,604,1500,Daimond
02c48608c84d4e55fff763c5fccac13bc5ceca8b7c7932cb8a826644840fc504,Bradley Talbott,2010,14,92,200,Classic
033169eeb2aa318446ae8e8c8095650da8de9c0a48d3ea967e1b30d8102a6751,Chris Selesnick,2009,15,153,300,Classic
0369ac17df128032c356845c2affa7514fec536015f9a12cd48e6adfc3067395,Yana Sorensen,2009,15,153,300,Classic
03cb105570054c13f9cc768fa1462134f4ddbc8077501f712e3be987506ac10a,Erica Smith,2009,15,375,750,Classic
03de3b10f1d1797fa42d5372db545dadd5aa777b823885979bdd09fa20e9660d,Jane Waco,2010,14,217,350,Classic


In [0]:
spark.sql(
'''
select * from sales.customer_loyalty_details
'''
).show(truncate=False)

+----------------------------------------------------------------+------------------+--------------+-----------+------------------+--------------+------------+
|customer_id                                                     |customer_name     |customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|
+----------------------------------------------------------------+------------------+--------------+-----------+------------------+--------------+------------+
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa     |2009          |15         |432               |650           |Classic     |
|00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9|Barry Pond        |2010          |14         |227               |350           |Classic     |
|0105946ed08fb4ccf5f3efcec88a83121bb5804c698b14f2ad20cf7dd2541f7f|Giulietta Weimer  |2009          |15         |492               |800           |Classic     |
|018fc79fefdcb46768279300fb6e32f4e1e9f62

In [0]:
# creating an instance of delta table
from delta.tables import *

customer_loyalty_details_instance = DeltaTable.forPath(spark, "/FileStore/tables/delta-table-merge/customer_loyalty_details")
print(type(customer_loyalty_details_instance))

<class 'delta.tables.DeltaTable'>


In [0]:
customer_loyalty_details_instance.alias("Target") \
    .merge(
        source=latest_customers_df.alias("Source"),
        condition="Target.customer_id == Source.customer_id"
    ) \
    .whenMatchedUpdate(
        set={
            "customer_name": "Source.customer_name",
            "customer_since": "Source.customer_since",
            "total_years": "Source.total_years",
            "total_orders_count": "Source.total_orders_count",
             "loyalty_points": "Source.loyalty_points",
            "loyalty_type": "Source.loyalty_type"
        }
    ) \
    .whenNotMatchedInsert(
        values={
            "customer_id": "Source.customer_id",
             "customer_name": "Source.customer_name",
            "customer_since": "Source.customer_since",
            "total_years": "Source.total_years",
            "total_orders_count": "Source.total_orders_count",
             "loyalty_points": "Source.loyalty_points",
            "loyalty_type": "Source.loyalty_type"
        }
    ) \
    .execute()


In [0]:
%sql
select * from sales.customer_loyalty_details limit 5

customer_id,customer_name,customer_since,total_years,total_orders_count,loyalty_points,loyalty_type
001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,Nick Crebassa,2009,15,432,650,Classic
00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9,Barry Pond,2010,14,227,350,Classic
0105946ed08fb4ccf5f3efcec88a83121bb5804c698b14f2ad20cf7dd2541f7f,Giulietta Weimer,2009,15,492,800,Classic
018fc79fefdcb46768279300fb6e32f4e1e9f627ae99e3a672244d3052462e9b,Ashley Jarboe,2009,15,495,950,Classic
01a041b0a5fbe79fb524be166ed14fc9f58054716ce63e9db05a67d35f767527,Christy Brittain,2009,15,604,1500,Daimond


In [0]:
spark.sql(
'''
select * from sales.customer_loyalty_details limit 5
'''
).show(truncate=False)

+----------------------------------------------------------------+----------------+--------------+-----------+------------------+--------------+------------+
|customer_id                                                     |customer_name   |customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|
+----------------------------------------------------------------+----------------+--------------+-----------+------------------+--------------+------------+
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa   |2009          |15         |432               |650           |Classic     |
|00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9|Barry Pond      |2010          |14         |227               |350           |Classic     |
|0105946ed08fb4ccf5f3efcec88a83121bb5804c698b14f2ad20cf7dd2541f7f|Giulietta Weimer|2009          |15         |492               |800           |Classic     |
|018fc79fefdcb46768279300fb6e32f4e1e9f627ae99e3a6722

In [0]:
customers_loyalty_df.filter("customer_id = '1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e' ").show(truncate=False)

+-----------+-------------+--------------+-----------+------------------+--------------+------------+----------+----------+
|customer_id|customer_name|customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|order_date|row_number|
+-----------+-------------+--------------+-----------+------------------+--------------+------------+----------+----------+
+-----------+-------------+--------------+-----------+------------------+--------------+------------+----------+----------+



In [0]:
new_customer_data = [
    ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a', "Dave Kipp", 2009, 15,  600, 1100, 'Platinum', '2009-10-11' ),
    ('1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e', "Charles McCrossin",  2010 , 14,  100, 200, 'Gold','2009-07-11'),
    ('1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e', "Andrew Tazas",  2022, 2 , 1, 30, 'Classic','2009-10-13'),
    ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a', "Dave Kipp", 2009, 15,  700, 1100, 'Platinum','2010-11-11' ),
]

# Create DataFrame for new customer data
new_customer_df = spark.createDataFrame(new_customer_data, ['customer_id', 'customer_name', 'customer_since', 'total_years', 'total_orders_count','loyalty_points','loyalty_type','order_date' ])
new_customer_df = new_customer_df.withColumn("order_date",to_date("order_date"))

In [0]:
new_customer_df.show(truncate=False)

+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+----------+
|customer_id                                                     |customer_name    |customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|order_date|
+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+----------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp        |2009          |15         |600               |1100          |Platinum    |2009-10-11|
|1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Charles McCrossin|2010          |14         |100               |200           |Gold        |2009-07-11|
|1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Andrew Tazas     |2022          |2          |1                 |30            |Class

In [0]:
new_customer_df = new_customer_df.withColumn("row_number", row_number().over(
    Window.partitionBy("customer_id").orderBy(desc("order_date"))
    )
                                   )
new_customer_df = new_customer_df.filter("row_number == 1").drop("rownumber","order_date")

In [0]:
new_customer_df.show(truncate=False)

+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+----------+
|customer_id                                                     |customer_name    |customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|row_number|
+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+----------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp        |2009          |15         |700               |1100          |Platinum    |1         |
|1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Charles McCrossin|2010          |14         |100               |200           |Gold        |1         |
|1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Andrew Tazas     |2022          |2          |1                 |30            |Class

In [0]:
%sql
select * from sales.customer_loyalty_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')

customer_id,customer_name,customer_since,total_years,total_orders_count,loyalty_points,loyalty_type
07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a,Dave Kipp,2009,15,408,700,Classic
1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e,Charles McCrossin,2010,14,248,350,Classic
1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e,Andrew Tazas,2022,2,1,30,Classic


In [0]:
spark.sql(
    '''
    select * from sales.customer_loyalty_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')
    '''
).show(truncate=False)

+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+
|customer_id                                                     |customer_name    |customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|
+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp        |2009          |15         |408               |700           |Classic     |
|1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Charles McCrossin|2010          |14         |248               |350           |Classic     |
|1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Andrew Tazas     |2022          |2          |1                 |30            |Classic     |
+---------------------------------------------

In [0]:
customer_loyalty_details_instance.alias("Target") \
    .merge(
        source=new_customer_df.alias("Source"),
        condition="Target.customer_id == Source.customer_id"
    ) \
    .whenMatchedUpdate(
        set={
            "customer_name": "Source.customer_name",
            "customer_since": "Source.customer_since",
            "total_years": "Source.total_years",
            "total_orders_count": "Source.total_orders_count",
             "loyalty_points": "Source.loyalty_points",
            "loyalty_type": "Source.loyalty_type"
        }
    ) \
    .whenNotMatchedInsert(
        values={
            "customer_id": "Source.customer_id",
             "customer_name": "Source.customer_name",
            "customer_since": "Source.customer_since",
            "total_years": "Source.total_years",
            "total_orders_count": "Source.total_orders_count",
             "loyalty_points": "Source.loyalty_points",
            "loyalty_type": "Source.loyalty_type"
        }
    ) \
    .execute()

In [0]:
%sql
select * from sales.customer_loyalty_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')

customer_id,customer_name,customer_since,total_years,total_orders_count,loyalty_points,loyalty_type
07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a,Dave Kipp,2009,15,700,1100,Platinum
1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e,Charles McCrossin,2010,14,100,200,Gold
1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e,Andrew Tazas,2022,2,1,30,Classic


In [0]:
spark.sql(
    '''
    select * from sales.customer_loyalty_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')
    '''
).show(truncate=False)

+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+
|customer_id                                                     |customer_name    |customer_since|total_years|total_orders_count|loyalty_points|loyalty_type|
+----------------------------------------------------------------+-----------------+--------------+-----------+------------------+--------------+------------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp        |2009          |15         |700               |1100          |Platinum    |
|1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Charles McCrossin|2010          |14         |100               |200           |Gold        |
|1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Andrew Tazas     |2022          |2          |1                 |30            |Classic     |
+---------------------------------------------