In [0]:
# Storage path of the refined customer/order dataset that is loaded into `df` below.
file_location = "/FileStore/tables/customers_refined.csv"
# NOTE(review): `file_type` is not referenced by any cell shown in this notebook, and the
# loader below reads `file_location` with format('delta'), not "csv" — confirm which is intended.
file_type = "csv"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
# Read the dataset stored at `file_location` using the Delta reader.
df = (
    spark.read
    .format('delta')
    .load(file_location)
)

In [0]:
# Preview the first five rows without truncating long cell values.
df.show(n=5, truncate=False)

+------+--------+-------------------+--------------+--------------+--------+--------+-----------+-------+----------+-------------+-------------+-----------+------+----------------+----------------+-----------------------------+-------------------------------------------------------+-----------------+----------+--------------+------------+----------------------------------------------------------------+------------------+--------------+-----------+
|row_id|order_id|order_date         |order_priority|order_quantity|sales   |discount|ship_mode  |profit |unit_price|shipping_cost|customer_name|province   |region|customer_segment|product_category|product_sub_category         |product_name                                           |product_container|ship_date |loyalty_points|loyalty_type|customer_id                                                     |total_orders_count|customer_since|total_years|
+------+--------+-------------------+--------------+--------------+--------+--------+-----------

In [0]:
# Print the column names and data types of the loaded dataset.
df.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_priority: string (nullable = true)
 |-- order_quantity: long (nullable = true)
 |-- sales: float (nullable = true)
 |-- discount: float (nullable = true)
 |-- ship_mode: string (nullable = true)
 |-- profit: float (nullable = true)
 |-- unit_price: float (nullable = true)
 |-- shipping_cost: float (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)
 |-- ship_date: date (nullable = true)
 |-- loyalty_points: long (nullable = true)
 |-- loyalty_type: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- total_or

In [0]:
# Project the customer attributes; order_date is carried along because a later
# cell ranks each customer's rows by it.
customers_df = df.select(
    'customer_id',
    'customer_name',
    'region',
    'province',
    'customer_since',
    "order_date",
)

In [0]:
# Preview four rows of the customer projection without truncating long values.
customers_df.limit(4).show(truncate=False)

+----------------------------------------------------------------+-------------+------+-----------+--------------+-------------------+
|customer_id                                                     |customer_name|region|province   |customer_since|order_date         |
+----------------------------------------------------------------+-------------+------+-----------+--------------+-------------------+
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|Prarie|Saskachewan|2009          |2012-09-30 00:00:00|
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|Prarie|Saskachewan|2009          |2012-02-24 00:00:00|
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|Prarie|Saskachewan|2009          |2012-02-16 00:00:00|
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa|Prarie|Saskachewan|2009          |2012-02-16 00:00:00|
+------------------------------------------------------

In [0]:
# Number of rows in the customer projection (still one row per source row at this point).
customers_df.count()

Out[42]: 8399

In [0]:
# Show every row of a single customer, newest order first, to see how the
# region/province values differ from one order to the next.
(
    customers_df
    .filter("customer_id = '07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a' ")
    .orderBy(desc("order_date"))
    .show(truncate=False)
)

+----------------------------------------------------------------+-------------+------+----------------+--------------+-------------------+
|customer_id                                                     |customer_name|region|province        |customer_since|order_date         |
+----------------------------------------------------------------+-------------+------+----------------+--------------+-------------------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp    |West  |British Columbia|2009          |2012-10-25 00:00:00|
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp    |Yukon |Yukon           |2009          |2011-10-30 00:00:00|
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp    |West  |British Columbia|2009          |2011-07-02 00:00:00|
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp    |West  |British Columbia|2009          |2011-07-02 00:00:00|
|07c41f7f8da2ed563ce

In [0]:
# To keep latest details: rank each customer's rows so that row_number = 1 is
# the row with the most recent order_date.
#
# order_date alone is not a unique sort key: one customer can have several rows
# on the same date, and row_number() then chooses arbitrarily among the tied
# rows. The remaining attribute columns are therefore added as tie-breakers so
# the row picked as "latest" is the same on every run.
# NOTE(review): order_date is a string column; sorting it chronologically
# relies on the 'yyyy-MM-dd HH:mm:ss' layout seen in the data — confirm.
customers_df = customers_df.withColumn(
    "row_number",
    row_number().over(
        Window.partitionBy("customer_id").orderBy(
            desc("order_date"),
            asc("customer_name"),
            asc("region"),
            asc("province"),
            asc("customer_since"),
        )
    ),
)

In [0]:
# Keep only the top-ranked row of each customer, then discard the two helper
# columns that were needed just for the ranking.
latest_customers_df = (
    customers_df
    .filter("row_number = 1")
    .drop("row_number", "order_date")
)

In [0]:
# Verify that the customer inspected earlier is now reduced to a single row.
(
    latest_customers_df
    .filter("customer_id = '07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a' ")
    .show(truncate=False)
)

+----------------------------------------------------------------+-------------+------+----------------+--------------+
|customer_id                                                     |customer_name|region|province        |customer_since|
+----------------------------------------------------------------+-------------+------+----------------+--------------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp    |West  |British Columbia|2009          |
+----------------------------------------------------------------+-------------+------+----------------+--------------+



In [0]:
# Number of rows left after keeping one row per customer_id.
latest_customers_df.count()

Out[47]: 795

In [0]:
%sql
-- Create the `sales` database unless it already exists, so this cell can be re-run safely.
CREATE DATABASE IF NOT EXISTS sales;

In [0]:
%sql
-- Target table of the merges below: one row of customer attributes per customer_id.
-- CREATE OR REPLACE redefines the table each time this cell is run.
CREATE OR REPLACE TABLE sales.customer_details
(
    customer_id STRING,
    customer_name STRING,
    region STRING,
    province STRING,
    customer_since INT
)
USING DELTA
-- Explicit storage path; the DeltaTable.forPath call later in the notebook uses this same path.
LOCATION '/FileStore/tables/delta-table-merge/customer_details';

In [0]:
%sql
-- Inspect the table right after creation (the captured output shows only the header row).
select * from sales.customer_details

customer_id,customer_name,region,province,customer_since


In [0]:
# Create a DeltaTable handle bound to the storage path that was declared as the
# LOCATION of sales.customer_details, so the table can be modified through the
# Delta Python API (used by the merge cells below).
# DeltaTable is imported by name instead of via a wildcard so it is clear where
# the name comes from and the notebook namespace is not filled with unused names.
from delta.tables import DeltaTable

customer_details_instance = DeltaTable.forPath(spark, "/FileStore/tables/delta-table-merge/customer_details")
# Sanity check: the printed type should be delta.tables.DeltaTable.
print(type(customer_details_instance))

<class 'delta.tables.DeltaTable'>


In [0]:
# Upsert the deduplicated customers into the target table, matching on customer_id:
#   - a customer_id already present in the target gets its attributes overwritten;
#   - a customer_id not yet present is inserted as a new row.
(
    customer_details_instance.alias("Target")
    .merge(
        source=latest_customers_df.alias("Source"),
        condition="Target.customer_id == Source.customer_id",
    )
    .whenMatchedUpdate(
        set={
            "customer_name": "Source.customer_name",
            "province": "Source.province",
            "region": "Source.region",
            "customer_since": "Source.customer_since",
        },
    )
    .whenNotMatchedInsert(
        values={
            "customer_id": "Source.customer_id",
            "customer_name": "Source.customer_name",
            "province": "Source.province",
            "region": "Source.region",
            "customer_since": "Source.customer_since",
        },
    )
    .execute()
)


In [0]:
%sql
-- Spot-check a few rows after the first merge.
-- There is no ORDER BY, so which five rows are returned is not guaranteed.
select * from sales.customer_details limit 5

customer_id,customer_name,region,province,customer_since
001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,Nick Crebassa,Prarie,Saskachewan,2009
00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9,Barry Pond,Yukon,Yukon,2010
0105946ed08fb4ccf5f3efcec88a83121bb5804c698b14f2ad20cf7dd2541f7f,Giulietta Weimer,West,British Columbia,2009
018fc79fefdcb46768279300fb6e32f4e1e9f627ae99e3a672244d3052462e9b,Ashley Jarboe,Yukon,Yukon,2009
01a041b0a5fbe79fb524be166ed14fc9f58054716ce63e9db05a67d35f767527,Christy Brittain,Yukon,Yukon,2009


In [0]:
# Same spot-check as the %sql cell above, issued through spark.sql and printed without truncation.
spark.sql(
'''
select * from sales.customer_details limit 5
'''
).show(truncate=False)

+----------------------------------------------------------------+----------------+------+----------------+--------------+
|customer_id                                                     |customer_name   |region|province        |customer_since|
+----------------------------------------------------------------+----------------+------+----------------+--------------+
|001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|Nick Crebassa   |Prarie|Saskachewan     |2009          |
|00fcf5c6646b01ce72417b854ad2ebcff2a73c4beec096c90dd9d1e60a7fd1e9|Barry Pond      |Yukon |Yukon           |2010          |
|0105946ed08fb4ccf5f3efcec88a83121bb5804c698b14f2ad20cf7dd2541f7f|Giulietta Weimer|West  |British Columbia|2009          |
|018fc79fefdcb46768279300fb6e32f4e1e9f627ae99e3a672244d3052462e9b|Ashley Jarboe   |Yukon |Yukon           |2009          |
|01a041b0a5fbe79fb524be166ed14fc9f58054716ce63e9db05a67d35f767527|Christy Brittain|Yukon |Yukon           |2009          |
+---------------

In [0]:
# Look this customer_id up in the source data; the captured output is empty, i.e. the id is
# not present. The same id is used further down as a brand-new record in the second merge.
customers_df.filter("customer_id = '1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e' ").show(truncate=False)

+-----------+-------------+------+--------+--------------+----------+----------+
|customer_id|customer_name|region|province|customer_since|order_date|row_number|
+-----------+-------------+------+--------+--------------+----------+----------+
+-----------+-------------+------+--------+--------------+----------+----------+



In [0]:
# Hand-built batch for the second merge. According to the lookups in the neighbouring cells,
# the first two customer_ids already exist in sales.customer_details and the third does not,
# so the batch is expected to exercise both the update and the insert branch.
new_customer_data = [
    ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a', "Dave Kipp", "Origia", "Yukon", 2009),
    ('1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e', "Charles McCrossin", "Northwest Territories", "Kia", 2010),
    ('1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e', "Andrew Tazas", "Texas", "Texas", 2022),
]

# Create DataFrame for new customer data.
# The schema is declared explicitly (as a DDL string) rather than inferred from the Python
# values: inference would type customer_since as bigint, whereas the target table column is
# INT. Declaring it keeps the merge source aligned with sales.customer_details.
new_customer_df = spark.createDataFrame(
    new_customer_data,
    schema="customer_id STRING, customer_name STRING, region STRING, province STRING, customer_since INT",
)

In [0]:
# Display the hand-built batch in full before merging it.
new_customer_df.show(truncate=False)

+----------------------------------------------------------------+-----------------+---------------------+--------+--------------+
|customer_id                                                     |customer_name    |region               |province|customer_since|
+----------------------------------------------------------------+-----------------+---------------------+--------+--------------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp        |Origia               |Yukon   |2009          |
|1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Charles McCrossin|Northwest Territories|Kia     |2010          |
|1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Andrew Tazas     |Texas                |Texas   |2022          |
+----------------------------------------------------------------+-----------------+---------------------+--------+--------------+



In [0]:
%sql
-- State of the three customer_ids from the new batch BEFORE the second merge
-- (the captured output shows two of them present and the third missing).
select * from sales.customer_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')

customer_id,customer_name,region,province,customer_since
07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a,Dave Kipp,West,British Columbia,2009
1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e,Charles McCrossin,Northwest Territories,Northwest Territories,2010


In [0]:
# Same pre-merge lookup as the %sql cell above, issued through spark.sql.
spark.sql(
'''
select * from sales.customer_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')
'''
).show(truncate=False)

+----------------------------------------------------------------+-----------------+---------------------+---------------------+--------------+
|customer_id                                                     |customer_name    |region               |province             |customer_since|
+----------------------------------------------------------------+-----------------+---------------------+---------------------+--------------+
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp        |West                 |British Columbia     |2009          |
|1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Charles McCrossin|Northwest Territories|Northwest Territories|2010          |
+----------------------------------------------------------------+-----------------+---------------------+---------------------+--------------+



In [0]:
# Upsert the hand-built batch into the target table, matching on customer_id:
# existing ids have their attributes overwritten, unknown ids are inserted.
(
    customer_details_instance.alias("Target")
    .merge(
        source=new_customer_df.alias("Source"),
        condition="Target.customer_id == Source.customer_id",
    )
    .whenMatchedUpdate(
        set={
            "customer_name": "Source.customer_name",
            "province": "Source.province",
            "region": "Source.region",
            "customer_since": "Source.customer_since",
        },
    )
    .whenNotMatchedInsert(
        values={
            "customer_id": "Source.customer_id",
            "customer_name": "Source.customer_name",
            "province": "Source.province",
            "region": "Source.region",
            "customer_since": "Source.customer_since",
        },
    )
    .execute()
)

In [0]:
%sql
-- Same three customer_ids AFTER the second merge: the captured output shows the two
-- existing rows carrying the new values and the third id now present.
select * from sales.customer_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')

customer_id,customer_name,region,province,customer_since
1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e,Charles McCrossin,Northwest Territories,Kia,2010
1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e,Andrew Tazas,Texas,Texas,2022
07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a,Dave Kipp,Origia,Yukon,2009


In [0]:
# Same post-merge lookup as the %sql cell above, issued through spark.sql.
spark.sql(
'''
select * from sales.customer_details where customer_id in ('07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a','1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e','1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e')
'''
).show(truncate=False)

+----------------------------------------------------------------+-----------------+---------------------+--------+--------------+
|customer_id                                                     |customer_name    |region               |province|customer_since|
+----------------------------------------------------------------+-----------------+---------------------+--------+--------------+
|1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Charles McCrossin|Northwest Territories|Kia     |2010          |
|1312e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603e|Andrew Tazas     |Texas                |Texas   |2022          |
|07c41f7f8da2ed563ce0452e1d066605494817333732bafb29d6c596e6aeb39a|Dave Kipp        |Origia               |Yukon   |2009          |
+----------------------------------------------------------------+-----------------+---------------------+--------+--------------+

