In [0]:
# Storage path of the refined customers dataset loaded by this notebook.
# NOTE(review): the path ends in ".csv" but the load cell reads it with format('delta') — confirm it is a Delta directory.
file_location = "/FileStore/tables/customers_refined.csv"
# NOTE(review): file_type is not referenced by any cell visible in this notebook — confirm whether it is still needed.
file_type = "csv"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Read the dataset stored at file_location using the Delta reader.
df = spark.read.load(file_location, format='delta')

In [0]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_priority: string (nullable = true)
 |-- order_quantity: long (nullable = true)
 |-- sales: float (nullable = true)
 |-- discount: float (nullable = true)
 |-- ship_mode: string (nullable = true)
 |-- profit: float (nullable = true)
 |-- unit_price: float (nullable = true)
 |-- shipping_cost: float (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)
 |-- ship_date: date (nullable = true)
 |-- loyalty_points: long (nullable = true)
 |-- loyalty_type: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- total_or

In [0]:
# Keep only the identifying columns and the product attributes.
product_df = df.select(
    'row_id',
    'customer_id',
    'order_id',
    'order_date',
    'product_category',
    'product_sub_category',
    'product_name',
    'product_container',
)

In [0]:
# Preview the first five product rows.
product_df.limit(5).display()

row_id,customer_id,order_id,order_date,product_category,product_sub_category,product_name,product_container
2533,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,18374,2012-09-30 00:00:00,Office Supplies,Paper,Southworth Structures Collection�,Small Box
3548,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,25314,2012-02-24 00:00:00,Technology,Telephones and Communication,Phone 918,Small Box
4364,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16 00:00:00,Office Supplies,Labels,Avery 494,Small Box
4365,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16 00:00:00,Furniture,Office Furnishings,"Eldon� Executive Woodline II Desk Accessories, Mahogany",Small Box
4366,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16 00:00:00,Office Supplies,"Scissors, Rulers and Trimmers",Acme� Preferred Stainless Steel Scissors,Small Pack


In [0]:
# Normalise column types: the two id columns become long and the
# order_date string becomes a date. Replacing an existing column with
# withColumn keeps its position, so the column order is unchanged.
product_df = (
    product_df
    .withColumn("row_id", col("row_id").cast("long"))
    .withColumn("order_id", col("order_id").cast("long"))
    .withColumn("order_date", to_date("order_date"))
)

In [0]:
# Confirm the casts: row_id / order_id should now be long and order_date a date.
product_df.printSchema()

root
 |-- row_id: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)



In [0]:
# Product details are updated as and when a customer makes a purchase;
# two sample purchases are staged here.
# row_id and order_id are declared as LongType so the staged rows have the
# same types as the long-typed columns of product_df, instead of relying on
# implicit int -> long widening when the two frames are combined.
schema = StructType([
    StructField("row_id", LongType(), True),
    StructField("customer_id", StringType(), True),
    StructField("order_id", LongType(), True),
    # Loaded as a string and converted to a date once the frame is built.
    StructField("order_date", StringType(), True),
    StructField("product_category", StringType(), True),
    StructField("product_sub_category", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("product_container", StringType(), True),
])
new_products_data = [
    (8400, "07de8c2693a047311aac3bb2e1a7bf301d9e290b04431926048d92a40e3bbfd1", 59974, '2024-04-28', 'Furniture', "Office Furnitures", "Desk accessories", "Small Box"),
    (8401, "1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603f", 59975, '2024-04-30', 'Technology', "Communication", "Telephone", "Small Box"),
]

# Create the DataFrame for the new product rows and parse order_date into a date.
new_products_df = (
    spark.createDataFrame(new_products_data, schema)
    .withColumn("order_date", to_date("order_date"))
)

In [0]:
# Append the staged rows to the existing product rows.
# unionByName matches columns by name, whereas union() matches purely by
# position; a change in column order on either side therefore cannot
# silently put values into the wrong column.
# NOTE: this cell rebuilds product_df from itself, so running it more than
# once appends the staged rows again.
product_df = product_df.unionByName(new_products_df)

In [0]:
# Expose product_df to SQL cells under the temporary view name products_dtls_vw.
product_df.createOrReplaceTempView('products_dtls_vw')

In [0]:
%sql
-- Number of order rows whose status is 'Completed'.
SELECT count(*)
FROM sales.orders_details
WHERE order_status = 'Completed'

count(1)
8115


In [0]:
%sql
-- All order rows recorded for one specific customer.
SELECT *
FROM sales.orders_details
WHERE customer_id = '1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603f'

row_id,customer_id,order_id,order_date,ship_date,order_priority,order_quantity,order_status
8402,1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603f,59975,2024-04-30,,Low,10,In Progress
8401,1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603f,59975,2024-04-30,,Low,10,In Progress


In [0]:
%sql
-- Preview product rows whose matching order row (same row_id) is 'Completed'.
-- The order date is taken from the orders table; all other columns come from the view.
SELECT
  p.row_id,
  p.customer_id,
  p.order_id,
  o.order_date,
  p.product_category,
  p.product_sub_category,
  p.product_name,
  p.product_container
FROM sales.orders_details AS o
INNER JOIN products_dtls_vw AS p
  ON o.row_id = p.row_id
WHERE o.order_status = 'Completed'
LIMIT 5

row_id,customer_id,order_id,order_date,product_category,product_sub_category,product_name,product_container
2533,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,18374,2012-09-30,Office Supplies,Paper,Southworth Structures Collection�,Small Box
3548,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,25314,2012-02-24,Technology,Telephones and Communication,Phone 918,Small Box
4364,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16,Office Supplies,Labels,Avery 494,Small Box
4365,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16,Furniture,Office Furnishings,"Eldon� Executive Woodline II Desk Accessories, Mahogany",Small Box
4366,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16,Office Supplies,"Scissors, Rulers and Trimmers",Acme� Preferred Stainless Steel Scissors,Small Pack


In [0]:
%sql
-- Target Delta table for product details, stored at an explicit location.
-- IF NOT EXISTS is used instead of OR REPLACE: replacing the table on every
-- run would discard the rows already loaded, which defeats the incremental
-- "only rows not yet in sales.product_details" load performed by the later cells.
CREATE TABLE IF NOT EXISTS sales.product_details
(
    product_id LONG,
    customer_id STRING,
    product_order_id LONG,
    product_order_date DATE,
    product_category STRING,
    product_sub_category STRING,
    product_name STRING,
    product_container STRING
)
USING DELTA
LOCATION '/FileStore/tables/delta-table-merge/product_details';

In [0]:
# Select product rows whose matching order (same row_id) is 'Completed' and
# that are not yet present in sales.product_details, renamed to the target
# table's column names.
# NOT EXISTS is used rather than NOT IN: with NOT IN, a single NULL
# product_id in the target table makes the predicate evaluate to NULL for
# every row, so nothing would ever be selected.
products_order_df = spark.sql(
'''
select
    p.row_id as product_id,
    p.customer_id,
    p.order_id as product_order_id,
    o.order_date as product_order_date,
    p.product_category,
    p.product_sub_category,
    p.product_name,
    p.product_container
from sales.orders_details o
join products_dtls_vw p
    on o.row_id = p.row_id
where o.order_status = 'Completed'
and not exists (
    select 1
    from sales.product_details d
    where d.product_id = p.row_id
)
'''
)

In [0]:
# Show the first four selected rows without truncating long values.
products_order_df.show(n=4, truncate=False)

+----------+----------------------------------------------------------------+----------------+------------------+----------------+----------------------------+-------------------------------------------------------+-----------------+
|product_id|customer_id                                                     |product_order_id|product_order_date|product_category|product_sub_category        |product_name                                           |product_container|
+----------+----------------------------------------------------------------+----------------+------------------+----------------+----------------------------+-------------------------------------------------------+-----------------+
|2533      |001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01|18374           |2012-09-30        |Office Supplies |Paper                       |Southworth Structures Collection�                      |Small Box        |
|3548      |001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854

In [0]:
# Print the schema of the rows about to be written, for comparison with the target table definition.
products_order_df.printSchema()

root
 |-- product_id: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_order_id: long (nullable = true)
 |-- product_order_date: date (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)



In [0]:
# Duplicate check: list every product_id that occurs more than once.
# An empty result means product_id is unique in the rows to be written.
(
    products_order_df
    .groupBy("product_id")
    .count()
    .filter("count>1")
    .show()
)

+----------+-----+
|product_id|count|
+----------+-----+
+----------+-----+



In [0]:
# Append the selected rows to the Delta files at the location that backs
# sales.product_details.
(
    products_order_df.write
    .format("delta")
    .mode("append")
    .save("/FileStore/tables/delta-table-merge/product_details")
)

In [0]:
%sql
-- Preview five rows of the target table after the append.
SELECT *
FROM sales.product_details
LIMIT 5

product_id,customer_id,product_order_id,product_order_date,product_category,product_sub_category,product_name,product_container
2533,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,18374,2012-09-30,Office Supplies,Paper,Southworth Structures Collection�,Small Box
3548,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,25314,2012-02-24,Technology,Telephones and Communication,Phone 918,Small Box
4364,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16,Office Supplies,Labels,Avery 494,Small Box
4365,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16,Furniture,Office Furnishings,"Eldon� Executive Woodline II Desk Accessories, Mahogany",Small Box
4366,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,31106,2012-02-16,Office Supplies,"Scissors, Rulers and Trimmers",Acme� Preferred Stainless Steel Scissors,Small Pack


In [0]:
%sql
-- Duplicate check on the target table: any product_id stored more than once.
SELECT
  count(product_id),
  product_id
FROM sales.product_details
GROUP BY product_id
HAVING count(product_id) > 1

count(product_id),product_id


In [0]:
%sql
-- Rows in the target table for the two customers whose purchases were staged in this notebook.
SELECT *
FROM sales.product_details
WHERE customer_id IN (
  '1212e8749bdc3800d2514d8ae53c10b8cbe3a9e4355de5db35f746c510c4603f',
  '07de8c2693a047311aac3bb2e1a7bf301d9e290b04431926048d92a40e3bbfd1'
)

product_id,customer_id,product_order_id,product_order_date,product_category,product_sub_category,product_name,product_container
8400,07de8c2693a047311aac3bb2e1a7bf301d9e290b04431926048d92a40e3bbfd1,59974,2024-04-28,Furniture,Office Furnitures,Desk accessories,Small Box
