In [0]:
file_location = "/FileStore/tables/customers_order.csv"
file_type = "csv"

In [0]:
infer_schema = '''
row_id string,
order_id string,
order_date string,
order_priority string,
order_quantity long,
sales float,
discount float,
ship_mode string,
profit float,
unit_price float,
shipping_cost float,
customer_name string,
province string,
region string,
customer_segment string,
product_category string,
product_sub_category string,
product_name string,
product_container string,
ship_date string
'''
first_row_is_header = "true"
delimiter = ","
mode = "PERMISSIVE"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
df = spark.read.format(file_type) \
  .option("mode",mode) \
  .schema(infer_schema) \
  .option("sep", delimiter) \
  .option("header",first_row_is_header) \
  .load(file_location)

In [0]:
df.limit(5).display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date
1,3,40464,Low,6,261.54,0.04,Regular Air,-213.25,38.94,35.0,Muhammed MacIntyre,Nunavut,Nunavut,Small Business,Office Supplies,Storage & Organization,"Eldon Base for stackable storage shelf, platinum",Large Box,20-10-2010
49,293,41183,High,49,10123.02,0.07,Delivery Truck,457.81,208.16,68.02,Barry French,Nunavut,Nunavut,Consumer,Office Supplies,Appliances,"""1.7 Cubic Foot Compact """"Cube"""" Office Refrigerators""",Jumbo Drum,02-10-2012
50,293,41183,High,27,244.57,0.01,Regular Air,46.71,8.69,2.99,Barry French,Nunavut,Nunavut,Consumer,Office Supplies,Binders and Binder Accessories,"Cardinal Slant-D� Ring Binder, Heavy Gauge Vinyl",Small Box,03-10-2012
80,483,40734,High,30,4965.7593,0.08,Regular Air,1198.97,195.99,3.99,Clay Rozendal,Nunavut,Nunavut,Corporate,Technology,Telephones and Communication,R380,Small Box,12-07-2011
85,515,40418,Not Specified,19,394.27,0.08,Regular Air,30.94,21.78,5.94,Carlos Soltero,Nunavut,Nunavut,Consumer,Office Supplies,Appliances,Holmes HEPA Air Purifier,Medium Box,30-08-2010


In [0]:
df.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_priority: string (nullable = true)
 |-- order_quantity: long (nullable = true)
 |-- sales: float (nullable = true)
 |-- discount: float (nullable = true)
 |-- ship_mode: string (nullable = true)
 |-- profit: float (nullable = true)
 |-- unit_price: float (nullable = true)
 |-- shipping_cost: float (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)
 |-- ship_date: string (nullable = true)



In [0]:
df = df.withColumn('ship_date', to_date('ship_date', 'dd-MM-yyyy'))

In [0]:
df.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_priority: string (nullable = true)
 |-- order_quantity: long (nullable = true)
 |-- sales: float (nullable = true)
 |-- discount: float (nullable = true)
 |-- ship_mode: string (nullable = true)
 |-- profit: float (nullable = true)
 |-- unit_price: float (nullable = true)
 |-- shipping_cost: float (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)
 |-- ship_date: date (nullable = true)



In [0]:
df=df.withColumn('loyalty_points',lit(50)) \
    .withColumn('loyalty_type', lit('Classic'))

In [0]:
df.limit(5).display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type
1,3,40464,Low,6,261.54,0.04,Regular Air,-213.25,38.94,35.0,Muhammed MacIntyre,Nunavut,Nunavut,Small Business,Office Supplies,Storage & Organization,"Eldon Base for stackable storage shelf, platinum",Large Box,2010-10-20,50,Classic
49,293,41183,High,49,10123.02,0.07,Delivery Truck,457.81,208.16,68.02,Barry French,Nunavut,Nunavut,Consumer,Office Supplies,Appliances,"""1.7 Cubic Foot Compact """"Cube"""" Office Refrigerators""",Jumbo Drum,2012-10-02,50,Classic
50,293,41183,High,27,244.57,0.01,Regular Air,46.71,8.69,2.99,Barry French,Nunavut,Nunavut,Consumer,Office Supplies,Binders and Binder Accessories,"Cardinal Slant-D� Ring Binder, Heavy Gauge Vinyl",Small Box,2012-10-03,50,Classic
80,483,40734,High,30,4965.7593,0.08,Regular Air,1198.97,195.99,3.99,Clay Rozendal,Nunavut,Nunavut,Corporate,Technology,Telephones and Communication,R380,Small Box,2011-07-12,50,Classic
85,515,40418,Not Specified,19,394.27,0.08,Regular Air,30.94,21.78,5.94,Carlos Soltero,Nunavut,Nunavut,Consumer,Office Supplies,Appliances,Holmes HEPA Air Purifier,Medium Box,2010-08-30,50,Classic


In [0]:
df.orderBy('customer_name').limit(10).display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type
606,4132,40691,Not Specified,5,14.76,0.01,Regular Air,1.32,2.88,0.5,Aaron Bergman,Nunavut,Nunavut,Corporate,Office Supplies,Labels,Avery 49,Small Box,2011-05-30,50,Classic
5087,36262,40386,Not Specified,23,136.81,0.01,Regular Air,-30.51,5.68,3.6,Aaron Bergman,Alberta,West,Corporate,Office Supplies,"Scissors, Rulers and Trimmers",Acme� Preferred Stainless Steel Scissors,Small Pack,2010-07-28,50,Classic
2847,20513,40001,High,13,42.27,0.01,Express Air,4.56,2.84,0.93,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Pens & Art Supplies,SANFORD Liquid Accent� Tank-Style Highlighters,Wrap Bag,2009-07-08,50,Classic
5086,36262,40386,Not Specified,23,164.02,0.03,Express Air,-47.64,6.68,6.15,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Paper,Xerox 1968,Small Box,2010-07-28,50,Classic
5088,36262,40386,Not Specified,26,4701.69,0.0,Express Air,1148.9,205.99,2.5,Aaron Bergman,Alberta,West,Corporate,Technology,Telephones and Communication,V70,Small Box,2010-07-27,50,Classic
5597,39682,40491,Medium,43,2337.89,0.09,Express Air,729.34,55.48,14.3,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Paper,Xerox 194,Small Box,2010-11-11,50,Classic
7549,54019,39995,Low,35,4233.15,0.08,Delivery Truck,1219.87,120.97,26.3,Aaron Bergman,Alberta,West,Corporate,Technology,Office Machines,Canon S750 Color Inkjet Printer,Jumbo Drum,2009-07-08,50,Classic
2228,16102,40160,Not Specified,25,13255.93,0.02,Delivery Truck,4089.27,500.98,26.0,Aaron Hawkins,Yukon,Yukon,Home Office,Furniture,Chairs & Chairmats,Global Troy� Executive Leather Low-Back Tilter,Jumbo Drum,2009-12-15,50,Classic
3776,26949,40040,Critical,48,460.69,0.06,Regular Air,-103.48,9.48,7.29,Aaron Hawkins,Quebec,Quebec,Home Office,Furniture,Office Furnishings,"DAX Two-Tone Rosewood/Black Document Frame, Desktop, 5 x 7",Small Pack,2009-08-17,50,Classic
6563,46662,40906,Critical,8,57.22,0.07,Regular Air,-27.72,6.48,6.6,Aaron Hawkins,British Columbia,West,Home Office,Office Supplies,Paper,Xerox 21,Small Box,2011-12-31,50,Classic


In [0]:
#create a unique hash value considering customer name as key column since we don't have customer ID
df = df.withColumn('customer_id',
                   sha2(
                       'customer_name',256
                        )
                   )

In [0]:
df.orderBy('customer_name').limit(10).display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id
606,4132,40691,Not Specified,5,14.76,0.01,Regular Air,1.32,2.88,0.5,Aaron Bergman,Nunavut,Nunavut,Corporate,Office Supplies,Labels,Avery 49,Small Box,2011-05-30,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473
5087,36262,40386,Not Specified,23,136.81,0.01,Regular Air,-30.51,5.68,3.6,Aaron Bergman,Alberta,West,Corporate,Office Supplies,"Scissors, Rulers and Trimmers",Acme� Preferred Stainless Steel Scissors,Small Pack,2010-07-28,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473
2847,20513,40001,High,13,42.27,0.01,Express Air,4.56,2.84,0.93,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Pens & Art Supplies,SANFORD Liquid Accent� Tank-Style Highlighters,Wrap Bag,2009-07-08,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473
5086,36262,40386,Not Specified,23,164.02,0.03,Express Air,-47.64,6.68,6.15,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Paper,Xerox 1968,Small Box,2010-07-28,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473
5088,36262,40386,Not Specified,26,4701.69,0.0,Express Air,1148.9,205.99,2.5,Aaron Bergman,Alberta,West,Corporate,Technology,Telephones and Communication,V70,Small Box,2010-07-27,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473
5597,39682,40491,Medium,43,2337.89,0.09,Express Air,729.34,55.48,14.3,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Paper,Xerox 194,Small Box,2010-11-11,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473
7549,54019,39995,Low,35,4233.15,0.08,Delivery Truck,1219.87,120.97,26.3,Aaron Bergman,Alberta,West,Corporate,Technology,Office Machines,Canon S750 Color Inkjet Printer,Jumbo Drum,2009-07-08,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473
2228,16102,40160,Not Specified,25,13255.93,0.02,Delivery Truck,4089.27,500.98,26.0,Aaron Hawkins,Yukon,Yukon,Home Office,Furniture,Chairs & Chairmats,Global Troy� Executive Leather Low-Back Tilter,Jumbo Drum,2009-12-15,50,Classic,9f7b79797209ed1fb0f0348b09c1a0eed1a119d9b648f48bae63191a85203458
3776,26949,40040,Critical,48,460.69,0.06,Regular Air,-103.48,9.48,7.29,Aaron Hawkins,Quebec,Quebec,Home Office,Furniture,Office Furnishings,"DAX Two-Tone Rosewood/Black Document Frame, Desktop, 5 x 7",Small Pack,2009-08-17,50,Classic,9f7b79797209ed1fb0f0348b09c1a0eed1a119d9b648f48bae63191a85203458
6563,46662,40906,Critical,8,57.22,0.07,Regular Air,-27.72,6.48,6.6,Aaron Hawkins,British Columbia,West,Home Office,Office Supplies,Paper,Xerox 21,Small Box,2011-12-31,50,Classic,9f7b79797209ed1fb0f0348b09c1a0eed1a119d9b648f48bae63191a85203458


In [0]:
df = df.withColumn('total_orders_count',
                  sum('order_quantity').over(Window.partitionBy('customer_id'))
                   )

In [0]:
df.filter("customer_id = 'cbb9188c2ae3ed75981b8a7c44317764a6aae1c64da3518947f6e9c87c985ed1'").display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id,total_orders_count


In [0]:
#Convert date columns into panda datatime
# the column 'Order Date' is having unix  time cast as double which has to be converted to datetime format
df = df.withColumn('order_date',from_unixtime((col('order_date') - 25569) * 86400.0))

In [0]:
df.orderBy('customer_name').limit(10).display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id,total_orders_count
606,4132,2011-05-28 00:00:00,Not Specified,5,14.76,0.01,Regular Air,1.32,2.88,0.5,Aaron Bergman,Nunavut,Nunavut,Corporate,Office Supplies,Labels,Avery 49,Small Box,2011-05-30,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473,168
2847,20513,2009-07-07 00:00:00,High,13,42.27,0.01,Express Air,4.56,2.84,0.93,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Pens & Art Supplies,SANFORD Liquid Accent� Tank-Style Highlighters,Wrap Bag,2009-07-08,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473,168
5086,36262,2010-07-27 00:00:00,Not Specified,23,164.02,0.03,Express Air,-47.64,6.68,6.15,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Paper,Xerox 1968,Small Box,2010-07-28,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473,168
5087,36262,2010-07-27 00:00:00,Not Specified,23,136.81,0.01,Regular Air,-30.51,5.68,3.6,Aaron Bergman,Alberta,West,Corporate,Office Supplies,"Scissors, Rulers and Trimmers",Acme� Preferred Stainless Steel Scissors,Small Pack,2010-07-28,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473,168
5088,36262,2010-07-27 00:00:00,Not Specified,26,4701.69,0.0,Express Air,1148.9,205.99,2.5,Aaron Bergman,Alberta,West,Corporate,Technology,Telephones and Communication,V70,Small Box,2010-07-27,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473,168
5597,39682,2010-11-09 00:00:00,Medium,43,2337.89,0.09,Express Air,729.34,55.48,14.3,Aaron Bergman,Alberta,West,Corporate,Office Supplies,Paper,Xerox 194,Small Box,2010-11-11,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473,168
7549,54019,2009-07-01 00:00:00,Low,35,4233.15,0.08,Delivery Truck,1219.87,120.97,26.3,Aaron Bergman,Alberta,West,Corporate,Technology,Office Machines,Canon S750 Color Inkjet Printer,Jumbo Drum,2009-07-08,50,Classic,f2d69f1bfbac52e4237b5c8564ec8af969b383bde3734cca607a6358b592c473,168
3866,27559,2011-10-30 00:00:00,High,38,465.9,0.05,Regular Air,79.34,12.28,4.86,Aaron Hawkins,Nova Scotia,Atlantic,Home Office,Office Supplies,Paper,Xerox 1933,Small Box,2011-10-31,50,Classic,9f7b79797209ed1fb0f0348b09c1a0eed1a119d9b648f48bae63191a85203458,370
6564,46662,2011-12-29 00:00:00,Critical,33,162.0,0.01,Regular Air,45.84,4.84,0.71,Aaron Hawkins,British Columbia,West,Home Office,Office Supplies,Pens & Art Supplies,*Staples* Highlighting Markers,Wrap Bag,2011-12-31,50,Classic,9f7b79797209ed1fb0f0348b09c1a0eed1a119d9b648f48bae63191a85203458,370
2876,20737,2011-12-25 00:00:00,Medium,10,1410.93,0.08,Delivery Truck,-317.48,140.98,36.09,Aaron Hawkins,Quebec,Quebec,Home Office,Furniture,Bookcases,"Sauder Forest Hills Library, Woodland Oak Finish",Jumbo Box,2011-12-26,50,Classic,9f7b79797209ed1fb0f0348b09c1a0eed1a119d9b648f48bae63191a85203458,370


In [0]:
df.filter("customer_id = 'cbb9188c2ae3ed75981b8a7c44317764a6aae1c64da3518947f6e9c87c985ed1'").display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id,total_orders_count


In [0]:
df = df.withColumn('customer_since',
                  min(year('order_date')).over(Window.partitionBy('customer_id'))
                   )

In [0]:
df.filter("customer_id = 'cbb9188c2ae3ed75981b8a7c44317764a6aae1c64da3518947f6e9c87c985ed1'").display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id,total_orders_count,customer_since


In [0]:
current_year = year(current_date())
df = df.withColumn('total_years',current_year - col('customer_since' ))

In [0]:
df.filter("customer_id = 'cbb9188c2ae3ed75981b8a7c44317764a6aae1c64da3518947f6e9c87c985ed1'").display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id,total_orders_count,customer_since,total_years


In [0]:
df = df.withColumn('loyalty_points',
                  when(col('total_orders_count')> 500,(count('customer_id').over(Window.partitionBy('customer_id')) * 50) + 200) 
                  .when((col('total_orders_count')>= 500) & (col('total_orders_count')< 1500),(count('customer_id').over(Window.partitionBy('customer_id')) * 50) + 700)
                  .when((col('total_orders_count')>= 1500) & (col('total_orders_count')< 2000),(count('customer_id').over(Window.partitionBy('customer_id')) * 50) + 1200)
                  .when((col('total_orders_count')>= 2000) & (col('total_orders_count')< 2500),(count('customer_id').over(Window.partitionBy('customer_id')) * 50) + 1700)
                  .when((col('total_orders_count')>= 2500) & (col('total_orders_count')< 3000),(count('customer_id').over(Window.partitionBy('customer_id')) * 50) + 2200)
                  .otherwise(count('customer_id').over(Window.partitionBy('customer_id')) * 50)
                   )

In [0]:
df.filter("customer_id = 'cbb9188c2ae3ed75981b8a7c44317764a6aae1c64da3518947f6e9c87c985ed1'").display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id,total_orders_count,customer_since,total_years


In [0]:
df = df.withColumn('loyalty_type',
                  when(((col('total_orders_count')>= 1) & (col('total_orders_count')< 300)) & ((col('total_years')>=12) & (col('total_years')<=13)) & ((col('loyalty_points')>=50) & (col('loyalty_points')<600)),'Gold') 
                  .when(((col('total_orders_count')>= 300) & (col('total_orders_count')< 500)) & ((col('total_years')>13) & (col('total_years')<=14)) & ((col('loyalty_points')>=600) & (col('loyalty_points')<1000)),'Platinum') 
                  .when(((col('total_orders_count')>= 500) & (col('total_orders_count')< 1500)) & ((col('total_years')>14) & (col('total_years')<=15) )& ((col('loyalty_points')>=1000) & (col('loyalty_points')<2000)),'Daimond') 
                  .when(((col('total_orders_count')>= 1500)) & ((col('total_years')>15)) & (col('loyalty_points')>=2000),'Elite') 
                  .otherwise(col('loyalty_type'))
                   )

In [0]:
total_customers_count = df.count()
classic_count = df.filter("loyalty_type = 'Classic'").count()
gold_count = df.filter("loyalty_type = 'Gold'").count()
platinum_count = df.filter("loyalty_type = 'Platinum'").count()
daimond_count = df.filter("loyalty_type = 'Daimond'").count()
elite_count = df.filter("loyalty_type = 'Elite'").count()

In [0]:
print("total customers : ",total_customers_count)
print("classic customers : ",classic_count)
print("gold customers : ",gold_count)
print("platinum customers : ",platinum_count)
print("daimond customers : ",daimond_count)
print("elite customers : ",elite_count)

total customers :  8399
classic customers :  6436
gold customers :  142
platinum customers :  83
daimond customers :  1738
elite customers :  0


In [0]:
df.limit(1).display()

row_id,order_id,order_date,order_priority,order_quantity,sales,discount,ship_mode,profit,unit_price,shipping_cost,customer_name,province,region,customer_segment,product_category,product_sub_category,product_name,product_container,ship_date,loyalty_points,loyalty_type,customer_id,total_orders_count,customer_since,total_years
2533,18374,2012-09-30 00:00:00,Low,21,150.16,0.09,Regular Air,-32.78,7.28,5.47,Nick Crebassa,Saskachewan,Prarie,Corporate,Office Supplies,Paper,Southworth Structures Collection�,Small Box,2012-10-02,650,Classic,001492ac094ba3c986a45e7799a0409db64ed3c856e39cf16854012aa468fa01,432,2009,15


In [0]:
df.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_priority: string (nullable = true)
 |-- order_quantity: long (nullable = true)
 |-- sales: float (nullable = true)
 |-- discount: float (nullable = true)
 |-- ship_mode: string (nullable = true)
 |-- profit: float (nullable = true)
 |-- unit_price: float (nullable = true)
 |-- shipping_cost: float (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region: string (nullable = true)
 |-- customer_segment: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_sub_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_container: string (nullable = true)
 |-- ship_date: date (nullable = true)
 |-- loyalty_points: long (nullable = false)
 |-- loyalty_type: string (nullable = false)
 |-- customer_id: string (nullable = true)
 |-- total_

In [0]:
df.write \
    .mode('overwrite') \
    .option('path',"/FileStore/tables/customers_refined.csv") \
    .save()


create  below tables

customer_details will have customer_name, region, province, customer_segment and customer_id where details such as region, province and customer_segment 
will be overwritten as those might change over time

order_details will have order_id, order_date, ship_date, order_priority, order_quantity, customer_id, 
order_status(in progress with null in ship_date if delivery in progress 
if cancelled then cancelled in order_status with null in ship_date
if delivered then completed in order_status with delivered date in ship_date) 
when a customer does new order it will be inserted as new row with current order_date and ship_date

product_details will have customer_id, order_id, product_id, product_category, product_sub_category, product_name, product_container, product_order_date (order_date) 
--when a customer does new order and if order_status = 'Completed' then only it will be inserted in this table as new row with updated values in columns against that customer_id

loyalty_details : customer_id, customer_since, total_years, total_orders_count, loyalty_points and loyalty_type 
---will have latest loyalty_details against each customer_id 

customer_loyalty_dtls_hist : customer_id, region, province, customer_segment, total_orders_count, total_years, customer_since, loyalty_points, loyalty_type 
with additional columns start_date and end_date where all the rows will be inserted as new rows with updated start_date and end_date

future enhancement: to have customer_status like active, inactive, suspended, service_blocked, etc. 
,to have root and line items
,to categorize products into attributes
