### Curation Layer

#### Importing libraries

In [None]:
import import_ipynb
from pyspark.sql.functions import col,sum,max,min,when,dense_rank,avg,count,countDistinct,year
from pyspark.sql.window import Window

#### Setting configs and path

In [None]:
from Projects.SHIMANO_CASE_STUDY.raw_layer.file_to_raw import read_input_datasets
from Projects.SHIMANO_CASE_STUDY.common.config import read_csv_file

#### Importing Dataset

In [None]:
# Load all input datasets with a single call. The unpacking order below must
# match the order in which read_input_datasets() returns its nine values.
(
    brands_df,
    categories_df,
    customer_df,
    order_items_df,
    orders_df,
    products_df,
    staffs_df,
    stocks_df,
    stores_df,
) = read_input_datasets()

#### Scenario Solving

##### Find the customer who has placed the highest total value of orders and calculate that value.

In [None]:
# Customer(s) whose order items add up to the largest summed list_price.
# dense_rank keeps every customer tied for first place instead of picking one.
# NOTE(review): the total is a plain sum of list_price; it does not apply the
# discount column (or a per-item quantity, if the dataset has one) — confirm
# this is the intended definition of "order value".
(
    customer_df
    .join(orders_df, customer_df.customer_id == orders_df.customer_id, "inner")
    .join(order_items_df, orders_df.order_id == order_items_df.order_id, "inner")
    .select(
        customer_df.customer_id,
        customer_df.first_name,
        customer_df.last_name,
        order_items_df.list_price,
    )
    .groupby("customer_id", "first_name", "last_name")
    .agg(sum("list_price").alias("total_value"))
    # No partitionBy: the ranking runs over the whole aggregated result.
    .withColumn("drnk", dense_rank().over(Window.orderBy(col("total_value").desc())))
    .where(col("drnk") == 1)
    .drop("drnk")
    .show()
)

##### List all products that have never been ordered along with their details.

In [None]:
# Products that never appear in any order item, with all of their details.
# A left-anti join keeps only the products_df rows that have no match in
# order_items_df and returns every products_df column, so the full product
# record is shown rather than just the id.
(
    products_df
    .join(
        order_items_df,
        products_df.product_id == order_items_df.product_id,
        "left_anti",
    )
    .show()
)

##### Find the store with the highest average order value.

In [None]:
# Store(s) with the highest average order value.
# The line items are first collapsed into one total per order; averaging
# list_price directly would give the average *item* price, not the average
# *order* value.
# NOTE(review): an order's value is the sum of its items' list_price; discount
# (and a per-item quantity, if the dataset has one) is not applied — confirm.
(
    stores_df
    .join(orders_df, on=stores_df.store_id == orders_df.store_id, how="inner")
    .join(order_items_df, on=orders_df.order_id == order_items_df.order_id, how="inner")
    .select(stores_df.store_id, orders_df.order_id, order_items_df.list_price)
    # Step 1: one row per (store, order) carrying that order's total.
    .groupby("store_id", "order_id")
    .agg(sum("list_price").alias("order_value"))
    # Step 2: average those order totals per store.
    .groupby("store_id")
    .agg(avg("order_value").alias("avg_order_value"))
    # dense_rank keeps every store tied for the top average.
    .withColumn("drnk", dense_rank().over(Window.orderBy(col("avg_order_value").desc())))
    .filter(col("drnk") == 1)
    .drop("drnk")
    .show()
)


##### Determine the total revenue generated by each store, considering the discounts applied on order items.

In [None]:
# Revenue per store net of discounts. `discount` is used as a fraction of
# list_price: each item contributes list_price - list_price * discount.
# NOTE(review): each order item is counted once; if order items carry a
# quantity, it is not multiplied in here — confirm against the schema.
(
    stores_df
    .join(orders_df, on=stores_df.store_id == orders_df.store_id, how="inner")
    .join(order_items_df, on=orders_df.order_id == order_items_df.order_id, how="inner")
    .select(
        stores_df.store_id,
        stores_df.store_name,
        order_items_df.list_price,
        order_items_df.discount,
    )
    .withColumn("discount_price", col("list_price") * col("discount"))
    .withColumn("actual_price", col("list_price") - col("discount_price"))
    .groupBy("store_id", "store_name")
    .agg(sum("actual_price").alias("total_rev"))
    .show()
)

##### Find the customer who has placed the most orders and list all the orders they have placed.

In [None]:
# Customer(s) with the highest number of distinct orders; dense_rank keeps
# every customer tied for first place.
top_customers = (
    orders_df
    .groupby("customer_id")
    .agg(countDistinct("order_id").alias("order_count"))
    .withColumn("drnk", dense_rank().over(Window.orderBy(col("order_count").desc())))
    .filter(col("drnk") == 1)
    .drop("drnk")
)

# Who placed the most orders, and how many.
top_customers.show()

# Every order placed by those customers, one row per order.
# A left-semi join returns only orders_df columns and never duplicates rows,
# whereas joining through order_items_df listed line items instead of orders.
# Joining on the column name also avoids comparing customer_id across two
# DataFrames that both derive from orders_df.
(
    orders_df
    .join(top_customers, on="customer_id", how="left_semi")
    .show()
)

##### Calculate the total sales per staff member and find the staff member with the highest sales.

In [None]:
# Total sales per staff member.
# Orders are matched to staff on staff_id. Matching on store_id would credit
# every order of a store to every staff member working there and inflate all
# totals.
# staff_id is part of the grouping key so two staff members who share a name
# are not merged into one row.
# NOTE(review): sales are a plain sum of list_price; discount (and a per-item
# quantity, if the dataset has one) is not applied — confirm.
total_sales_staff_df = (
    staffs_df
    .join(orders_df, on=staffs_df.staff_id == orders_df.staff_id, how="inner")
    .join(order_items_df, on=orders_df.order_id == order_items_df.order_id, how="inner")
    .select(
        staffs_df.staff_id,
        staffs_df.first_name,
        staffs_df.last_name,
        order_items_df.list_price,
    )
    .groupby("staff_id", "first_name", "last_name")
    .agg(sum("list_price").alias("sum_sales"))
)

# Staff member(s) with the highest total; dense_rank keeps ties, and the
# helper rank column is dropped before display.
(
    total_sales_staff_df
    .withColumn("drnk", dense_rank().over(Window.orderBy(col("sum_sales").desc())))
    .filter(col("drnk") == 1)
    .drop("drnk")
    .show()
)


##### Identify the product with the highest number of unique customers who have purchased it.

In [None]:
# Number of distinct customers that bought each product: products are linked
# to customers through order items and their parent orders.
total_unique_customer_product_df = (
    products_df
    .join(order_items_df, on=products_df.product_id == order_items_df.product_id, how="inner")
    .join(orders_df, on=order_items_df.order_id == orders_df.order_id, how="inner")
    .select(products_df.product_id, orders_df.customer_id)
    .groupBy("product_id")
    .agg(countDistinct("customer_id").alias("total_unique_customer"))
)

# Product(s) with the largest distinct-customer count; dense_rank keeps ties.
(
    total_unique_customer_product_df
    .withColumn(
        "drnk",
        dense_rank().over(Window.orderBy(col("total_unique_customer").desc())),
    )
    .where(col("drnk") == 1)
    .drop("drnk")
    .show()
)


##### Find the brand that has the highest number of products in the orders and the total quantity ordered for each product in that brand.

In [None]:
# Number of distinct ordered products per brand.
# Plain inner joins are used: the previous right join onto order_items_df was
# cancelled out by the following inner join on brand_id (rows without a
# product have a NULL brand_id and never match a brand).
top_products_orders_df = (
    order_items_df
    .join(products_df, on=order_items_df.product_id == products_df.product_id, how="inner")
    .join(brands_df, on=products_df.brand_id == brands_df.brand_id, how="inner")
    .select(brands_df.brand_id, brands_df.brand_name, order_items_df.product_id)
    .groupby("brand_id", "brand_name")
    .agg(countDistinct("product_id").alias("total_unique_product"))
)

# Brand(s) with the most distinct ordered products; dense_rank keeps ties.
top_brand_df = (
    top_products_orders_df
    .withColumn("drnk", dense_rank().over(Window.orderBy(col("total_unique_product").desc())))
    .filter(col("drnk") == 1)
    .drop("drnk")
)
top_brand_df.show()

# Second half of the scenario: total quantity ordered for each product of the
# top brand(s). The left-semi join only filters to products of those brands.
# NOTE(review): assumes order_items_df exposes a `quantity` column — confirm
# against the raw-layer schema.
(
    order_items_df
    .join(products_df, on=order_items_df.product_id == products_df.product_id, how="inner")
    .join(top_brand_df, on="brand_id", how="left_semi")
    .select(products_df.product_id, order_items_df.quantity)
    .groupby("product_id")
    .agg(sum("quantity").alias("total_quantity"))
    .show()
)

##### Retrieve the total sales and categorize them based on the year. If the year is 2017, label it as "Current Year", if it's 2016, label it as "Last Year", otherwise label it as "Previous Years". Also, calculate the total sales amount for each category.

In [None]:
# Total sales per order year, labelled as stated in the scenario:
# 2017 -> "Current Year", 2016 -> "Last Year", anything else -> "Previous Years".
# NOTE(review): sales are a plain sum of list_price; discount (and a per-item
# quantity, if the dataset has one) is not applied — confirm.
yearly_sales_df = (
    order_items_df
    .join(orders_df, on=order_items_df.order_id == orders_df.order_id, how="inner")
    .withColumn("year", year(col("order_date")))
    .select("year", order_items_df.list_price)
    .groupby("year")
    .agg(sum("list_price").alias("sum_sales"))
    .withColumn(
        "year_sales",
        when(col("year") == 2017, "Current Year")
        .when(col("year") == 2016, "Last Year")
        .otherwise("Previous Years"),
    )
)
yearly_sales_df.show()

# Total sales amount for each category; several years can share the
# "Previous Years" label, so the yearly totals are summed again per label.
(
    yearly_sales_df
    .groupby("year_sales")
    .agg(sum("sum_sales").alias("total_sales"))
    .show()
)

##### List all customers and categorize them based on their total spending. If the total spending is more than 10000, label them as "High Spenders", if it's between 5000 and 10000, label them as "Medium Spenders", otherwise label them as "Low Spenders".

In [None]:
# Spending tier for every customer.
# The query starts from customer_df with left joins so customers without any
# order are still listed; their missing total is filled with 0, which places
# them in "Low Spenders".
# Tiers: > 10000 "High Spenders", 5000..10000 inclusive "Medium Spenders",
# otherwise "Low Spenders".
# NOTE(review): spending is a plain sum of list_price; discount (and a
# per-item quantity, if the dataset has one) is not applied — confirm.
(
    customer_df
    .join(orders_df, on=customer_df.customer_id == orders_df.customer_id, how="left")
    .join(order_items_df, on=orders_df.order_id == order_items_df.order_id, how="left")
    .select(
        customer_df.customer_id,
        customer_df.first_name,
        customer_df.last_name,
        order_items_df.list_price,
    )
    .groupby("customer_id", "first_name", "last_name")
    .agg(sum("list_price").alias("sum_sales"))
    .fillna(0, subset=["sum_sales"])
    .withColumn(
        "customer_type",
        when(col("sum_sales") > 10000, "High Spenders")
        .when(col("sum_sales").between(5000, 10000), "Medium Spenders")
        .otherwise("Low Spenders"),
    )
    .show()
)

##### List the staff members and categorize them based on the number of customers they have handled. If the number of customers is more than 50, label them as "Top Performers"; if it is between 20 and 50, label them as "Average Performers"; otherwise label them as "Low Performers".

In [None]:
# Performance tier per staff member, based on the number of distinct customers
# whose orders they handled.
# Thresholds and labels follow the scenario statement: > 50 "Top Performers",
# 20..50 inclusive "Average Performers", otherwise "Low Performers".
# The query starts from staffs_df with a left join so staff without any order
# are still listed; countDistinct ignores NULLs, so they get 0 customers.
(
    staffs_df
    .join(orders_df, on=staffs_df.staff_id == orders_df.staff_id, how="left")
    .select(
        staffs_df.staff_id,
        staffs_df.first_name,
        staffs_df.last_name,
        orders_df.customer_id,
    )
    .groupby("staff_id", "first_name", "last_name")
    .agg(countDistinct("customer_id").alias("total_staff_customer"))
    .withColumn(
        "performer_type",
        when(col("total_staff_customer") > 50, "Top Performers")
        .when(col("total_staff_customer").between(20, 50), "Average Performers")
        .otherwise("Low Performers"),
    )
    .show()
)