### Curation Layer

#### Importing libraries

In [None]:
import import_ipynb
from pyspark.sql.functions import col,sum,max,min,when,dense_rank,avg,count,countDistinct
from pyspark.sql.window import Window

#### Setting configs and path

In [None]:
from Projects.SHIMANO_CASE_STUDY.raw_layer.file_to_raw import read_input_datasets
from Projects.SHIMANO_CASE_STUDY.common.config import read_csv_file

#### Importing Dataset

In [None]:
# Load the nine case-study CSV files, one DataFrame per file, via read_csv_file.
# The dataset directory is defined once so that relocating the data means
# editing a single line instead of nine string literals.
# Alternative loader kept from the original notebook for reference:
# brands_df,categories_df,customer_df,order_items_df,orders_df,products_df,staffs_df,stocks_df,stores_df = read_input_datasets()
DATASET_DIR = "/Users/sahilnagpal/Desktop/coding/competitive-programming/Projects/SHIMANO_CASE_STUDY/dataset"

brands_df = read_csv_file(path=f"{DATASET_DIR}/brands.csv")
categories_df = read_csv_file(path=f"{DATASET_DIR}/categories.csv")
customer_df = read_csv_file(path=f"{DATASET_DIR}/customers.csv")
order_items_df = read_csv_file(path=f"{DATASET_DIR}/order_items.csv")
orders_df = read_csv_file(path=f"{DATASET_DIR}/orders.csv")
products_df = read_csv_file(path=f"{DATASET_DIR}/products.csv")
staffs_df = read_csv_file(path=f"{DATASET_DIR}/staffs.csv")
stocks_df = read_csv_file(path=f"{DATASET_DIR}/stocks.csv")
stores_df = read_csv_file(path=f"{DATASET_DIR}/stores.csv")

#### Scenario Solving

##### Find the customer who has placed the highest total value of orders and calculate that value.

In [None]:
# Customer(s) with the highest total order value.
# The value of an order line is what was actually charged for it:
#   quantity * list_price * (1 - discount)
# Summing the unit list_price alone ignores both the number of units ordered
# and the discount applied.
# NOTE(review): assumes order_items has a `quantity` column (standard
# BikeStores schema) - confirm against order_items.csv.
# dense_rank (rather than a plain limit) keeps every customer tied for first
# place. The window has no partitionBy, so the ranking runs over the whole
# aggregate; that aggregate holds one row per customer.
customer_df\
    .join(orders_df,customer_df.customer_id==orders_df.customer_id,how='inner')\
    .join(order_items_df,orders_df.order_id==order_items_df.order_id,how='inner')\
    .select(
        customer_df.customer_id,
        customer_df.first_name,
        customer_df.last_name,
        (order_items_df.quantity*order_items_df.list_price*(1-order_items_df.discount)).alias("item_value"),
    )\
    .groupBy("customer_id","first_name","last_name")\
    .agg(sum("item_value").alias("total_value"))\
    .withColumn("drnk",dense_rank().over(Window.orderBy(col("total_value").desc())))\
    .filter(col("drnk")==1)\
    .drop("drnk")\
    .show()

##### List all products that have never been ordered along with their details.

In [None]:
# Products that never appear in any order item, with all their details.
# A left_anti join keeps only the products_df rows that have no matching
# product_id in order_items_df and returns only the products_df columns,
# so every product attribute is shown rather than just product_id.
products_df\
    .join(order_items_df,on="product_id",how="left_anti")\
    .show()

##### Find the store with the highest average order value.

In [None]:
# Store(s) with the highest average order value.
# Step 1: collapse order items to one net value per order. Averaging the raw
# list_price of individual item rows gives an average *item price*, not an
# average *order value*.
# NOTE(review): assumes order_items has a `quantity` column (standard
# BikeStores schema) - confirm against order_items.csv.
order_values = order_items_df\
    .groupBy("order_id")\
    .agg(sum(col("quantity")*col("list_price")*(1-col("discount"))).alias("order_value"))

# Step 2: average the per-order values for each store and keep the top-ranked
# store(s). dense_rank preserves ties for first place.
stores_df\
    .join(orders_df, on=stores_df.store_id == orders_df.store_id,how="inner")\
    .join(order_values,on=orders_df.order_id== order_values.order_id,how="inner")\
    .select(stores_df.store_id,stores_df.store_name,order_values.order_value)\
    .groupby("store_id","store_name")\
    .agg(avg("order_value").alias("avg_order_value"))\
    .withColumn("drnk",dense_rank().over(Window.orderBy(col("avg_order_value").desc())))\
    .filter(col("drnk")==1)\
    .drop("drnk")\
    .show()


##### Determine the total revenue generated by each store, considering the discounts applied on order items.

In [None]:
# Total revenue per store, net of the discounts applied on order items.
# Revenue of one order line = quantity * list_price * (1 - discount); the
# unit price alone under-counts every line where more than one unit was sold.
# NOTE(review): assumes order_items has a `quantity` column (standard
# BikeStores schema) - confirm against order_items.csv.
stores_df\
    .join(orders_df, on=stores_df.store_id == orders_df.store_id,how="inner")\
    .join(order_items_df,on=orders_df.order_id== order_items_df.order_id,how="inner")\
    .select(
        stores_df.store_id,
        stores_df.store_name,
        (order_items_df.quantity*order_items_df.list_price*(1-order_items_df.discount)).alias("net_revenue"),
    )\
    .groupby(col("store_id"),col("store_name"))\
    .agg(sum("net_revenue").alias("total_rev"))\
    .show()

##### Find the customer who has placed the most orders and list all the orders they have placed.

In [None]:
# Customer(s) with the most orders. dense_rank keeps every customer tied for
# first place.
top_customers = orders_df\
    .groupby("customer_id")\
    .agg(countDistinct("order_id").alias("order_count"))\
    .withColumn("drnk",dense_rank().over(Window.orderBy(col("order_count").desc())))\
    .filter(col("drnk")==1)\
    .drop("drnk")


# List every order placed by those customer(s), one row per order.
# top_customers is derived from orders_df, so joining the two back together
# with DataFrame-qualified columns (top_customers.customer_id ==
# orders_df.customer_id) is a self-join that Spark can reject as ambiguous.
# Joining on the column name avoids that. The order rows come straight from
# orders_df, so each order appears once with its details instead of once per
# order item.
orders_df\
    .join(top_customers,on="customer_id",how="inner")\
    .show()