### Curation Layer

Import the required libraries and helper functions.

In [None]:
from config import spark_session,read_csv_file
from pyspark.sql.functions import col, when,avg,date_format,to_date,sum,dense_rank,round
from pyspark.sql.window import Window

In [None]:
# Obtain the session object from the project's `config.spark_session` helper.
# NOTE(review): presumably this returns a configured SparkSession - confirm in config.py.
spark = spark_session()

Read the raw data.

In [None]:
# Absolute location of the raw Walmart sales CSV that every analysis below starts from.
raw_sales_csv_path = "/Users/sahilnagpal/Desktop/coding/competitive-programming/Projects/WALMART_SALE_PERUSAL/input_data/WALMART_SALES_DATA.csv"

# Load the CSV through the project's `read_csv_file` helper.
curated_df = read_csv_file(path=raw_sales_csv_path)

### Curation Analytics

Average weekly sales for each store.

In [None]:
# One row per store, carrying the mean of its Weekly_Sales values.
mean_weekly_sales = avg("Weekly_Sales").alias("Avg_Weekly_Sales")
avg_weekly_sale_each_store_df = curated_df.groupBy("Store").agg(mean_weekly_sales)


Top Performing Stores

In [None]:
# Rank stores from the highest to the lowest average weekly sales and keep the
# first five ranks. The ordering must be descending: with the default ascending
# order, rank 1 would be the store with the *lowest* average.
# dense_rank() keeps ties, so more than five rows are returned when several
# stores share an average inside the top five ranks.
# The window has no partitionBy, so Spark moves all rows to a single partition;
# that is fine here because the input holds one row per store.
highest_avg_first = Window.orderBy(col("Avg_Weekly_Sales").desc())

top_5_avg_weekly_sale_each_store_df = (
    avg_weekly_sale_each_store_df
    .withColumn("rnk", dense_rank().over(highest_avg_first))
    .filter(col("rnk") <= 5)
)

Sales Comparison During Holidays vs. Non-Holidays

In [None]:
def _avg_sales_for_flag(flag_value, store_alias, sales_alias):
    """Average Weekly_Sales per store over the rows whose Holiday_Flag equals `flag_value`.

    The store key is exposed as `store_alias` and the average as `sales_alias`,
    so the holiday and non-holiday results can be joined without name clashes.
    """
    return (
        curated_df
        .select("Store", "Weekly_Sales", "Holiday_Flag")
        .filter(col("Holiday_Flag") == flag_value)
        .groupby(col("Store").alias(store_alias))
        .agg(avg("Weekly_Sales").alias(sales_alias))
    )


# Per-store averages for non-holiday weeks (flag 0) and holiday weeks (flag 1).
non_holiday_df = _avg_sales_for_flag(0, "nh_store_id", "sales_during_non_holiday")
holiday_df = _avg_sales_for_flag(1, "h_store_id", "sales_during_holiday")

# Label each store by which kind of week has the higher average; anything that
# is not strictly "non-holiday greater" falls into the holiday label.
same_store = non_holiday_df.nh_store_id == holiday_df.h_store_id
sales_affect = when(
    col("sales_during_non_holiday") > col("sales_during_holiday"),
    "more_non_holiday_sale",
).otherwise("more_holiday_sale")

# Inner join: only stores present in both aggregates are compared.
holiday_vs_non_holiday_df = (
    non_holiday_df
    .join(holiday_df, on=same_store, how="inner")
    .withColumn("sales_affect", sales_affect)
    .select(
        col("h_store_id").alias("store_id"),
        "sales_during_non_holiday",
        "sales_during_holiday",
        "sales_affect",
    )
)

holiday_vs_non_holiday_df.show(truncate=False)

# How many stores fall under each label.
label_counts_df = (
    holiday_vs_non_holiday_df
    .select("sales_affect")
    .groupby("sales_affect")
    .count()
)
label_counts_df.show(truncate=False)



Monthly Sales Trend

In [None]:
# Parse the "dd-MM-yyyy" text dates once, then derive zero-padded month ("MM")
# and four-digit year ("yyyy") strings to group on.
parsed_sale_date = to_date(col("Date"), "dd-MM-yyyy")

# `sum` here is pyspark.sql.functions.sum (imported at the top of the notebook),
# not the Python builtin.
monthly_sales_trend_df = (
    curated_df
    .withColumn("Date", parsed_sale_date)
    .withColumn("month", date_format(col("Date"), "MM"))
    .withColumn("year", date_format(col("Date"), "yyyy"))
    .groupby("Store", "month", "year")
    .agg(sum("Weekly_Sales").alias("total_sales"))
)

monthly_sales_trend_df.show()

Impact of Temperature on Sales

In [None]:
# Preview the raw rows before bucketing sales by temperature.
curated_df.show()

In [None]:
# Total weekly sales per temperature band.
#
# Temperature is converted from Fahrenheit to Celsius and then bucketed with a
# ladder of upper bounds. `when` branches are evaluated in order, so each branch
# only receives values that exceeded every earlier bound; this keeps the bands
# contiguous. The previous `between(23, 29)` / `> 30` pair left values in
# (29, 30] unmatched, which produced a null band and hid their sales.
# Rows with a null Temperature still match no branch and stay in a null band.
celsius = col("celsius_temp")
environment_type = (
    when(celsius <= 4, "Very Cold")
    .when(celsius <= 8, "Cold")
    .when(celsius <= 13, "Cool")
    .when(celsius <= 23, "Comfortable")
    .when(celsius <= 29, "Slightly Warm")
    .when(celsius > 29, "Very Hot")
)

# `sum` is pyspark.sql.functions.sum (imported at the top of the notebook).
sales_by_environment_df = (
    curated_df
    .select("Weekly_Sales", "Temperature")
    .withColumn("celsius_temp", (col("Temperature") - 32) * 5 / 9)
    .withColumn("environment_type", environment_type)
    .groupby("environment_type")
    .agg(sum("Weekly_Sales").alias("total_sales_by_environment"))
)

# Keep the result in a variable and display it, like the other analytics cells.
sales_by_environment_df.show(truncate=False)

Sales Performance by Season

In [None]:
# Map the zero-padded month string of each sale to a meteorological season.
sale_month = col("month")
season_type = (
    when(sale_month.isin('12', '01', '02'), 'Winter')
    .when(sale_month.isin('03', '04', '05'), 'Spring')
    .when(sale_month.isin('06', '07', '08'), 'Summer')
    .when(sale_month.isin('09', '10', '11'), 'Fall')
)

# Average weekly sales per store and season. The variable keeps its original
# name even though the grouping is by season rather than by year.
sales_performance_by_year_df = (
    curated_df
    .withColumn("Date", to_date(col("Date"), "dd-MM-yyyy"))
    .withColumn("month", date_format(col("Date"), "MM"))
    .withColumn("season_type", season_type)
    .groupby("Store", "season_type")
    .agg(avg("Weekly_Sales").alias("avg_sales_season_wise"))
)