# In [None]:
# %pyspark
# Evaluate the session's version attribute. As a bare expression its value is
# only shown when the interpreter echoes expression results.
# NOTE(review): `spark` is not defined in this file — presumably injected by
# the notebook's PySpark interpreter; confirm before running as a plain script.
spark.version
# Wildcard imports: the statements below use these names unqualified
# (e.g. col, coalesce, next_day, to_date, lit, when, expr, and StructType,
# StructField, StringType).
# NOTE(review): the functions wildcard can shadow Python builtins such as
# sum/min/max; a later paragraph uses the explicit `F.sum` form.
from pyspark.sql.functions import *
from pyspark.sql.types import *


# %pyspark
# UPDATE weekly with the week-ending date of the latest COMPLETE week of
# shipments data. The current week is incomplete and future weeks of shipments
# in the actuals data are outliers, so this date is the cut-over point between
# actuals and model forecasts in the paragraphs below.
latest_curr_date = '2020-05-02'

# Fail fast on a mistyped value. The constant is edited by hand every week and
# is only ever compared against date columns as a plain string further down,
# so a malformed value would not be caught there. Round-tripping through
# strptime/strftime enforces a real calendar date in zero-padded YYYY-MM-DD.
import datetime as _dt

if _dt.datetime.strptime(latest_curr_date, '%Y-%m-%d').strftime('%Y-%m-%d') != latest_curr_date:
    raise ValueError(
        "latest_curr_date must be a zero-padded YYYY-MM-DD date, got %r" % (latest_curr_date,)
    )


# %pyspark
# Week-number -> week-ending lookup. STATIC file covering only the 2020 weeks
# up to Aug 8th (scope of the project); it does not need to be refreshed.
dates = spark.read.format('parquet').load("/user/bwn2456/CBDA/week_to_we_map.parquet")

# Raw shipments data.
cbda_ship = spark.table("default.cbda_ship")

# The inner join restricts shipments to the weeks present in the lookup (model
# forecasts are only considered up to the first week of August). The lookup's
# date is carried as "date_ref" and used to fill a null "date" in cbda_ship
# (records without corresponding 2020 sales).
week_lookup = dates.withColumnRenamed("date", "date_ref")
cbda_ship = cbda_ship.join(week_lookup, on='weekOfYear', how='inner')
cbda_ship = cbda_ship.withColumn("date", coalesce(col("date"), col("date_ref")))

# Shipments model forecasts.
forecast_reader = spark.read.format('parquet').option('header', 'true')
forecast_df = forecast_reader.load('/user/bwn2456/CBDA/ship_model_results.parquet.gzip')
# Move each forecast date to the following Saturday.
# NOTE(review): next_day returns the first matching weekday strictly AFTER the
# input, so a value already on a Saturday moves a full week forward — confirm
# the source dates are never Saturdays.
forecast_df = forecast_df.withColumn(
    "week_ending_date", next_day("week_ending_date", "saturday")
)
print(forecast_df.count())
#z.show(forecast_df)


# %pyspark
from pyspark.sql.types import *

# Lookup from distribution-channel code (bic_zdistr_ch) to its description.
# Both columns are nullable strings; extend channel_rows to add a channel.
channel_schema = StructType([
    StructField('bic_zdistr_ch', StringType(), True),
    StructField('channel_desc', StringType(), True),
])
channel_rows = [
    ("01", "Distribution Channel 01"),
    ("10", "Other"),
    ("11", "DSD Bis Intercompany"),
    ("12", "DSD Pizza Intercomp"),
    ("20", "Warehouse/Exports"),
    ("30", "Foodservice"),
    ("40", "DSD Pizza"),
    ("45", "DSD"),
    ("50", "KFI"),
    ("55", "Plant Ingredient"),
    ("60", "Imports"),
    ("65", "Bulk FS - Specialty"),
]
channel_mapping = spark.createDataFrame(channel_rows, channel_schema)
#z.show(channel_mapping)


# %pyspark
# Distinct (retailer id, retailer description) pairs taken from the aggregated
# raw shipments table.
agg_raw = spark.table("default.raw_ship_agg")
retailer_pairs = agg_raw.select("ac_scbm_id", "ac_scbm_desc")
retailer_map = retailer_pairs.distinct()


# %pyspark
# Align the forecast column names with the shipments naming, then attach the
# channel description. The three printed counts (raw, renamed, joined) are
# sanity checks; each one runs a Spark job.
print(forecast_df.count())

# (forecast column, shipments column) pairs, applied in this order.
column_renames = [
    ("week_ending_date", "date"),
    ("retailer", "ac_scbm_id"),
    ("retailer_desc", "ac_scbm_desc"),
    ("mdlz_business", "management_grouping_desc"),
    ("mdlz_category", "product_category_name"),
    ("mdlz_brand", "product_family_desc"),
    ("mdlz_ppg", "promoted_product_group"),
    ("pos_qty", "sales_pounds"),
    ("pos_dollar", "gross_sales"),
    ("week_of_year", "weekOfYear"),
    ("channel", "bic_zdistr_ch"),
]
forecast_df_cols = forecast_df
for old_name, new_name in column_renames:
    forecast_df_cols = forecast_df_cols.withColumnRenamed(old_name, new_name)
print(forecast_df_cols.count())

# Left join: forecast rows with a channel code missing from the mapping are
# kept, with a null channel_desc.
forecast_joined = forecast_df_cols.join(channel_mapping, on=['bic_zdistr_ch'], how='left')
print(forecast_joined.count())
# z.show(forecast_joined)


# %pyspark
# Replace the forecast "date" column with its to_date() conversion.
forecast_joined = forecast_joined.withColumn("date", to_date(col("date")))


# %pyspark
# Keep only the forecast rows dated on or after the latest complete actuals
# week, i.e. the part that does not overlap the past data in cbda_ship except
# for the cut-over week itself.
forecast_keep_cols = [
    "ac_scbm_id", "ac_scbm_desc", "state", "management_grouping_desc",
    "product_category_name", "product_segment_name", "product_family_desc",
    "promoted_product_group_desc", "promoted_product_group", "bic_zdistr_ch",
    "date", "forecast1", "forecast_quantity", "pos_qty_ly", "pos_dollar_ly",
    "promo_uplift", "channel_desc",
]
forecast_renames = [
    ("forecast1", "forecast_percent"),
    ("pos_qty_ly", "pos_qty_ly_filled"),
    ("pos_dollar_ly", "pos_dollar_ly_filled"),
]
forecast_no_past = forecast_joined.filter(col("date") >= latest_curr_date).select(*forecast_keep_cols)
for old_name, new_name in forecast_renames:
    forecast_no_past = forecast_no_past.withColumnRenamed(old_name, new_name)

# Rows missing state or retailer id are dropped. The filter had no effect on
# earlier runs, so the first printed value should be 0.
#print(forecast_no_past.filter("state IS NULL OR ac_scbm_id IS NULL").head())
rows_missing_keys = forecast_no_past.filter("state IS NULL OR ac_scbm_id IS NULL")
print(rows_missing_keys.count())  # should be 0
forecast_no_past = forecast_no_past.filter("state IS NOT NULL AND ac_scbm_id IS NOT NULL")

# Number of remaining records is ~1.3M on 5/02 data.
print(forecast_no_past.count())


# %pyspark
# Add the forecast columns (quantity and %) to the actuals. Only the forecast
# rows for the latest actual week are joined, so that week carries values in
# both series and the visualization is continuous across the cut-over.
actuals_through_latest = cbda_ship.filter(col("date") <= latest_curr_date)
forecast_at_latest = forecast_no_past.filter(col("date") == latest_curr_date)

# The key order below also fixes the leading column order of the result.
actuals_join_keys = [
    "ac_scbm_id", "ac_scbm_desc", "state", "management_grouping_desc",
    "product_category_name", "product_segment_name", "product_family_desc",
    "promoted_product_group_desc", "promoted_product_group", "bic_zdistr_ch",
    "channel_desc", "date",
]
cbda_ship_w_fore_cols = actuals_through_latest.join(
    forecast_at_latest, on=actuals_join_keys, how="left"
)

# promo_uplift is set to a constant 0 on every actuals row; this replaces any
# value the join brought in from the forecast side.
cbda_ship_w_fore_cols = cbda_ship_w_fore_cols.withColumn("promo_uplift", lit(0))


# %pyspark
# Prepare the future forecast rows for the union by bringing in the additional
# shipments columns. The left side holds dates strictly after the cut-over and
# the right side dates strictly before it, and "date" is a join key, so no
# right row can match: the left join only contributes the shipments columns,
# filled with nulls.
future_forecasts = forecast_no_past.filter(col("date") > latest_curr_date)
past_actuals = cbda_ship.filter(col("date") < latest_curr_date)
union_prep_keys = [
    "ac_scbm_id", "ac_scbm_desc", "state", "management_grouping_desc",
    "product_category_name", "product_segment_name", "product_family_desc",
    "promoted_product_group_desc", "promoted_product_group", "bic_zdistr_ch",
    "channel_desc", "date",
]
forecast_w_cbdaship_cols = future_forecasts.join(past_actuals, on=union_prep_keys, how="left")
forecast_w_cbdaship_cols = forecast_w_cbdaship_cols.filter(col("date") != latest_curr_date)
# Reorder to the actuals frame's column order, because the union that follows
# matches columns by position.
forecast_w_cbdaship_cols = forecast_w_cbdaship_cols.select(*cbda_ship_w_fore_cols.columns)


# %pyspark
# Stack actuals (through the cut-over week) on top of the future forecasts and
# order the rows by the grouping columns and date.
sort_cols = [
    "ac_scbm_id", "ac_scbm_desc", "state", "management_grouping_desc",
    "product_category_name", "product_family_desc", "product_segment_name",
    "promoted_product_group_desc", "promoted_product_group", "bic_zdistr_ch",
    "channel_desc", "date",
]
cbda_ship_w_forecasts = cbda_ship_w_fore_cols.union(forecast_w_cbdaship_cols).sort(sort_cols)

# Last-year (2019) numbers for the future weeks: where the shipments value is
# null, fall back to the forecast file's last-year figure; cast to double.
ly_fallbacks = [
    ("stly_sales_pounds", "pos_qty_ly_filled"),
    ("stly_gross_sales", "pos_dollar_ly_filled"),
]
for ly_col, fallback_col in ly_fallbacks:
    cbda_ship_w_forecasts = cbda_ship_w_forecasts.withColumn(
        ly_col, coalesce(col(ly_col), col(fallback_col)).cast('double')
    )
cbda_ship_w_forecasts = cbda_ship_w_forecasts.drop("pos_qty_ly_filled", "pos_dollar_ly_filled")

# For Tableau visualization: on the latest actual week, the forecast field
# takes the actual sales_pounds value.
is_latest_week = col("date") == latest_curr_date
cbda_ship_w_forecasts = cbda_ship_w_forecasts.withColumn(
    "forecast_quantity",
    when(is_latest_week, col("sales_pounds")).otherwise(col("forecast_quantity")),
)

# For Tableau visualization: null promo_uplift becomes 0.
cbda_ship_w_forecasts = cbda_ship_w_forecasts.fillna(0, subset=['promo_uplift'])


# %pyspark
# Pull in the correct last-year numbers: the stly_* columns of the combined
# frame are dropped and re-attached straight from cbda_ship. The join is a full
# outer join, so key combinations present on only one side are kept as well.
ly_join_keys = [
    "ac_scbm_id", "ac_scbm_desc", "state", "management_grouping_desc",
    "product_category_name", "product_family_desc", "product_segment_name",
    "promoted_product_group_desc", "promoted_product_group", "bic_zdistr_ch",
    "channel_desc", "date",
]
cbda_ship_ly = cbda_ship.select(*(ly_join_keys + ["stly_sales_pounds", "stly_gross_sales"]))
without_ly = cbda_ship_w_forecasts.drop("stly_sales_pounds", "stly_gross_sales")
cbda_ship_w_forecasts_and2019Only = without_ly.join(cbda_ship_ly, on=ly_join_keys, how="outer")

# For Tableau visualization: rows introduced by the outer join have a null
# promo_uplift, so fill with 0 again.
cbda_ship_w_forecasts_and2019Only = cbda_ship_w_forecasts_and2019Only.fillna(
    0, subset=['promo_uplift']
)


# %pyspark
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql import functions as F

# Manual corrections to forecast_quantity. The first two apply only to weeks
# after the latest actual week.
is_future = col("date") > lit(latest_curr_date)

# Correct MCLANE and DOT TOTAL (states IL / AR): use last year's pounds as the
# forecast.
mclane_dot_rows = expr("(state = 'IL' or state = 'AR') AND (ac_scbm_desc = 'MCLANE' or ac_scbm_desc = 'DOT TOTAL')")
cbda_ship_w_forecasts_and2019Only = cbda_ship_w_forecasts_and2019Only.withColumn(
    "forecast_quantity",
    when(mclane_dot_rows & is_future, col("stly_sales_pounds")).otherwise(col("forecast_quantity")),
)

# Walmart / DSD channel ('45') / Family Size Oreo PPG: move the forecast
# halfway towards last year's pounds.
walmart_oreo_rows = (
    (col("bic_zdistr_ch") == '45')
    & (col("ac_scbm_desc") == "WALMART SC/DISCOUNT")
    & (col("promoted_product_group_desc") == 'FAMILY SIZE OREO PPG')
    & is_future
)
halfway_to_ly = expr("forecast_quantity - ((forecast_quantity - stly_sales_pounds)/2) ")
cbda_ship_w_forecasts_and2019Only = cbda_ship_w_forecasts_and2019Only.withColumn(
    "forecast_quantity",
    when(walmart_oreo_rows, halfway_to_ly).otherwise(col("forecast_quantity")),
)

# Correct ECOMM: for ECOMM DIRECT on the DSD channel, replace forecast_quantity
# with its running total per grouping, ordered by date (frame: all preceding
# rows up to and including the current date).
cumsum_partition_cols = [
    "ac_scbm_id", "ac_scbm_desc", "state", "management_grouping_desc",
    "product_category_name", "product_family_desc", "product_segment_name",
    "promoted_product_group_desc", "promoted_product_group", "bic_zdistr_ch",
    "channel_desc",
]
windowval = (
    Window.partitionBy(*cumsum_partition_cols)
    .orderBy('date')
    .rangeBetween(Window.unboundedPreceding, 0)
)
cbda_ship_w_forecasts_w_cumsum = cbda_ship_w_forecasts_and2019Only.withColumn(
    'cum_sum', F.sum('forecast_quantity').over(windowval)
)
ecomm_dsd_rows = expr("ac_scbm_desc = 'ECOMM DIRECT' AND channel_desc = 'DSD'")
cbda_ship_w_forecasts_and2019Only = cbda_ship_w_forecasts_w_cumsum.withColumn(
    'forecast_quantity',
    when(ecomm_dsd_rows, col("cum_sum")).otherwise(col("forecast_quantity")),
).drop("cum_sum")

# Temp fix for infinity value: exclude this one retailer/state/PPG/channel
# combination entirely.
cbda_ship_w_forecasts_and2019Only = cbda_ship_w_forecasts_and2019Only.filter(
    "NOT(ac_scbm_id = 'US3000050' AND state = 'GA' AND promoted_product_group = 'LM9' AND bic_zdistr_ch = '45')"
)


# %pyspark
# Sanity check: number of rows in the final frame (runs a Spark job).
final_row_count = cbda_ship_w_forecasts_and2019Only.count()
print(final_row_count)


# %pyspark
# Write out for Tableau: expose the final frame as a temp view, then rebuild
# the table from it (drop if present, create from the view).
# NOTE(review): the table does not exist between the two statements; if the
# create step fails, the previous table is already gone.
cbda_ship_w_forecasts_and2019Only.createOrReplaceTempView("cbda_ship_w_forecasts")
for statement in (
    "drop table if exists default.cbda_ship_w_forecasts",
    "create table default.cbda_ship_w_forecasts as select * from cbda_ship_w_forecasts",
):
    spark.sql(statement)