In [None]:
# %pyspark
# Import libraries
spark.version
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

# %pyspark
# Load the raw POS data written by the ETL script (Hive table default.cbda_pos)
cbda_pos = spark.table("default.cbda_pos")

# Load the POS model results from CSV; the first row is treated as the header.
# No schema is supplied here, so column types are whatever the CSV reader
# assigns by default.
forecast_df = spark.read.load(
    '/user/herlt10/CBDA/pos_model_results.csv',
    format='csv',
    header='true',
)

# %pyspark
# Number of distinct retailer / state / product-hierarchy combinations in the
# raw POS data. count() returns a plain integer, so it is printed (like the
# other row counts in this notebook) instead of being handed to z.show, which
# is used elsewhere in this notebook only for DataFrames.
print(cbda_pos.select("retailer", "state", "mdlz_business", "mdlz_category", "mdlz_brand", "mdlz_ppg").distinct().count())

# %pyspark
# Number of distinct retailer / state / product-hierarchy combinations in the
# model results, for comparison with the same count on the raw POS data.
# count() returns a plain integer, so it is printed (like the other row counts
# in this notebook) instead of being handed to z.show.
print(forecast_df.select("retailer", "state", "mdlz_business", "mdlz_category", "mdlz_brand", "mdlz_ppg").distinct().count())

# %pyspark
# forecast_df transformations

# Convert to date
forecast_df = forecast_df.withColumn("week_ending_date", to_date("week_ending_date"))

# Date filter for latest available data - DATE PRINTED MUST CHANGE WITH DATA REFRESH
# A single max() aggregate yields the same value as the previous
# distinct + descending sort + first (nulls are skipped in both cases) without
# sorting every distinct date.
latest_curr_date = (
    cbda_pos.filter("retailer IS NOT NULL")
    .agg({"week_ending_date": "max"})
    .first()[0]
)
if latest_curr_date is None:
    # Without a cut-off date every later filter would silently match nothing
    # and an empty table would be written out, so stop here instead.
    raise ValueError(
        "default.cbda_pos has no non-null week_ending_date for rows with a retailer"
    )
print(latest_curr_date)

# Use concatenated ID to populate state and retailer (esp. for future weeks)
# sell_id is split on '_' and the 5th / 6th pieces (index 4 / 5) are taken as
# state / retailer; any existing state / retailer columns are overwritten.
split_col = split(forecast_df['sell_id'], '_')
forecast_df = forecast_df.withColumn('state', split_col.getItem(4))
forecast_df = forecast_df.withColumn('retailer', split_col.getItem(5))

# Preview the forecast rows from the latest actual week onwards
z.show(forecast_df.filter(col("week_ending_date") >= latest_curr_date))

# %pyspark
# Keep only forecast rows from the latest actual week onwards, i.e. the part of
# forecast_df that is not already covered by earlier weeks of cbda_pos.
# forecast1 and the two last-year columns are renamed on the way out.
forecast_no_past = forecast_df.filter(
    col("week_ending_date") >= latest_curr_date
).select(
    "retailer",
    "state",
    "mdlz_business",
    "mdlz_category",
    "mdlz_brand",
    "mdlz_ppg",
    "week_ending_date",
    col("forecast1").alias("forecast_percent"),
    "forecast_quantity",
    col("pos_qty_ly").alias("pos_qty_ly_filled"),
    col("pos_dollar_ly").alias("pos_dollar_ly_filled"),
    "promo_uplift",
)

# Sanity check: rows without a state or retailer are expected not to exist
# (this filter did not have any effect previously) - the value printed should be 0 !
print(forecast_no_past.filter("state IS NULL OR retailer IS NULL").count()) # should be 0
forecast_no_past = forecast_no_past.filter("state IS NOT NULL AND retailer IS NOT NULL")

# Row count after filtering - was ~558K on 05/04
print(forecast_no_past.count())

z.show(forecast_no_past)

# %pyspark
# Attach the forecast columns (qty & %) to the actuals. Only the forecast rows
# for the latest actual week are joined in, so that week carries values on both
# sides and the visualization is continuous; every other actual row gets nulls.
cbda_pos_w_fore_cols = cbda_pos.join(
    forecast_no_past.filter(col("week_ending_date") == latest_curr_date),
    how="left",
    on=["retailer", "state", "mdlz_business", "mdlz_category", "mdlz_brand", "mdlz_ppg", "week_ending_date"],
)

# promo_uplift is set to a constant 0 on every row of this frame
cbda_pos_w_fore_cols = cbda_pos_w_fore_cols.withColumn("promo_uplift", lit(0))

# The forecast-side columns are carried as strings
for fore_col in ("forecast_percent", "forecast_quantity", "pos_qty_ly_filled", "pos_dollar_ly_filled"):
    cbda_pos_w_fore_cols = cbda_pos_w_fore_cols.withColumn(fore_col, col(fore_col).cast("string"))

z.show(cbda_pos_w_fore_cols)

# %pyspark
# Give the forecast rows the same columns as cbda_pos_w_fore_cols.
# The left join against actuals strictly before the latest week only serves to
# add the cbda_pos columns; the latest week itself is dropped here because it is
# already present in cbda_pos_w_fore_cols. The final select puts the columns in
# the same order as cbda_pos_w_fore_cols.
forecast_w_cbdapos_cols = (
    forecast_no_past
    .join(
        cbda_pos.filter(col("week_ending_date") < latest_curr_date),
        how="left",
        on=["retailer", "state", "mdlz_business", "mdlz_category", "mdlz_brand", "mdlz_ppg", "week_ending_date"],
    )
    .filter(col("week_ending_date") != latest_curr_date)
    .select(*cbda_pos_w_fore_cols.columns)
)

# Carry the forecast-side columns, promo_uplift and the date as strings
for fore_col in (
    "forecast_percent",
    "forecast_quantity",
    "pos_qty_ly_filled",
    "pos_dollar_ly_filled",
    "promo_uplift",
    "week_ending_date",
):
    forecast_w_cbdapos_cols = forecast_w_cbdapos_cols.withColumn(fore_col, col(fore_col).cast("string"))

z.show(forecast_w_cbdapos_cols)

# %pyspark
# Stack actuals (with forecast columns) on top of the future forecast rows.
# union() matches columns by position; forecast_w_cbdapos_cols was selected in
# cbda_pos_w_fore_cols' column order for exactly this reason.
cbda_pos_w_forecasts = cbda_pos_w_fore_cols.union(forecast_w_cbdapos_cols).sort(["retailer", "state", "mdlz_business", "mdlz_category", "mdlz_brand", "mdlz_ppg", "week_ending_date"])

# Pull in LY numbers from 2019 corresponding to future weeks (i.e. April 4, 2019 and beyond till August 2019)
# The value from cbda_pos wins; the forecast file's value is used only where it is null.
cbda_pos_w_forecasts = cbda_pos_w_forecasts.withColumn("pos_qty_ly", coalesce(col("pos_qty_ly"), col("pos_qty_ly_filled")).cast('double'))
cbda_pos_w_forecasts = cbda_pos_w_forecasts.withColumn("pos_dollar_ly", coalesce(col("pos_dollar_ly"), col("pos_dollar_ly_filled")).cast('double'))
cbda_pos_w_forecasts = cbda_pos_w_forecasts.drop("pos_qty_ly_filled", "pos_dollar_ly_filled")

# For Tableau visualization - Fill NAs with 0 for promo_uplift
# coalesce() is used instead of fillna(0, ...): fillna with a numeric value
# skips columns that are not numeric, and promo_uplift is cast to string on the
# forecast side before the union, so the numeric fill would leave the nulls in
# place. coalesce fills the nulls whichever type the column ends up with.
cbda_pos_w_forecasts = cbda_pos_w_forecasts.withColumn("promo_uplift", coalesce(col("promo_uplift"), lit(0)))

# Correct Candy Seasonal - can be added back into Python model - do not update the date since it represents the seasonality of the category
# Wherever a Candy Seasonal row has a forecast, both the forecast quantity and
# the promo uplift are zeroed. The condition still holds for the second
# withColumn because the zeroed forecast_quantity is non-null.
candy_seasonal_with_forecast = expr("mdlz_category IN ('Candy Seasonal') AND (forecast_quantity IS NOT NULL)")
cbda_pos_w_forecasts = (
    cbda_pos_w_forecasts
    .withColumn("forecast_quantity", when(candy_seasonal_with_forecast, lit(0)).otherwise(col("forecast_quantity")))
    .withColumn("promo_uplift", when(candy_seasonal_with_forecast, lit(0)).otherwise(col("promo_uplift")))
)

# For Tableau visualization - pull in actuals for forecasts field for the latest actual week
cbda_pos_w_forecasts_out = cbda_pos_w_forecasts.withColumn("forecast_quantity", when(col("week_ending_date") == latest_curr_date, col("pos_qty_ty")).otherwise(col("forecast_quantity")))

z.show(cbda_pos_w_forecasts_out)

# %pyspark
# Total number of rows in the combined actuals + forecast frame. count()
# returns a plain integer, so it is printed (like the other row counts in this
# notebook) instead of being handed to z.show.
print(cbda_pos_w_forecasts.count())

# %pyspark
# Publish the result as a Hive table for Tableau.
# The frame is exposed as a temp view, then the target table is dropped and
# recreated from that view. NOTE: the table does not exist between the two
# statements, and stays missing if the create step fails.
cbda_pos_w_forecasts_out.createOrReplaceTempView("cbda_pos_w_forecasts")
for statement in (
    "drop table if exists default.cbda_pos_w_forecasts",
    "create table default.cbda_pos_w_forecasts as select * from cbda_pos_w_forecasts",
):
    spark.sql(statement)