In [0]:
%sql
-- Make cscie103_catalog.final_project the current catalog/schema for this session.
USE cscie103_catalog.final_project

In [0]:
from pyspark.sql import functions as F

In [0]:

# Read the four source tables as Spark DataFrames: the bronze store table,
# the gold daily store/family table, and the silver test and validation
# prediction tables.
stores_df, gold_df, test_df, val_df = (
    spark.table(table_name)
    for table_name in (
        "cscie103_catalog.final_project.bronze_stores",
        "cscie103_catalog.final_project.gold_daily_store_family",
        "cscie103_catalog.final_project.silver_test_predictions",
        "cscie103_catalog.final_project.silver_validation_predictions",
    )
)

In [0]:
# Attach the store table's columns (presumably the source of `city`) to each
# prediction frame. The left join on store_nbr keeps every prediction row even
# when its store number has no match in stores_df.
test_city_df, val_city_df = (
    predictions_df.join(stores_df, on='store_nbr', how='left')
    for predictions_df in (test_df, val_df)
)

In [0]:
# Actuals: keep the key columns of the gold table, expose `sales` under the
# shared name `actual_sales`, and tag every row with data_type = "Actual".
actual_sales_df = (
    gold_df
    .select("date", "store_nbr", "family", "city", "sales")
    .withColumnRenamed("sales", "actual_sales")
    .withColumn("data_type", F.lit("Actual"))
)

In [0]:
# Test forecasts in the same column layout as the actuals. The predicted value
# is carried under `actual_sales` so the frames can be unioned by name, and
# data_type is built as "Forecast - <scenario>" before `scenario` is dropped.
test_predictions_df = (
    test_city_df
    .select("date", "store_nbr", "family", "city", "predicted_sales", "scenario")
    .withColumn("data_type", F.concat(F.lit("Forecast - "), F.col("scenario")))
    .withColumnRenamed("predicted_sales", "actual_sales")
    .drop("scenario")
)
# test_predictions_df.show()

In [0]:
# Validation forecasts in the same column layout as the actuals: the predicted
# value is carried under `actual_sales` and every row is tagged "Validation".
validation_predictions_df = (
    val_city_df
    .select("date", "store_nbr", "family", "city", "predicted_sales")
    .withColumnRenamed("predicted_sales", "actual_sales")
    .withColumn("data_type", F.lit("Validation"))
)
# validation_predictions_df.show()


In [0]:
# Stack actuals, test forecasts and validation forecasts (columns matched by
# name), then order the combined rows.
stacked_df = actual_sales_df.unionByName(test_predictions_df).unionByName(validation_predictions_df)
sales_and_forecast_df = stacked_df.orderBy("date", "family", "city", "store_nbr", "data_type")
# sales_and_forecast_df.show()


In [0]:
# Sanity check: number of rows in the combined actual + forecast + validation frame.
sales_and_forecast_df.count()

In [0]:
# Sanity check: earliest and latest date included in each data_type.
# The min/max is computed in Spark so only one row per data_type reaches the
# driver, instead of collecting the whole combined dataset with toPandas().
(
    sales_and_forecast_df
    .groupBy("data_type")
    .agg(F.min("date").alias("min"), F.max("date").alias("max"))
    .orderBy("data_type")
    .toPandas()
    .set_index("data_type")
)

In [0]:
# Most recent date anywhere in the combined data (actuals, forecasts and
# validation rows); it defines the year-to-date cutoff.
latest_date = sales_and_forecast_df.agg(F.max('date')).first()[0]
if latest_date is None:
    # max() over an empty / all-null date column yields None; fail with a clear
    # message instead of an AttributeError on `.year` below.
    raise ValueError("sales_and_forecast_df contains no dates; cannot derive the year-to-date window")

# Components of the cutoff date
latest_year = latest_date.year
latest_month = latest_date.month
latest_day = latest_date.day

# Year-to-date window: any earlier month, or the cutoff month up to and
# including the cutoff day. Cutting on the day (not just the month) keeps the
# prior year from contributing a full month when the latest year only has a
# partial one, which would understate the year-over-year growth.
month_of_date = F.month(F.col('date'))
within_ytd_window = (month_of_date < latest_month) | (
    (month_of_date == latest_month) & (F.dayofmonth(F.col('date')) <= latest_day)
)

# Keep the latest year and the one before it, restricted to the YTD window,
# then total the sales per year / store / family / city / data_type.
combined_sales_by_segments = (
    sales_and_forecast_df
    .filter((F.year(F.col('date')) >= latest_year - 1) & within_ytd_window)
    .groupBy(
        F.year(F.col('date')).alias('year'),
        'store_nbr',
        'family',
        'city',
        'data_type',
    )
    .agg(F.sum('actual_sales').alias('total_sales'))
)

# display(combined_sales_by_segments)

In [0]:
# Sanity check: which data_type labels are present after the year-to-date filter.
# De-duplicate in Spark first so only the distinct labels are collected,
# rather than pulling the whole aggregated frame to the driver.
combined_sales_by_segments.select('data_type').distinct().toPandas()['data_type'].unique()

In [0]:
# combined_sales_by_segments.toPandas()[['year', 'month']].drop_duplicates().sort_values(by=['year', 'month'])

In [0]:
from pyspark.sql import functions as F

# Scenario name -> data_type labels whose totals are added together for it.
# Each scenario combines the actuals with one forecast variant.
scenarios = [
    # ('Actual', ['Actual']),
    ('1x_promos', ['Actual', 'Forecast - current']),
    ('2x_promos', ['Actual', 'Forecast - onpromotion_2x'])
]

# One aggregated frame per scenario: keep only the scenario's data_types,
# re-total per year / store / family / city, and tag rows with the scenario name.
scenario_datasets = []

for scenario_name, data_types in scenarios:
    scenario_df = (
        combined_sales_by_segments
        .filter(F.col('data_type').isin(data_types))
        .groupBy('year', 'store_nbr', 'family', 'city')
        .agg(F.sum('total_sales').alias('total_sales'))
        .withColumn('scenario', F.lit(scenario_name))
    )
    scenario_datasets.append(scenario_df)

# Stack all scenarios into one dataset. unionByName (the same call used to
# build sales_and_forecast_df) matches columns by name, so the result does not
# depend on every scenario frame listing its columns in the same order.
final_dataset = scenario_datasets[0]
for df in scenario_datasets[1:]:
    final_dataset = final_dataset.unionByName(df)

In [0]:
# Collect the scenario totals to pandas once; both pivots below reuse the same
# frame instead of re-running the Spark plan with a second toPandas().
scenario_totals_pd = final_dataset.toPandas()

# City level: total sales per scenario / city with one column per year.
# aggfunc='sum' is required here: pivot_table defaults to the mean, which would
# average the per-store/family totals rather than add them up.
city_growth = scenario_totals_pd.pivot_table(
    index=['scenario', 'city'], columns='year', values='total_sales', aggfunc='sum'
).reset_index()
# Year-to-date growth of the latest year over the previous one, in percent.
city_growth['ytd_growth'] = (city_growth[latest_year] / city_growth[latest_year - 1] - 1) * 100
spark.createDataFrame(city_growth).write.format("delta").mode("overwrite").saveAsTable("platinum_city_cagr")

# Family level: same calculation grouped by product family. The result keeps
# the name final_dataset_agg because the summary cell further down reads it.
final_dataset_agg = scenario_totals_pd.pivot_table(
    index=['scenario', 'family'], columns='year', values='total_sales', aggfunc='sum'
).reset_index()
final_dataset_agg['ytd_growth'] = (final_dataset_agg[latest_year] / final_dataset_agg[latest_year - 1] - 1) * 100
spark.createDataFrame(final_dataset_agg).write.format("delta").mode("overwrite").saveAsTable("platinum_family_cagr")

In [0]:
# Drop the family growth comparison table if it already exists; the cell below
# writes a table named platinum_family_cagr_delta again.
spark.sql("DROP TABLE IF EXISTS cscie103_catalog.final_project.platinum_family_cagr_delta")

In [0]:
# One row per family with the ytd_growth of each scenario side by side.
growth_by_family = final_dataset_agg.pivot_table(
    values='ytd_growth', index='family', columns='scenario'
).reset_index()

# diff = growth under doubled promotions minus growth under current promotions.
final_dataset_agg_summary = growth_by_family.assign(
    diff=growth_by_family['2x_promos'] - growth_by_family['1x_promos']
)

# Persist the comparison, largest diff first.
ranked_summary = final_dataset_agg_summary.sort_values(by='diff', ascending=False)
(
    spark.createDataFrame(ranked_summary)
    .write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("platinum_family_cagr_delta")
)