In [0]:
# COVID impact analysis (Pre vs During vs Post)
# Inputs:
#   fx_impact.gold_monthly_metrics
#   fx_impact.gold_monthly_totals
# Outputs (managed Delta):
#   fx_impact.gold_covid_period_kpis_total     -- overall EUR KPIs by period
#   fx_impact.gold_covid_period_kpis_by_cmd    -- per-commodity EUR KPIs + deltas
#   fx_impact.gold_covid_top_movers            -- ranked changes by commodity


In [0]:
# Cell 1 — Imports & parameters
from pyspark.sql import functions as F

# Period cutoffs: inclusive upper bounds, written as ISO yyyy-MM-dd strings.
# Cell 2 turns them into dates with F.to_date and labels each month by
# comparing it against PRE_END first, then DUR_END; everything later is Post.
PRE_END    = "2019-12-31"   # Pre-COVID period: <= 2019-12-31
DUR_END    = "2021-12-31"   # During-COVID:   2020-01-01 .. 2021-12-31
# Post-COVID:                >= 2022-01-01

# Scope switches for the month calendar built in Cell 2:
#   INCLUDE_2025_YTD      -- when False, Cell 2 drops every month whose year is > 2024.
#   STRICT_POST_2018_2024 -- when True, Cell 2 drops Post-COVID months from 2025 onwards.
# NOTE(review): the "2018" in STRICT_POST_2018_2024 looks like a misnomer (the
# Post window it enforces is 2022..2024); the name is left unchanged because
# Cell 2 references it.
INCLUDE_2025_YTD = True     # keep Post period as 2022..max(month), including 2025 YTD
STRICT_POST_2018_2024 = False  # set True if you want Post fixed as 2022..2024 only


In [0]:
# Cell 2 — Month flags (period labels + YTD)
# Builds the month calendar and registers it as the temp view
# `dim_months_perioded` (columns: month, covid_period, year, is_2025).
# Use months present in gold_monthly_totals as the master calendar for Gold.
#
# NULL months are dropped up front: a NULL fails both `<=` comparisons below and
# would otherwise fall through to the "Post-COVID" branch, inflating that
# period's month count in the coverage checks even though an equi-join on
# `month` can never match it.
m = (spark.table("fx_impact.gold_monthly_totals")
  .select("month")
  .filter(F.col("month").isNotNull())
  .distinct()
)

# Inclusive period boundaries as date expressions (built once, reused below).
pre_cutoff = F.to_date(F.lit(PRE_END))
during_cutoff = F.to_date(F.lit(DUR_END))

perioded = (m
  .withColumn("covid_period",
      # Branch order matters: anything not Pre is tested against the During cutoff.
      F.when(F.col("month") <= pre_cutoff, "Pre-COVID")
       .when(F.col("month") <= during_cutoff, "During COVID")
       .otherwise("Post-COVID"))
  .withColumn("year", F.year("month"))
  .withColumn("is_2025", (F.year("month") == 2025).cast("int"))
)

# Optionally cap the Post period at 2024 (removes Post-COVID months from 2025 on).
if STRICT_POST_2018_2024:
    perioded = perioded.filter(~((F.col("covid_period")=="Post-COVID") & (F.col("year") >= 2025)))

# Optionally exclude the partial 2025 year (keeps only months up to 2024).
if not INCLUDE_2025_YTD:
    perioded = perioded.filter(F.col("year") <= 2024)

perioded.createOrReplaceTempView("dim_months_perioded")

# Coverage printouts: overall month range, then the range and count per period.
spark.sql("SELECT MIN(month) min_m, MAX(month) max_m, COUNT(*) n FROM dim_months_perioded").show()
spark.sql("SELECT covid_period, MIN(month) min_m, MAX(month) max_m, COUNT(*) n FROM dim_months_perioded GROUP BY covid_period ORDER BY min_m").show()


In [0]:
# Cell 3 — Overall KPIs by period (totals)

# Temp view `kpi_total`: one row per covid_period holding the number of joined
# rows (`months`) and the averages of total_import_eur, total_import_usd and
# avg_usd_per_eur taken from fx_impact.gold_monthly_totals.
_KPI_TOTAL_VIEW_SQL = """
CREATE OR REPLACE TEMP VIEW kpi_total AS
SELECT
  d.covid_period,
  COUNT(*)                           AS months,
  AVG(t.total_import_eur)            AS avg_monthly_eur,
  AVG(t.total_import_usd)            AS avg_monthly_usd,
  AVG(t.avg_usd_per_eur)             AS avg_usd_per_eur
FROM dim_months_perioded d
JOIN fx_impact.gold_monthly_totals t USING (month)
GROUP BY d.covid_period
"""

# Output table: pivots `kpi_total` into a single row (pre/during/post average
# monthly EUR and month counts) and adds the relative change of During and Post
# against Pre. Both deltas are NULL unless pre_eur > 0.
_KPI_TOTAL_TABLE_SQL = """
CREATE OR REPLACE TABLE fx_impact.gold_covid_period_kpis_total AS
WITH p AS (
  SELECT
    MAX(CASE WHEN covid_period='Pre-COVID'    THEN avg_monthly_eur END) AS pre_eur,
    MAX(CASE WHEN covid_period='During COVID' THEN avg_monthly_eur END) AS during_eur,
    MAX(CASE WHEN covid_period='Post-COVID'   THEN avg_monthly_eur END) AS post_eur,
    MAX(CASE WHEN covid_period='Pre-COVID'    THEN months END) AS pre_m,
    MAX(CASE WHEN covid_period='During COVID' THEN months END) AS during_m,
    MAX(CASE WHEN covid_period='Post-COVID'   THEN months END) AS post_m
  FROM kpi_total
)
SELECT
  pre_eur, during_eur, post_eur,
  pre_m, during_m, post_m,
  CASE WHEN pre_eur > 0 THEN (during_eur - pre_eur) / pre_eur END AS delta_during_vs_pre_pct,
  CASE WHEN pre_eur > 0 THEN (post_eur   - pre_eur) / pre_eur END AS delta_post_vs_pre_pct
FROM p
"""

# The view must exist before the table statement that reads from it.
for _statement in (_KPI_TOTAL_VIEW_SQL, _KPI_TOTAL_TABLE_SQL):
    spark.sql(_statement)

# Print the freshly written table for a visual check.
_kpi_total_preview = spark.sql("SELECT * FROM fx_impact.gold_covid_period_kpis_total")
_kpi_total_preview.show()


In [0]:
# Cell 4 — Per-commodity KPIs & deltas

# Temp view `kpi_by_cmd`: for each (covid_period, cmdCode) the average of
# import_eur over the joined rows of fx_impact.gold_monthly_metrics; cmdDesc
# falls back to cmdCode when no description is available.
# NOTE(review): AVG only covers rows that exist in gold_monthly_metrics, so a
# month without a row for a commodity is not counted as zero — confirm that
# this is the intended meaning of "average monthly".
_KPI_BY_CMD_VIEW_SQL = """
CREATE OR REPLACE TEMP VIEW kpi_by_cmd AS
SELECT
  m.covid_period,
  g.cmdCode,
  COALESCE(MAX(g.cmdDesc), g.cmdCode) AS cmdDesc,
  AVG(g.import_eur)                   AS avg_monthly_eur_cmd
FROM dim_months_perioded m
JOIN fx_impact.gold_monthly_metrics g USING (month)
GROUP BY m.covid_period, g.cmdCode
"""

# Output table: one row per cmdCode with the pre/during/post averages side by
# side, their absolute change versus Pre, and the relative change versus Pre.
# Relative changes are NULL unless pre_eur > 0; absolute changes are NULL when
# either side of the subtraction is missing.
_KPI_BY_CMD_TABLE_SQL = """
CREATE OR REPLACE TABLE fx_impact.gold_covid_period_kpis_by_cmd AS
WITH p AS (
  SELECT
    cmdCode,
    MAX(cmdDesc) AS cmdDesc,
    MAX(CASE WHEN covid_period='Pre-COVID'    THEN avg_monthly_eur_cmd END) AS pre_eur,
    MAX(CASE WHEN covid_period='During COVID' THEN avg_monthly_eur_cmd END) AS during_eur,
    MAX(CASE WHEN covid_period='Post-COVID'   THEN avg_monthly_eur_cmd END) AS post_eur
  FROM kpi_by_cmd
  GROUP BY cmdCode
)
SELECT
  cmdCode, cmdDesc,
  pre_eur, during_eur, post_eur,
  (during_eur - pre_eur)                 AS abs_change_during_vs_pre,
  (post_eur   - pre_eur)                 AS abs_change_post_vs_pre,
  CASE WHEN pre_eur > 0 THEN (during_eur - pre_eur)/pre_eur END AS pct_change_during_vs_pre,
  CASE WHEN pre_eur > 0 THEN (post_eur   - pre_eur)/pre_eur END AS pct_change_post_vs_pre
FROM p
"""

# The view must exist before the table statement that reads from it.
for _statement in (_KPI_BY_CMD_VIEW_SQL, _KPI_BY_CMD_TABLE_SQL):
    spark.sql(_statement)

# Preview: the ten commodities with the largest absolute Post-vs-Pre increase.
_kpi_by_cmd_preview = spark.sql("SELECT * FROM fx_impact.gold_covid_period_kpis_by_cmd ORDER BY abs_change_post_vs_pre DESC NULLS LAST LIMIT 10")
_kpi_by_cmd_preview.show(truncate=False)


In [0]:
# Cell 5 — Top movers (ranked tables for BI)

# Output table: every commodity from gold_covid_period_kpis_by_cmd with four
# dense ranks over the whole table — Post-vs-Pre change, absolute and relative,
# each ranked both descending ("up") and ascending ("down").
# Rows whose change is NULL sort last in every ranking but still receive a rank.
_TOP_MOVERS_TABLE_SQL = """
CREATE OR REPLACE TABLE fx_impact.gold_covid_top_movers AS
SELECT
  cmdCode, cmdDesc,
  abs_change_during_vs_pre, pct_change_during_vs_pre,
  abs_change_post_vs_pre,   pct_change_post_vs_pre,
  DENSE_RANK() OVER (ORDER BY abs_change_post_vs_pre DESC NULLS LAST)  AS rank_post_abs_up,
  DENSE_RANK() OVER (ORDER BY abs_change_post_vs_pre ASC  NULLS LAST)  AS rank_post_abs_down,
  DENSE_RANK() OVER (ORDER BY pct_change_post_vs_pre  DESC NULLS LAST) AS rank_post_pct_up,
  DENSE_RANK() OVER (ORDER BY pct_change_post_vs_pre  ASC  NULLS LAST) AS rank_post_pct_down
FROM fx_impact.gold_covid_period_kpis_by_cmd
"""
spark.sql(_TOP_MOVERS_TABLE_SQL)

# Preview: commodities ranked in the first five by absolute increase or by
# absolute decrease, listed in order of the "up" rank.
_TOP_MOVERS_PREVIEW_SQL = """
SELECT * FROM fx_impact.gold_covid_top_movers
WHERE rank_post_abs_up <= 5 OR rank_post_abs_down <= 5
ORDER BY rank_post_abs_up, rank_post_abs_down
"""
_top_movers_preview = spark.sql(_TOP_MOVERS_PREVIEW_SQL)
_top_movers_preview.show(truncate=False)


In [0]:
# Cell 6 — Quick QA
# Period coverage sanity: month count per period, listed chronologically
# (ordering by the label alone would sort During / Post / Pre alphabetically).
spark.sql("""
SELECT covid_period, COUNT(*) months, MIN(month) first_month
FROM dim_months_perioded GROUP BY covid_period ORDER BY first_month
""").show()

# Check that KPIs by cmd have usable pre values (guard divisions).
# The percentage columns in gold_covid_period_kpis_by_cmd are only computed
# when pre_eur > 0, so both a missing baseline (NULL) and a zero/negative
# baseline leave them NULL; report the two cases separately next to the total.
spark.sql("""
SELECT
  COUNT(*)                                    AS commodities,
  COUNT(CASE WHEN pre_eur IS NULL THEN 1 END) AS missing_pre,
  COUNT(CASE WHEN pre_eur <= 0    THEN 1 END) AS nonpositive_pre
FROM fx_impact.gold_covid_period_kpis_by_cmd
""").show()
