In [0]:
# %pip install pandas requests  # (if needed)
from pyspark.sql import functions as F
from datetime import date

# Paths (use FileStore so you can inspect files if needed)
PROJECT_BASE = "dbfs:/FileStore/germany_import_cost_fx_impact"
BRONZE_BASE  = f"{PROJECT_BASE}/bronze"

# Path under the bronze base for the ECB FX rates dataset.
# NOTE(review): none of the cells visible in this notebook read this path — confirm
# whether it is still needed.
ECB_DELTA_PATH = f"{BRONZE_BASE}/ecb_fx_rates"
# Backward-compatible alias for the original mixed-case spelling; prefer ECB_DELTA_PATH.
ECb_DELTA_PATH = ECB_DELTA_PATH

# Imports from scripts
import sys, os
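# In Databricks Repos the repo root is usually already on sys.path; otherwise add it manually.
# Because the imports below are `from scripts...`, append the directory that CONTAINS `scripts/`:
# sys.path.append("/Workspace/Repos/<your_repo>")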
from scripts.api_clients import fetch_ecb_fx_daily
from scripts.utils import with_ingest_meta, write_delta


In [0]:
# Currencies to request. Defined once so the request and the completeness check
# below cannot drift apart.
FX_SYMBOLS = ("USD", "JPY", "CNY")

# Pull the FX observations for the requested currencies and date window.
# The assertions below treat the result as a pandas DataFrame with at least
# `currency` and `fx_rate` columns.
pdf = fetch_ecb_fx_daily(
    symbols=FX_SYMBOLS,
    start="2018-01-01",
    end="2024-12-31"
)

# Fail fast if the set of returned currencies differs from what was requested,
# and name the offenders so the failure is actionable.
expected_currencies = set(FX_SYMBOLS)
returned_currencies = set(pdf['currency'].unique())
assert returned_currencies == expected_currencies, (
    "Currency mismatch: "
    f"missing={sorted(map(str, expected_currencies - returned_currencies))}, "
    f"unexpected={sorted(map(str, returned_currencies - expected_currencies))}"
)
assert pdf['fx_rate'].notna().any(), "No FX values returned."

# Hand over to Spark and tag the rows with their source identifier.
# NOTE(review): `with_ingest_meta` is a project helper; presumably it adds
# ingestion-metadata columns — confirm in scripts/utils.
sdf = spark.createDataFrame(pdf)
sdf = with_ingest_meta(sdf, src="ECB_SDMX_EXR_D_SP00_A")
display(sdf.limit(10))


In [0]:
from pyspark.sql import functions as F

# 1) Quick profiling (see what actually failed).
#    Displayed before any assertion so a failing run still shows the per-currency
#    row count, null count and observed min/max.
dq_profile = (sdf.groupBy("currency")
    .agg(
        F.count("*").alias("rows"),
        F.sum(F.when(F.col("fx_rate").isNull(), 1).otherwise(0)).alias("nulls"),
        F.min("fx_rate").alias("min_fx"),
        F.max("fx_rate").alias("max_fx")
    )
)
display(dq_profile)

# 2) Currency-aware range checks (prices are units of target currency per 1 EUR)
#    USD ≈ 0.8–1.3, CNY ≈ 6–9, JPY ≈ 100–200 in 2018–2024
#    The bounds below are deliberately wider than those typical values so that
#    only grossly wrong rates are flagged.
ranges = {
    "USD": (0.5, 2.0),
    "CNY": (5.0, 12.0),
    "JPY": (80.0, 300.0),
}

# Build ONE predicate that is true when a row's rate falls outside the band of
# its own currency, then filter once. This replaces a per-currency filter +
# unionByName chain and guarantees `violations` is always a DataFrame (possibly
# empty), so no None / truthiness check is needed afterwards.
out_of_range = F.lit(False)
for cur, (lo, hi) in ranges.items():
    out_of_range = out_of_range | (
        (F.col("currency") == cur)
        & ((F.col("fx_rate") < lo) | (F.col("fx_rate") > hi))
    )

# Null rates are excluded here; they are judged separately in step 4.
violations = sdf.filter(F.col("fx_rate").isNotNull() & out_of_range)

# 3) Assert on non-positive values only; range violations raise a clearer message
assert sdf.filter((F.col("fx_rate") <= 0) & F.col("fx_rate").isNotNull()).count() == 0, \
    "Found non-positive fx_rate values."

if violations.count() > 0:
    raise AssertionError("FX rates outside expected currency-specific ranges. Inspect 'violations' DF.")

# 4) Nulls are acceptable at Bronze (holidays, missing obs). We'll handle them in Silver via monthly avg.
#    Still, more than 10% nulls for any single currency is treated as a failure.
null_rate_ok = (dq_profile.withColumn("null_rate", F.col("nulls")/F.col("rows"))
                .filter("null_rate > 0.1").count() == 0)
assert null_rate_ok, "More than 10% nulls for at least one currency—check source/params."


In [0]:
# Cell 4 — write as a managed table (no FileStore)
from pyspark.sql import functions as F

# Make sure the target schema (database) exists in the default metastore
spark.sql("CREATE DATABASE IF NOT EXISTS fx_impact")

# Configure the writer first (Delta format, overwrite mode, partitioned by
# currency), then persist it as a managed table.
bronze_writer = sdf.write.format("delta").mode("overwrite").partitionBy("currency")
bronze_writer.saveAsTable("fx_impact.bronze_ecb_fx_rates")

print("✅ Wrote managed Delta table: fx_impact.bronze_ecb_fx_rates")



In [0]:
# Cell 5 — Read-back & smoke tests

from pyspark.sql import functions as F

# Re-read the table that was just written so the checks run against stored data.
bronze_fx = spark.table("fx_impact.bronze_ecb_fx_rates")

# Per-currency coverage: earliest date, latest date and row count.
coverage_aggs = [
    F.min("date").alias("min_date"),
    F.max("date").alias("max_date"),
    F.count("*").alias("rows"),
]
coverage_by_currency = (
    bronze_fx.groupBy("currency")
             .agg(*coverage_aggs)
             .orderBy("currency")
)
display(coverage_by_currency)

# The ten most recent rows, newest first, as a quick eyeball check.
latest_rows = bronze_fx.orderBy(F.col("date").desc()).limit(10)
display(latest_rows)
