In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Banner marking the start of this cell's output.
print("ECB FX RATES - BRONZE TO SILVER TRANSFORMATION")

# Load the Bronze table through the notebook's active Spark session.
print("\nReading Bronze FX data...")
df_bronze = spark.table("bronze_ecb_fx")

# count() is an action: it runs a Spark job purely to report the row total,
# which is then printed with thousands separators.
bronze_row_total = df_bronze.count()
print(f"Bronze records: {bronze_row_total:,}")

print("\nStandardizing schema...")
# Lower-case every column name so the rest of the cell can rely on a single
# spelling regardless of how Bronze named its columns.
df_silver = df_bronze.toDF(*[c.lower() for c in df_bronze.columns])

# Build the null audit on the UNFILTERED frame. year(NULL) is NULL, so the
# range filter below silently drops every row with a missing date; counting
# nulls after that filter would always report 0 for "date" and hide the
# problem. DataFrames are lazy and immutable, so this plan is only executed
# by null_counts.show() further down and the printed output keeps its
# original position.
# NOTE: as a consequence, usd_eur_rate nulls are counted across all Bronze
# rows, including rows outside the 2019-2024 window.
null_counts = df_silver.select([
    count(when(col(c).isNull(), c)).alias(c)
    for c in ["date", "usd_eur_rate"]
])

# Keep only rows whose date falls in calendar years 2019-2024 (inclusive).
df_silver = df_silver.filter(
    (year("date") >= 2019) &
    (year("date") <= 2024)
)

print("\nData quality checks...")
# min/max are used here as Spark column aggregates (they come from the file's
# wildcard import of pyspark.sql.functions), not as the Python builtins.
date_range = df_silver.agg(min('date'), max('date')).collect()[0]
print(f"Date range: {date_range[0]} to {date_range[1]}")

# Row count per calendar year of the filtered data.
print("\nYear coverage:")
df_silver.groupBy(year("date").alias("year")) \
    .agg(count("*").alias("trading_days")) \
    .orderBy("year") \
    .show()

# Executes the audit defined above (pre-filter counts).
print("Null counts:")
null_counts.show()

print("\nAdding time dimensions...")
# Ordered (column name, expression) pairs applied one withColumn at a time.
# Order matters: "is_weekend" reads the "dayofweek" column added just before
# it. Spark's dayofweek numbers Sunday as 1 and Saturday as 7, hence [1, 7].
# "usd_eur_rate" already exists, so that entry replaces it with the value
# rounded to 4 decimal places.
_silver_column_specs = [
    ("year", year("date")),
    ("month", month("date")),
    ("day", dayofmonth("date")),
    ("dayofweek", dayofweek("date")),
    ("dayname", date_format("date", "EEEE")),
    ("is_weekend", when(col("dayofweek").isin([1, 7]), True).otherwise(False)),
    ("usd_eur_rate", round(col("usd_eur_rate"), 4)),
    ("ingestion_timestamp", current_timestamp()),
]
for _column_name, _column_expr in _silver_column_specs:
    df_silver = df_silver.withColumn(_column_name, _column_expr)

print("\nData validation...")
# Aggregates computed over the whole Silver frame. With no groupBy the result
# is a single row, which first() returns as a Row object.
_summary_aggregates = [
    count("*").alias("total_days"),
    round(avg("usd_eur_rate"), 4).alias("avg_rate"),
    round(min("usd_eur_rate"), 4).alias("min_rate"),
    round(max("usd_eur_rate"), 4).alias("max_rate"),
    min("date").alias("first_date"),
    max("date").alias("last_date"),
]
summary = df_silver.agg(*_summary_aggregates).first()

# Row fields are read by attribute; the aliases above define their names.
print(f"   Total trading days: {summary.total_days}")
print(f"   Average rate: {summary.avg_rate}")
print(f"   Min rate: {summary.min_rate}")
print(f"   Max rate: {summary.max_rate}")
print(f"   Date range: {summary.first_date} to {summary.last_date}")

print("\nYear-by-year summary:")
# Per-year row count plus average / minimum / maximum rate (each rounded to
# 4 decimals), sorted by year and printed as a table.
_yearly_summary = (
    df_silver
    .groupBy("year")
    .agg(
        count("*").alias("trading_days"),
        round(avg("usd_eur_rate"), 4).alias("avg_rate"),
        round(min("usd_eur_rate"), 4).alias("min_rate"),
        round(max("usd_eur_rate"), 4).alias("max_rate"),
    )
    .orderBy("year")
)
_yearly_summary.show()

print("\nSample data:")
# Show the 10 earliest rows (ordered by date) for a quick visual check.
_sample_columns = ["date", "currency", "currency_denom", "usd_eur_rate", "dayname"]
df_silver.select(*_sample_columns).orderBy("date").show(10)


print("\nSaving to Silver...")
# Replace the managed Delta table with the freshly transformed data.
df_silver.write.mode("overwrite").format("delta").saveAsTable("silver_ecb_fx_rates")

# Register the temp view on the table that was just written rather than on
# the lazy df_silver plan. A view over df_silver would re-read Bronze and
# re-evaluate current_timestamp() on every query, so its rows (notably
# ingestion_timestamp) could differ from what was actually persisted.
spark.table("silver_ecb_fx_rates").createOrReplaceTempView("silver_ecb_fx")
print("View: silver_ecb_fx")

print("ECB FX SILVER COMPLETE!")

StatementMeta(, df4a5686-a157-44e2-a203-bfa9a431ad43, 3, Finished, Available, Finished)

ECB FX RATES - BRONZE TO SILVER TRANSFORMATION

Reading Bronze FX data...
Bronze records: 1,794

Standardizing schema...

Data quality checks...
Date range: 2019-01-02 to 2024-12-31

Year coverage:
+----+------------+
|year|trading_days|
+----+------------+
|2019|         255|
|2020|         257|
|2021|         258|
|2022|         257|
|2023|         255|
|2024|         256|
+----+------------+

Null counts:
+----+------------+
|date|usd_eur_rate|
+----+------------+
|   0|           0|
+----+------------+


Adding time dimensions...

Data validation...
   Total trading days: 1538
   Average rate: 1.1103
   Min rate: 0.9565
   Max rate: 1.2338
   Date range: 2019-01-02 to 2024-12-31

Year-by-year summary:
+----+------------+--------+--------+--------+
|year|trading_days|avg_rate|min_rate|max_rate|
+----+------------+--------+--------+--------+
|2019|         255|  1.1195|  1.0889|  1.1535|
|2020|         257|  1.1422|  1.0707|  1.2281|
|2021|         258|  1.1827|  1.1206|  1.2338|
|20