In [3]:
df_raw_fx = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Files/bronze/ecb/usd_eur_fx")
)

df_raw_fx.printSchema()



StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 5, Finished, Available, Finished)

root
 |-- KEY: string (nullable = true)
 |-- FREQ: string (nullable = true)
 |-- CURRENCY: string (nullable = true)
 |-- CURRENCY_DENOM: string (nullable = true)
 |-- EXR_TYPE: string (nullable = true)
 |-- EXR_SUFFIX: string (nullable = true)
 |-- TIME_PERIOD: date (nullable = true)
 |-- OBS_VALUE: double (nullable = true)
 |-- OBS_STATUS: string (nullable = true)
 |-- OBS_CONF: string (nullable = true)
 |-- OBS_PRE_BREAK: string (nullable = true)
 |-- OBS_COM: string (nullable = true)
 |-- TIME_FORMAT: string (nullable = true)
 |-- BREAKS: string (nullable = true)
 |-- COLLECTION: string (nullable = true)
 |-- COMPILING_ORG: string (nullable = true)
 |-- DISS_ORG: string (nullable = true)
 |-- DOM_SER_IDS: string (nullable = true)
 |-- PUBL_ECB: string (nullable = true)
 |-- PUBL_MU: string (nullable = true)
 |-- PUBL_PUBLIC: string (nullable = true)
 |-- UNIT_INDEX_BASE: string (nullable = true)
 |-- COMPILATION: string (nullable = true)
 |-- COVERAGE: string (nullable = true)
 |-- 

In [4]:
from pyspark.sql import functions as F

df_silver_fx = (
    df_raw_fx
    .select(
        F.to_date("TIME_PERIOD").alias("date"),
        F.col("OBS_VALUE").cast("double").alias("usd_eur_rate")
    )
    .filter(
        F.col("date").isNotNull() &
        F.col("usd_eur_rate").isNotNull()
    )
)


StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 6, Finished, Available, Finished)

In [5]:
df_silver_fx.limit(5).show()


StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 7, Finished, Available, Finished)

+----------+------------+
|      date|usd_eur_rate|
+----------+------------+
|1999-01-04|      1.1789|
|1999-01-05|       1.179|
|1999-01-06|      1.1743|
|1999-01-07|      1.1632|
|1999-01-08|      1.1659|
+----------+------------+



In [6]:
df_silver_fx = df_silver_fx.withColumn(
    "year", F.year("date")
)


StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 8, Finished, Available, Finished)

In [7]:
df_gold_fx_year = (
    df_silver_fx
    .groupBy("year")
    .agg(
        F.avg("usd_eur_rate").alias("avg_usd_eur_rate")
    )
)


StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 9, Finished, Available, Finished)

In [8]:
df_gold_fx_year.limit(5).show()


StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 10, Finished, Available, Finished)

+----+------------------+
|year|  avg_usd_eur_rate|
+----+------------------+
|2025| 1.129438095238096|
|2003|1.1311603921568625|
|2007|1.3704780392156868|
|2018|1.1809545098039211|
|2015|1.1095128906249994|
+----+------------------+



In [9]:
df_dim_fx = (
    df_silver_fx
    .groupBy("year")
    .agg(
        F.avg("usd_eur_rate").alias("avg_usd_eur_rate")
    )
)

df_dim_fx.show(5)


StatementMeta(, 8962e162-2c77-4905-a12d-6f989c4b2f09, 11, Finished, Available, Finished)

+----+------------------+
|year|  avg_usd_eur_rate|
+----+------------------+
|2025| 1.129438095238096|
|2003|1.1311603921568625|
|2007|1.3704780392156868|
|2018|1.1809545098039211|
|2015|1.1095128906249994|
+----+------------------+
only showing top 5 rows



In [9]:
(
    df_gold_fx_year
    .write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("dbo.dim_fx")
)


StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 11, Finished, Available, Finished)

In [10]:
spark.sql("""
SELECT * FROM dbo.dim_fx ORDER BY year;
""").show()

StatementMeta(, 63e334be-6e4d-4d47-9712-05dc05c5ac48, 12, Finished, Available, Finished)

+----+------------------+
|year|  avg_usd_eur_rate|
+----+------------------+
|1999|1.0657764478764475|
|2000| 0.923612549019608|
|2001|0.8956295275590546|
|2002|0.9455737254901957|
|2003|1.1311603921568625|
|2004|1.2439023166023155|
|2005|1.2440902723735408|
|2006|1.2555988235294129|
|2007|1.3704780392156868|
|2008|     1.47075546875|
|2009|1.3947824218749991|
|2010| 1.325716666666667|
|2011|1.3919552529182875|
|2012|1.2847886718750006|
|2013|1.3281180392156866|
|2014|1.3285007843137244|
|2015|1.1095128906249994|
|2016|1.1069031128404672|
|2017|1.1296811764705885|
|2018|1.1809545098039211|
+----+------------------+
only showing top 20 rows

