In [0]:
%run "/Workspace/Users/samuel.barroscatarino@educ.sasserno.fr/musicstreamapp/databricks/01_Initialize_Setting"

## 1. Dimension: Songs

In [0]:
# Songs dimension: read the raw songs CSV, rename columns to camelCase, cast the
# numeric attributes, and derive `songKey` as the MD5 hash of the source song_id.
# Rows without a title or an artist name are discarded.
songs_raw = (
    spark.read
    .option("header", True)
    .csv(f"{raw_path}/songs.csv")
    .select(
        F.col("song_id").alias("songId"),
        F.col("title"),
        F.col("artist_name").alias("artistName"),
        F.col("duration").cast("double"),
        F.col("key").cast("int"),
        F.col("key_confidence").cast("double").alias("keyConfidence"),
        F.col("loudness").cast("double"),
        F.col("song_hotttnesss").cast("double").alias("songHotness"),
        F.col("tempo").cast("double"),
        F.col("year").cast("int"),
        F.md5(F.concat_ws("|", "song_id")).alias("songKey"),
    )
    # Both filters must pass, i.e. title AND artistName are non-null.
    .filter(F.col("title").isNotNull())
    .filter(F.col("artistName").isNotNull())
)

In [0]:
# Persist the songs dimension as a Delta table, replacing any existing contents.
songs_raw.write.saveAsTable(
    f"{catalog_name}.{silver_schema}.dim_songs",
    format="delta",
    mode="overwrite",
)

## 2. Dimension: Artists

In [0]:
# Artists dimension: one row per cleaned artist name, derived from the songs CSV.
#
# - `name` is artist_name with double quotes and the backslash pattern below removed.
#   NOTE(review): as written, the regex literal matches two consecutive backslashes,
#   not a single one -- confirm this is the intended cleanup.
# - Rows are grouped by `name`; `artistId` and `location` each keep their maximum
#   non-null value within the group.
# - Latitude and longitude are aggregated together as a single struct so that both
#   values always come from the SAME source row. Taking max(latitude) and
#   max(longitude) independently could pair a latitude from one row with a longitude
#   from another, producing a coordinate that exists nowhere in the data.
#   Struct comparison is field by field with nulls ordered lowest, so rows with a
#   non-null latitude are still preferred and `latitude` remains the group maximum.
# - `artistKey` is the MD5 hash of the retained artistId.
artists_raw = (spark.read.csv(f"{raw_path}/songs.csv", header=True)
                .select(F.col("artist_id").alias("artistId"),
                        F.col("artist_latitude").cast(T.DoubleType()).alias("latitude"),
                        F.col("artist_longitude").cast(T.DoubleType()).alias("longitude"),
                        F.col("artist_location").alias("location"),
                        F.regexp_replace(F.regexp_replace(F.col("artist_name"), '"', ''), '\\\\\\\\', '').alias("name"))
                .where(F.col("name").isNotNull())
                .groupBy("name")
                .agg(F.max("artistId").alias("artistId"),
                     F.max(F.struct("latitude", "longitude")).alias("coordinates"),
                     F.max("location").alias("location"))
                # Unpack the struct and restore the original column order.
                .select("name",
                        "artistId",
                        F.col("coordinates.latitude").alias("latitude"),
                        F.col("coordinates.longitude").alias("longitude"),
                        "location")
                .withColumn("artistKey", F.md5(F.col("artistId"))))

In [0]:
# Persist the artists dimension as a Delta table, replacing any existing contents.
artists_raw.write.saveAsTable(
    f"{catalog_name}.{silver_schema}.dim_artists",
    format="delta",
    mode="overwrite",
)

## 3. Dimension: DateTime

In [0]:
# Date/time dimension: one row per hour, generated with sequence() from
# 2018-10-01 00:00:00 up to (at most) 2025-03-31 23:59:59 in one-hour steps.
# `dateKey` is the epoch-seconds value of each hourly timestamp.
#
# Spark's dayofweek() numbers days 1 = Sunday ... 7 = Saturday, so the weekend is
# the set (1, 7). The earlier check against (6, 7) flagged Friday and Saturday.
# The `dayOfWeek` column itself keeps Spark's numbering unchanged.
datetime_raw = (spark.sql("""
  WITH datetime_series AS (
    SELECT explode(sequence(to_timestamp('2018-10-01 00:00:00'),
                            to_timestamp('2025-03-31 23:59:59'),
                            interval 1 hour)) AS datetime
  )
  SELECT
    unix_timestamp(datetime) AS dateKey,
    datetime,
    dayofweek(datetime) AS dayOfWeek,
    dayofmonth(datetime) AS dayOfMonth,
    dayofyear(datetime) AS dayOfYear,
    month(datetime) AS month,
    year(datetime) AS year,
    CASE WHEN dayofweek(datetime) IN (1, 7) THEN True ELSE False END AS weekendFlag
  FROM datetime_series
  ORDER BY datetime"""))

In [0]:
# Persist the date/time dimension as a Delta table, replacing any existing contents.
datetime_raw.write.saveAsTable(
    f"{catalog_name}.{silver_schema}.dim_datetime",
    format="delta",
    mode="overwrite",
)