In [0]:
%run "/Workspace/Users/samuel.barroscatarino@educ.sasserno.fr/musicstreamapp/databricks/01_Initialize_Setting"

## 1. Fact_Stream Table

In [0]:
# Streaming source: the bronze `listen_events` table, read incrementally.
bronze_listen_events = spark.readStream.table(f"{catalog_name}.{bronze_schema}.listen_events")

# Cleaned artist name: first strip double-quote characters, then strip whatever
# the second (backslash) pattern matches. The result is exposed as `artist_clean`.
artist_clean_col = F.regexp_replace(
    F.regexp_replace(F.col("artist"), '"', ''),
    '\\\\\\\\',
    '',
).alias("artist_clean")

# Keep only the columns needed downstream and drop events without a user id.
listen_events_df = (
    bronze_listen_events
    .select(
        "userId",
        "song",
        "timestamp",
        "city",
        "state",
        "latitude",
        "longitude",
        artist_clean_col,
    )
    .where(F.col("userId").isNotNull())
)

In [0]:
# Static (batch) reads of the silver-layer dimension tables that the fact
# stream is joined against. No reader options are set, so a single reader
# can be reused for every table.
read_dimension = spark.read.table

dim_users_df = read_dimension(f"{catalog_name}.{silver_schema}.dim_users")
dim_songs_df = read_dimension(f"{catalog_name}.{silver_schema}.dim_songs")
dim_artists_df = read_dimension(f"{catalog_name}.{silver_schema}.dim_artists")
# Note: the variable is plural but the table itself is named `dim_location`.
dim_locations_df = read_dimension(f"{catalog_name}.{silver_schema}.dim_location")
dim_datetime_df = read_dimension(f"{catalog_name}.{silver_schema}.dim_datetime")

In [0]:
# Build the fact stream by resolving every listen event to its dimension keys.
# Every join below is a LEFT join, so an event without a matching dimension row
# is kept and simply carries a NULL key for that dimension.

# Calendar date of the event, used to select the dim_users version valid that day.
event_date = F.to_date(listen_events_df.timestamp)

# dim_users (SCD Type 2): same user, event date inside [activation, expiration).
# NOTE(review): if current rows carry a NULL row_expiration_date this predicate
# never matches them - confirm dim_users uses a far-future sentinel date.
user_match = (
    (listen_events_df.userId == dim_users_df.userId)
    & (event_date >= dim_users_df.row_activation_date)
    & (event_date < dim_users_df.row_expiration_date)
)

# dim_artists: matched on the cleaned artist name.
artist_match = listen_events_df.artist_clean == dim_artists_df.name

# dim_songs: matched on song title AND cleaned artist name.
song_match = (
    (listen_events_df.song == dim_songs_df.title)
    & (listen_events_df.artist_clean == dim_songs_df.artistName)
)

# dim_location: matched on city, state code and exact coordinates.
location_match = (
    (listen_events_df.city == dim_locations_df.city)
    & (listen_events_df.state == dim_locations_df.stateCode)
    & (listen_events_df.latitude == dim_locations_df.latitude)
    & (listen_events_df.longitude == dim_locations_df.longitude)
)

# dim_datetime: matched on the event timestamp truncated to the hour.
datetime_match = (
    F.date_trunc("hour", listen_events_df.timestamp) == dim_datetime_df.datetime
)

fact_stream = (
    listen_events_df
    .join(dim_users_df, user_match, "left")
    .join(dim_artists_df, artist_match, "left")
    .join(dim_songs_df, song_match, "left")
    .join(dim_locations_df, location_match, "left")
    .join(dim_datetime_df, datetime_match, "left")
    .select(
        "userKey",
        "artistKey",
        "songKey",
        "dateKey",
        "locationKey",
        "timestamp",
        # Event date, exposed as `date_part` for downstream use.
        F.to_date(F.col("timestamp")).alias("date_part"),
    )
)

In [0]:
# Append the fact stream to the gold Delta table, partitioned by event date.
# `availableNow=True` processes everything currently available in the source
# and then stops, so this behaves like an incremental batch load.
fact_stream_query = (
    fact_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"{checkpoint_path}/{gold_schema}/fact_stream/")
    .partitionBy("date_part")
    .trigger(availableNow=True)
    .toTable(f"{catalog_name}.{gold_schema}.fact_stream")
)

# toTable() starts the query asynchronously and returns right away. Block until
# the available-now run has finished so the cells below (which query the table
# and build a view on it) see the complete load on a Run All; this also
# surfaces any streaming failure in this cell instead of silently later.
fact_stream_query.awaitTermination()

In [0]:
%sql
-- Sanity check: preview a handful of rows from the gold fact table.
SELECT *
FROM music_streaming.gold.fact_stream
LIMIT 5;

## 2. Wide_Fact View

In [0]:
%sql
-- Denormalised ("wide") view over the fact table: each fact row is enriched
-- with descriptive attributes from the user, song, artist, location and
-- datetime dimensions. All joins are inner joins, so a fact row appears only
-- when every one of its five keys matches a dimension row.
CREATE OR REPLACE VIEW music_streaming.gold.wide_fact AS
SELECT
    f.userKey,
    f.artistKey,
    f.songKey,
    f.dateKey,
    f.locationKey,
    f.timestamp,
    u.firstName,
    u.lastName,
    u.gender,
    u.level,
    u.userId,
    s.duration  AS songDuration,
    s.title     AS songName,
    l.city,
    l.stateCode AS state,
    l.latitude,
    l.longitude,
    d.datetime  AS dateHour,
    d.dayOfMonth,
    d.dayOfWeek,
    a.name      AS artistName
FROM music_streaming.gold.fact_stream AS f
JOIN music_streaming.silver.dim_users AS u
    ON u.userKey = f.userKey
JOIN music_streaming.silver.dim_songs AS s
    ON s.songKey = f.songKey
JOIN music_streaming.silver.dim_artists AS a
    ON a.artistKey = f.artistKey
JOIN music_streaming.silver.dim_location AS l
    ON l.locationKey = f.locationKey
JOIN music_streaming.silver.dim_datetime AS d
    ON d.dateKey = f.dateKey

In [0]:
%sql
-- Sanity check: preview a handful of rows from the wide fact view.
SELECT *
FROM music_streaming.gold.wide_fact
LIMIT 10;