In [0]:
%run "/Workspace/Users/samuel.barroscatarino@educ.sasserno.fr/musicstreamapp/databricks/01_Initialize_Setting"

In [0]:
# Import required libraries
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [0]:
def _epoch_millis_to_timestamp(column_name):
    """Build a Column that converts `column_name` to a timestamp.

    The source column is treated as milliseconds since the Unix epoch
    (assumption carried over from the previous `/1000` conversion — confirm
    against the raw schema).

    `timestamp_millis` is used instead of `from_unixtime(...).cast(timestamp)`
    because `from_unixtime` renders a second-precision *string* in the session
    time zone: the round trip drops the millisecond part and can resolve to the
    wrong instant inside a daylight-saving overlap hour.
    """
    # The explicit bigint cast keeps the expression valid even if the raw
    # column was not ingested as an integral type.
    return F.expr(f"timestamp_millis(cast(`{column_name}` as bigint))")


def create_bronze_stream(table_name):
    """Return a streaming DataFrame for `table_name` enriched with bronze columns.

    Reads `{catalog_name}.{raw_schema}.{table_name}` as a stream and adds:
      * timestamp          - `ts` converted to a timestamp
      * registration_time  - `registration` converted to a timestamp
      * latitude/longitude - copies of `lat` / `lon`
      * ingestion_date     - current date when the row is processed
      * bronze_id          - a generated UUID per row

    Args:
        table_name: Name of the table inside the raw schema.

    Returns:
        The transformed streaming DataFrame; no query is started here.
    """
    # Reading streaming data from raw
    df_raw = spark.readStream.table(f"{catalog_name}.{raw_schema}.{table_name}")

    # Cleaning the data and adding columns
    df_cleaned = (
        df_raw
        .withColumn("timestamp", _epoch_millis_to_timestamp("ts"))
        .withColumn("registration_time", _epoch_millis_to_timestamp("registration"))
        .withColumn("latitude", F.col("lat"))
        .withColumn("longitude", F.col("lon"))
        .withColumn("ingestion_date", F.current_date())
        .withColumn("bronze_id", F.expr("uuid()"))
    )

    return df_cleaned

In [0]:
def process_bronze_stream(table_name):
    """Start the raw-to-bronze streaming write for `table_name`.

    Builds the bronze stream via `create_bronze_stream` and appends it as
    Delta to `{catalog_name}.{bronze_schema}.{table_name}` using an
    availableNow trigger, with schema merging enabled and a per-table
    checkpoint directory.

    Args:
        table_name: Name of the table to process.

    Returns:
        The object returned by `toTable(...)` for the started write.
    """
    # Build the source stream first, then derive the sink settings.
    source_stream = create_bronze_stream(table_name)

    stream_name = f"Streaming_Raw_to_Bronze_{table_name}"
    checkpoint_dir = f"{checkpoint_path}/{bronze_schema}/{table_name}/"
    target_table = f"{catalog_name}.{bronze_schema}.{table_name}"

    # Configure the writer one setting at a time.
    writer = source_stream.writeStream
    writer = writer.format("delta")
    writer = writer.outputMode("append")
    writer = writer.queryName(stream_name)
    writer = writer.trigger(availableNow=True)
    writer = writer.option("checkpointLocation", checkpoint_dir)
    writer = writer.option("mergeSchema", "true")

    # Write to Unity Catalog table
    return writer.toTable(target_table)

In [0]:
# Kick off one bronze stream per table and keep the returned handles in
# `queries`, in the same order as `list_tables`.
queries = []

for table_name in list_tables:
    print(f"Starting bronze stream processing for {table_name}...")
    queries.append(process_bronze_stream(table_name))
    print(f"Stream processing started for {table_name}")

In [0]:
# Print the current status of every started query; `queries` is positionally
# aligned with `list_tables`.
for position, table_name in enumerate(list_tables):
    print(f"\nStatus for {table_name}:")
    started_query = queries[position]
    print(started_query.status)

In [0]:
# The streams were started asynchronously with an availableNow trigger, so
# wait for each one to finish before reading the bronze tables. Without this,
# a top-to-bottom run can sample a table that has not been created or filled
# yet. awaitTermination() also re-raises any failure from the stream.
for query in queries:
    query.awaitTermination()

# Example queries for each table
for table_name in list_tables:
    print(f"\nSample data from {table_name}:")
    display(spark.sql(f"SELECT * FROM {catalog_name}.{bronze_schema}.{table_name} LIMIT 5"))