In [0]:
%run "/Users/drmaiatauros@hotmail.com/01_Initialize_Setting"

In [0]:
from pyspark.sql.functions import col, from_json, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, TimestampType

In [0]:
# --- Source side: Kafka ------------------------------------------------------
# Broker address (host:port) and the topic name used by this notebook.
kafka_bootstrap_servers, kafka_topic = "4.246.237.185:9092", "page_view_events_new"

# --- Sink side: Unity Catalog ------------------------------------------------
# Catalog, schema and table identifiers of the destination table.
catalog_name, schema_name, table_name = "music_streaming", "raw", "page_view_events"

In [0]:
# Every option for the Kafka source, collected in one place and grouped by
# concern so that tuning a value does not require touching the reader chain.
kafka_source_options = {
    # Connection and subscription
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": kafka_topic,
    "startingOffsets": "earliest",  # Use "latest" for production
    "failOnDataLoss": "false",
    # Timeout settings (values are in milliseconds, per the *.ms suffix)
    "kafka.request.timeout.ms": "60000",
    "kafka.session.timeout.ms": "60000",
    "kafka.connection.max.idle.ms": "60000",
    "kafka.metadata.max.age.ms": "180000",
    # Retry / polling settings
    "kafka.retry.backoff.ms": "1000",
    "kafka.max.poll.interval.ms": "60000",
    "kafka.max.poll.records": "500",
    # Security settings (if needed)
    "kafka.security.protocol": "PLAINTEXT",
}

# Build the streaming DataFrame over the Kafka topic using the options above.
kafka_stream_df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

In [0]:
# Print the schema of the raw Kafka stream DataFrame, i.e. before the `value`
# payload has been parsed into event fields.
kafka_stream_df.printSchema()

In [0]:
# Schema of one page-view event as carried in the Kafka message payload.
# Each entry is (field name, Spark type); every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        # Event / session attributes
        ("ts", LongType()),
        ("sessionId", StringType()),
        ("userId", StringType()),
        ("auth", StringType()),
        ("level", StringType()),
        ("itemInSession", LongType()),
        # Request attributes
        ("page", StringType()),
        ("method", StringType()),
        ("status", LongType()),
        # Location and client attributes
        ("city", StringType()),
        ("zip", StringType()),
        ("state", StringType()),
        ("userAgent", StringType()),
        ("lon", DoubleType()),
        ("lat", DoubleType()),
        # User profile attributes
        ("firstName", StringType()),
        ("lastName", StringType()),
        ("gender", StringType()),
        ("registration", LongType()),
    )
])

In [0]:
# Step 1: turn the Kafka `value` column into a struct by casting it to a
# string and parsing it as JSON against `schema`.
event_payload = from_json(col("value").cast("string"), schema).alias("data")

# Step 2: keep the Kafka record metadata (renamed with a kafka_ prefix) next
# to the parsed payload struct.
kafka_envelope = kafka_stream_df.select(
    col("timestamp").alias("kafka_timestamp"),
    col("topic").alias("kafka_topic"),
    col("partition").alias("kafka_partition"),
    col("offset").alias("kafka_offset"),
    event_payload,
)

# Step 3: flatten the struct so each event field becomes a top-level column.
parsed_df = kafka_envelope.select(
    "kafka_timestamp",
    "kafka_topic",
    "kafka_partition",
    "kafka_offset",
    "data.*",
)

In [0]:
# Add an `ingestion_time` column filled by current_timestamp(), kept alongside
# the Kafka-side `kafka_timestamp` column selected in the parsing cell.
parsed_df = parsed_df.withColumn("ingestion_time", current_timestamp())

In [0]:
# Print the schema after parsing: the kafka_* metadata columns, the fields
# expanded from the JSON payload, and `ingestion_time`.
parsed_df.printSchema()

In [0]:
# Fully qualified Unity Catalog name of the target table, assembled as
# <catalog>.<schema>.<table> from the configuration values.
table_path = ".".join((catalog_name, schema_name, table_name))

In [0]:
# Checkpoint directory for this stream.
# NOTE(review): `checkpoint_path` is not defined in this notebook (presumably
# it comes from the %run'd settings notebook) and is concatenated without a
# separator, so it is assumed to end with "/" — TODO confirm.
checkpoint_location = f'{checkpoint_path}page_view_events/'

# Configure the Delta sink: append-only output, partitioned by `page`
# (optional: partition by a field that makes sense for your data), one
# micro-batch every 2 minutes, with schema evolution allowed via mergeSchema.
stream_writer = (
    parsed_df.writeStream
    .format("delta")
    .outputMode("append")
    .partitionBy("page")
    .trigger(processingTime="2 minutes")
    .options(checkpointLocation=checkpoint_location, mergeSchema="true")
)

# Start the streaming query, writing into the Unity Catalog table.
stream_query = stream_writer.toTable(table_path)

In [0]:
# Wait for the streaming query to terminate.
# NOTE(review): this call blocks the notebook, so the cells that follow (the
# stop cell and the SQL preview) presumably only run once the query ends or
# this cell is cancelled — confirm that is the intended workflow.
stream_query.awaitTermination()

In [0]:
# Stop the streaming query started by the writeStream cell.
# NOTE(review): this sits after awaitTermination(), so it is presumably meant
# to be run manually after cancelling that wait — confirm.
stream_query.stop()

In [0]:
%sql
-- Preview the rows written to the target table. The table name is hard-coded
-- here and matches the catalog_name / schema_name / table_name values set in
-- the configuration cell; keep them in sync. No LIMIT is applied.
SELECT * FROM music_streaming.raw.page_view_events;