In [None]:
# ============================================================================
# NETFLIX STREAMING PIPELINE - REAL-TIME TRENDING CONTENT ANALYSIS
# ============================================================================
# This notebook processes Netflix viewing events in real-time and tracks
# trending content using 5-minute windows. Results are written to BigQuery
# for visualization in Looker Studio.
#
# Based on: Lab9 patterns (file streaming, windowed aggregations, BigQuery sink)
# ============================================================================

# ----------------------------------------------------------------------------
# CONFIGURATION SECTION
# ----------------------------------------------------------------------------

# --- Google Cloud: project and storage buckets ---
GCP_PROJECT_ID = "agile-producer-471907-s7"
GCS_BUCKET = "data_netflix_2025"
TEMP_GCS_BUCKET = "temp_netflix_2025"

# --- Input locations, both rooted in GCS_BUCKET ---
STREAMING_DATA_PATH = "gs://" + GCS_BUCKET + "/streaming"
MOVIES_CATALOG_PATH = "gs://" + GCS_BUCKET + "/raw/movies.csv"

# --- BigQuery destination (fully qualified as project.dataset.table) ---
BQ_DATASET = "netflix_streaming"
BQ_OUTPUT_TABLE = "trending_content_realtime"
BQ_TABLE_FULL = ".".join((GCP_PROJECT_ID, BQ_DATASET, BQ_OUTPUT_TABLE))

# --- Spark cluster and resource sizing ---
SPARK_MASTER = "spark://spark-master:7077"
DRIVER_MEMORY = "2g"
EXECUTOR_MEMORY = "2g"
EXECUTOR_CORES = "1"

# --- Streaming behaviour ---
MAX_FILES_PER_TRIGGER = 1          # input files consumed per micro-batch
WINDOW_DURATION = "5 minutes"      # aggregation window size
TRIGGER_INTERVAL = "10 seconds"    # processing-time trigger of the BigQuery sink
WATERMARK_DELAY = "10 minutes"     # watermark delay applied to event_time

# --- Live monitoring display ---
DISPLAY_ITERATIONS = 30
DISPLAY_INTERVAL_SECONDS = 10
TOP_N_RESULTS = 10

print("Configuration loaded successfully")
print(f"Output destination: {BQ_TABLE_FULL}")
print(f"Streaming source: {STREAMING_DATA_PATH}")

In [None]:
# ----------------------------------------------------------------------------
# CELL 1: SPARK SESSION SETUP
# ----------------------------------------------------------------------------

from pyspark import SparkConf
from pyspark.sql import SparkSession

print("=" * 80)
print("INITIALIZING SPARK SESSION")
print("=" * 80)

# Assemble the Spark configuration as one fluent chain; every setter returns
# the SparkConf itself, so this is the same sequence of calls as separate statements.
sparkConf = (
    SparkConf()
    .setMaster(SPARK_MASTER)
    .setAppName("NetflixStreamingPipeline")
    .set("spark.driver.memory", DRIVER_MEMORY)
    .set("spark.executor.memory", EXECUTOR_MEMORY)
    .set("spark.executor.cores", EXECUTOR_CORES)
)

# Reuse an already-running session if there is one, otherwise start a new one
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Staging bucket setting (presumably consumed by the BigQuery writer used
# later in this notebook -- confirm against the connector documentation)
spark.conf.set('temporaryGcsBucket', TEMP_GCS_BUCKET)

# Register the GCS filesystem implementations for gs:// paths on the Hadoop configuration
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

print(f"Spark session created: {spark.version}")
print(f"Application ID: {spark.sparkContext.applicationId}")
print("GCS access configured")
print("=" * 80)

In [None]:
# ----------------------------------------------------------------------------
# CELL 2: SCHEMA DEFINITION & STATIC DATA
# ----------------------------------------------------------------------------

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType

print("=" * 80)
print("SCHEMA DEFINITION & STATIC DATA LOADING")
print("=" * 80)

# Explicit schema for the streaming watch events. Every field is nullable;
# the two date/time fields are read as plain strings.
watch_events_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("session_id", StringType()),
        ("user_id", StringType()),
        ("movie_id", StringType()),
        ("watch_date", StringType()),
        ("device_type", StringType()),
        ("watch_duration_minutes", DoubleType()),
        ("progress_percentage", DoubleType()),
        ("action", StringType()),
        ("quality", StringType()),
        ("location_country", StringType()),
        ("is_download", BooleanType()),
        ("user_rating", DoubleType()),
        ("timestamp", StringType()),
    )
])

print("Watch events schema defined (13 fields)")

# Static (non-streaming) read of the movies catalog: header row present,
# column types inferred from the data.
print(f"\nLoading movies catalog from: {MOVIES_CATALOG_PATH}")
movies_static = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(MOVIES_CATALOG_PATH)
)

movies_count = movies_static.count()
print(f"Loaded {movies_count} movies")

# Keep only the attributes used downstream, then cache the trimmed catalog
# so it can be reused by the streaming join.
movies_static = movies_static.select(
    "movie_id", "title", "content_type", "genre_primary", "release_year"
)
movies_static = movies_static.cache()

print("Movies catalog cached for streaming joins")
print("Sample movies:")
movies_static.show(3, truncate=False)
print("=" * 80)

In [None]:
# ----------------------------------------------------------------------------
# CELL 3: STREAMING QUERY - TRENDING CONTENT WITH BIGQUERY OUTPUT
# ----------------------------------------------------------------------------

from pyspark.sql.functions import (
    window, col, count, avg, sum as spark_sum,
    approx_count_distinct, to_timestamp, desc
)

print("=" * 80)
print("CONFIGURING STREAMING QUERY: TRENDING CONTENT")
print("=" * 80)

# File-based streaming source: JSON files under STREAMING_DATA_PATH, parsed
# with the explicit schema and throttled by maxFilesPerTrigger.
print(f"Setting up streaming source: {STREAMING_DATA_PATH}")
stream_df = (
    spark.readStream
    .schema(watch_events_schema)
    .option("maxFilesPerTrigger", MAX_FILES_PER_TRIGGER)
    .json(STREAMING_DATA_PATH)
)

print(f"Streaming source configured (maxFilesPerTrigger={MAX_FILES_PER_TRIGGER})")

# Parse the string "timestamp" field into a real timestamp column, event_time,
# which the watermark and windowing below operate on.
stream_df = stream_df.withColumn(
    "event_time", to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss")
)

# Stream-static left join: events without a catalog match are kept
enriched_stream = stream_df.join(movies_static, on="movie_id", how="left")
print("Stream enriched with movies catalog (static join)")

# Declare how late events may arrive relative to event_time
enriched_stream = enriched_stream.withWatermark("event_time", WATERMARK_DELAY)
print(f"Watermark configured: {WATERMARK_DELAY}")

# Create windowed aggregation: Trending content
# `when` is needed for the conditional count below; imported here so this
# step carries its own dependency.
from pyspark.sql.functions import when

print(f"\nBuilding aggregation: {WINDOW_DURATION} windows")
trending_content = enriched_stream \
    .groupBy(
        # One group per event-time window per title
        window(col("event_time"), WINDOW_DURATION),
        "movie_id",
        "title",
        "genre_primary",
        "content_type"
    ) \
    .agg(
        count("*").alias("view_count"),
        approx_count_distinct("user_id").alias("unique_viewers"),
        avg("progress_percentage").alias("avg_completion_pct"),
        spark_sum("watch_duration_minutes").alias("total_watch_time_min"),
        # count() counts every non-null value, and a boolean comparison is
        # non-null for both True and False, so counting the comparison itself
        # would count all rows with an action. when() without otherwise()
        # yields NULL for non-matching rows, so only completed views are counted.
        count(when(col("action") == "completed", 1)).alias("completed_views")
    ) \
    .select(
        # Flatten the window struct into two plain timestamp columns
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "movie_id",
        "title",
        "genre_primary",
        "content_type",
        "view_count",
        "unique_viewers",
        "avg_completion_pct",
        "total_watch_time_min",
        "completed_views"
    ) \
    .orderBy(desc("view_count"))

print("Aggregation built: view_count, unique_viewers, avg_completion, total_watch_time")

# Define BigQuery write function
def write_to_bigquery(batch_df, batch_id):
    """
    Write one micro-batch to the BigQuery output table.

    Intended as a ``foreachBatch`` callback. The table named by
    ``BQ_TABLE_FULL`` is overwritten on every non-empty batch; empty batches
    are skipped so the existing table contents are left untouched.

    The batch is persisted for the duration of the call so that counting the
    rows and writing them share one computation instead of re-evaluating the
    micro-batch for each action.

    Args:
        batch_df: DataFrame holding the output of the current micro-batch.
        batch_id: Identifier of the micro-batch; used only in log messages.
    """
    print(f"Writing batch {batch_id} to BigQuery...")
    batch_df.persist()
    try:
        # Count once and reuse the value for both the emptiness check and the log line
        row_count = batch_df.count()
        if row_count == 0:
            print(f"Batch {batch_id} is empty, skipping write")
            return
        batch_df.write.format('bigquery') \
            .option('table', BQ_TABLE_FULL) \
            .mode("overwrite") \
            .save()
        print(f"Batch {batch_id} written successfully ({row_count} rows)")
    finally:
        # Always release the cached batch, even if the write fails
        batch_df.unpersist()

# Sink 1 of 2 -- in-memory table, queried by name from the monitoring cell.
# Complete output mode: the whole aggregated result is emitted each batch.
print("\nStarting streaming query: Memory sink (for live display)")
query_memory = (
    trending_content.writeStream
    .format("memory")
    .queryName("trending_content_memory")
    .outputMode("complete")
    .start()
)

print(f"Memory sink started: {query_memory.id}")

# Sink 2 of 2 -- BigQuery, via the write_to_bigquery foreachBatch callback,
# fired on a fixed processing-time trigger.
print("\nStarting streaming query: BigQuery sink (for dashboard)")
query_bigquery = (
    trending_content.writeStream
    .foreachBatch(write_to_bigquery)
    .outputMode("complete")
    .trigger(processingTime=TRIGGER_INTERVAL)
    .start()
)

print(f"BigQuery sink started: {query_bigquery.id}")
print(f"Trigger interval: {TRIGGER_INTERVAL}")
print(f"Output table: {BQ_TABLE_FULL}")

# Summary banner, printed one item per line
print(
    "\n" + "=" * 80,
    "STREAMING QUERIES ACTIVE",
    "=" * 80,
    "Two parallel sinks:",
    "  1. Memory table 'trending_content_memory' (for live monitoring below)",
    "  2. BigQuery table (for Looker Studio dashboard)",
    "=" * 80,
    sep="\n",
)

In [None]:
# ----------------------------------------------------------------------------
# CELL 4: LIVE MONITORING DISPLAY
# ----------------------------------------------------------------------------

from time import sleep
from datetime import datetime

print("=" * 80)
print("REAL-TIME MONITORING DASHBOARD")
print("=" * 80)
print(f"Displaying top {TOP_N_RESULTS} trending content")
print(f"Update interval: {DISPLAY_INTERVAL_SECONDS} seconds")
print(f"Total iterations: {DISPLAY_ITERATIONS}")
print("Press 'Interrupt Kernel' to stop")
print("=" * 80)

# Poll the in-memory result table a fixed number of times.
# NOTE: when the loop finishes normally the streaming queries are NOT stopped
# here; they are stopped only on a kernel interrupt or on an error.
try:
    for iteration in range(DISPLAY_ITERATIONS):
        print(f"\n{'=' * 80}")
        print(f"UPDATE #{iteration + 1} | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"{'=' * 80}")

        # Query trending content from memory table
        result_df = spark.sql(f"""
            SELECT
                window_start,
                window_end,
                title,
                genre_primary,
                view_count,
                unique_viewers,
                ROUND(avg_completion_pct, 1) as completion_pct,
                ROUND(total_watch_time_min, 1) as watch_time_min
            FROM trending_content_memory
            ORDER BY view_count DESC
            LIMIT {TOP_N_RESULTS}
        """)

        result_df.show(TOP_N_RESULTS, truncate=False)

        # Display query status
        print("\nQuery Status:")
        print(f"  Memory sink active: {query_memory.isActive}")
        print(f"  BigQuery sink active: {query_bigquery.isActive}")

        # Fetch the progress list once per iteration and reuse it below
        progress_updates = query_memory.recentProgress
        if progress_updates:
            recent = progress_updates[-1]
            print(f"  Recent batch: {recent.get('numInputRows', 0)} rows processed")
            # NOTE(review): this is the length of the recent-progress list,
            # which may be capped -- confirm it really means "total" batches.
            print(f"  Total batches: {len(progress_updates)}")

        print(f"\n{'=' * 80}")
        print(f"Next update in {DISPLAY_INTERVAL_SECONDS} seconds... ({iteration + 1}/{DISPLAY_ITERATIONS})")
        print(f"{'=' * 80}")

        sleep(DISPLAY_INTERVAL_SECONDS)

except KeyboardInterrupt:
    # Kernel interrupt: shut both streaming queries down, one after the other
    print("\n" + "=" * 80)
    print("STOPPING STREAMING PIPELINE")
    print("=" * 80)
    print("User interrupted. Stopping queries gracefully...")

    print("Stopping memory sink...")
    query_memory.stop()
    print("Memory sink stopped")

    print("Stopping BigQuery sink...")
    query_bigquery.stop()
    print("BigQuery sink stopped")

    print("\nAll streaming queries stopped successfully")
    print("Note: Spark session is still active")

except Exception as e:
    print(f"\nERROR: {str(e)}")
    print("Stopping queries due to error...")
    # Stop each query on its own so a failure stopping one cannot leave the
    # other running, and report the failure instead of swallowing it.
    # `except Exception` (not a bare except) lets a kernel interrupt through.
    for sink_name, sink_query in (
        ("memory sink", query_memory),
        ("BigQuery sink", query_bigquery),
    ):
        try:
            sink_query.stop()
        except Exception as stop_error:
            print(f"  Warning: could not stop {sink_name}: {stop_error}")
    print("Queries stopped")

print("\n" + "=" * 80)
print("MONITORING COMPLETE")
print("=" * 80)
print(f"\nBigQuery table available for Looker Studio: {BQ_TABLE_FULL}")
print("You can now create visualizations in Looker Studio")

In [None]:
# ----------------------------------------------------------------------------
# CELL 5: CLEANUP - STOP SPARK SESSION
# ----------------------------------------------------------------------------

print("=" * 80)
print("CLEANUP: STOPPING SPARK SESSION")
print("=" * 80)

# Shut down the Spark session created in the setup cell
spark.stop()

# Closing summary, printed one item per line
print(
    "Spark session stopped successfully",
    "\nPipeline Summary:",
    f"  - Processed streaming events from: {STREAMING_DATA_PATH}",
    f"  - Output written to: {BQ_TABLE_FULL}",
    "  - Ready for Looker Studio dashboard creation",
    "=" * 80,
    sep="\n",
)