# Netflix Real-Time Streaming Pipeline

This notebook implements a Spark Structured Streaming pipeline to analyze Netflix viewing events in real-time.

## Architecture
- **Data Source**: watch_history.csv with synthetic timestamps
- **Streaming Engine**: Spark Structured Streaming (file-based)
- **Analytics**: Windowed aggregations (5-min and 10-min windows)
- **Output**: In-memory tables queryable with SQL

## Cells Overview
1. **Data Preparation**: Load watch_history, add timestamps, save as JSON files
2. **Spark Setup**: Configure SparkSession
3. **Schema & Static Data**: Define schema, load movies catalog
4. **Trending Content**: 5-minute windowed aggregations
5. **User Engagement**: 10-minute windowed aggregations
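6. **Action Monitoring & Genre Performance**: Track user actions (play, pause, stop, complete) and per-genre viewing in 5-minute windows
7. **Live Display**: Query and display results every 10 seconds
8. **Cleanup**: Stop the Spark session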


In [None]:
# Cell 1: Data Preparation - Add Timestamps and Create JSON Files for Streaming

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import os
import shutil

# ---------------------------------------------------------------------------
# Data preparation, part 1: configuration and loading of the raw watch events.
# ---------------------------------------------------------------------------
print("=" * 80, "NETFLIX STREAMING PIPELINE - DATA PREPARATION", "=" * 80, sep="\n")

# Input location of the raw CSV and output location of the JSON stream files.
GCS_BUCKET = "gs://data_netflix_2025/raw"
LOCAL_DATA_PATH = "/home/jovyan/data"
WATCH_HISTORY_FILE = GCS_BUCKET + "/watch_history.csv"
OUTPUT_STREAM_DIR = LOCAL_DATA_PATH + "/netflix_stream"
EVENTS_PER_FILE = 150  # rows written to each JSON file

print("\nüì• Loading watch_history from: " + WATCH_HISTORY_FILE)

# Read the complete watch history into a pandas DataFrame.
df = pd.read_csv(WATCH_HISTORY_FILE)
print("‚úÖ Loaded {} watch events".format(df.shape[0]))
print("   Columns: {}".format(df.columns.tolist()))

# Preview the untouched input before any columns are added.
print("\nüìä Sample of original data (first 5 rows):")
print(df.head(5))

print("\n‚è∞ Adding realistic timestamps to watch_date...")

# Relative likelihood of a viewing event falling in each hour of the day
# (index == hour, 0-23); evening hours carry the most weight.
_HOUR_WEIGHTS = np.array([
    0.01, 0.01, 0.01, 0.01, 0.01, 0.02,  # 0-5   (early morning, low)
    0.02, 0.03, 0.03, 0.03, 0.03, 0.04,  # 6-11  (morning, low-medium)
    0.04, 0.04, 0.04, 0.04, 0.05, 0.05,  # 12-17 (afternoon, medium)
    0.08, 0.09, 0.10, 0.10, 0.08, 0.06,  # 18-23 (evening, peak)
])

# The hand-written weights add up to 1.02, and np.random.choice raises
# ValueError unless `p` sums to 1, so normalise them once up front.
_HOUR_PROBABILITIES = _HOUR_WEIGHTS / _HOUR_WEIGHTS.sum()


def generate_realistic_hour():
    """Draw one hour of the day (0-23), weighted towards the evening.

    Sampling uses NumPy's global random state, so a preceding
    ``np.random.seed(...)`` call makes the sequence reproducible.

    Returns:
        A NumPy integer in the range 0-23.
    """
    return np.random.choice(len(_HOUR_PROBABILITIES), p=_HOUR_PROBABILITIES)

# ---------------------------------------------------------------------------
# Data preparation, part 2: synthesise event timestamps and write the events
# out as newline-delimited JSON files that the streaming source will read.
# ---------------------------------------------------------------------------

# Add timestamp components
np.random.seed(42)  # fixed seed so every run produces the same synthetic times
df['hour'] = [generate_realistic_hour() for _ in range(len(df))]
df['minute'] = np.random.randint(0, 60, len(df))
df['second'] = np.random.randint(0, 60, len(df))

# Full event time = calendar day from watch_date + the sampled time of day.
df['timestamp'] = (
    pd.to_datetime(df['watch_date'])
    + pd.to_timedelta(df['hour'], unit='h')
    + pd.to_timedelta(df['minute'], unit='m')
    + pd.to_timedelta(df['second'], unit='s')
)

# Sort by timestamp so the files replay the events in chronological order.
df = df.sort_values('timestamp').reset_index(drop=True)

print(f"‚úÖ Added timestamps. Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

# Drop temporary columns, keep original watch_date for reference
df = df.drop(['hour', 'minute', 'second'], axis=1)

# Convert timestamp to string format for JSON
df['timestamp'] = df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

print("\nüìä Sample data with timestamps:")
print(df[['session_id', 'user_id', 'movie_id', 'timestamp', 'action']].head(10))

# Create output directory (remove if exists to start fresh)
if os.path.exists(OUTPUT_STREAM_DIR):
    print(f"\nüóëÔ∏è  Removing existing directory: {OUTPUT_STREAM_DIR}")
    shutil.rmtree(OUTPUT_STREAM_DIR)

os.makedirs(OUTPUT_STREAM_DIR, exist_ok=True)
print(f"‚úÖ Created output directory: {OUTPUT_STREAM_DIR}")

# Split data into multiple JSON files for streaming simulation
print(f"\nüìù Creating JSON files ({EVENTS_PER_FILE} events per file)...")

num_files = 0
for start in range(0, len(df), EVENTS_PER_FILE):
    batch = df.iloc[start:start + EVENTS_PER_FILE]

    # One JSON object per line, EVENTS_PER_FILE rows per file.
    file_path = f"{OUTPUT_STREAM_DIR}/part-{num_files:05d}.json"
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record) + '\n'
            for record in batch.to_dict(orient='records')
        )

    num_files += 1

    if num_files % 100 == 0:
        print(f"   Created {num_files} files...")

# An empty input produces no files; avoid dividing by zero in the summary.
average_events = len(df) / num_files if num_files else 0.0

print(f"\n‚úÖ Created {num_files} JSON files in {OUTPUT_STREAM_DIR}")
print(f"   Total events: {len(df)}")
print(f"   Average events per file: {average_events:.1f}")

print("\n" + "="*80)
print("‚úÖ DATA PREPARATION COMPLETE!")
print("="*80)
print(f"\nStreaming source directory: {OUTPUT_STREAM_DIR}")
print("Ready for Spark Structured Streaming with maxFilesPerTrigger=1")
print("\nNext: Run Cell 2 to configure Spark and start streaming queries.")


In [None]:
# Cell 2: Spark Session Configuration

from pyspark import SparkConf
from pyspark.sql import SparkSession

# ---------------------------------------------------------------------------
# Build the SparkConf, report the chosen settings and create the session.
# ---------------------------------------------------------------------------
print("=" * 80, "SPARK SESSION CONFIGURATION", "=" * 80, sep="\n")

# Cluster master, application name and driver/executor sizing.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("NetflixStreamingPipeline")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

print("\n‚öôÔ∏è  Spark Configuration:")
print(
    "   Master: spark://spark-master:7077",
    "   App Name: NetflixStreamingPipeline",
    "   Driver Memory: 2g",
    "   Executor Memory: 2g",
    sep="\n",
)

# Reuse an existing session if one is already running, otherwise start one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

print("\n‚úÖ Spark session created successfully!")
print("   Spark Version: {}".format(spark.version))
print("   Session ID: {}".format(spark.sparkContext.applicationId))

print("\n" + "=" * 80, "Ready to define schemas and load data!", "=" * 80, sep="\n")


In [None]:
# Cell 3: Schema Definition & Load Static Data

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, IntegerType, TimestampType
from pyspark.sql.functions import col, to_timestamp

print("=" * 80, "SCHEMA DEFINITION & STATIC DATA LOADING", "=" * 80, sep="\n")

# Explicit schema for the watch-history events read by the streaming source.
print("\nüìã Defining schema for watch history events...")

# Every column is nullable; the (name, type) pairs are listed in schema order.
dataSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("session_id", StringType()),
        ("user_id", StringType()),
        ("movie_id", StringType()),
        ("watch_date", StringType()),
        ("device_type", StringType()),
        ("watch_duration_minutes", DoubleType()),
        ("progress_percentage", DoubleType()),
        ("action", StringType()),
        ("quality", StringType()),
        ("location_country", StringType()),
        ("is_download", BooleanType()),
        ("user_rating", DoubleType()),
        ("timestamp", StringType()),  # read as text, converted to a timestamp later
    )
])

print("‚úÖ Schema defined with 13 fields")
print("   Key fields: session_id, user_id, movie_id, timestamp, action")

# Static movies catalog, loaded once and joined to the event stream later.
print("\nüì• Loading movies catalog (static data)...")

GCS_BUCKET = "gs://data_netflix_2025/raw"
MOVIES_FILE = GCS_BUCKET + "/movies.csv"

# CSV with a header row; column types are inferred from the data.
movies_static = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(MOVIES_FILE)
)

# count() is an action, so the catalog is read here, before it is cached below.
movies_count = movies_static.count()
print("‚úÖ Loaded {} movies from catalog".format(movies_count))

# Keep only the columns used for enrichment.
movies_static = movies_static.select(*[
    "movie_id",
    "title",
    "content_type",
    "genre_primary",
    "genre_secondary",
    "release_year",
    "duration_minutes",
    "rating",
    "imdb_rating",
])

print("\nüìä Movies catalog schema:")
movies_static.printSchema()

print("\nüìä Sample movies:")
movies_static.show(5, truncate=False)

# Mark the trimmed catalog for caching so the joins can reuse it.
movies_static.cache()
print("\n‚úÖ Movies catalog cached for streaming joins")

print("\n" + "=" * 80, "Schema and static data ready!", "=" * 80, sep="\n")
print("\nNext: Create streaming queries with windowed aggregations")


In [None]:
# Cell 4: Streaming Query - Trending Content (5-minute Windows)

from pyspark.sql.functions import window, count, avg, sum as spark_sum, desc

# ---------------------------------------------------------------------------
# Streaming source: read the JSON event files, parse the event time and
# enrich every event with the static movie catalog.
# ---------------------------------------------------------------------------
print("="*80)
print("STREAMING QUERY 1: TRENDING CONTENT")
print("="*80)

# Configuration
STREAM_INPUT_DIR = "/home/jovyan/data/netflix_stream"
MAX_FILES_PER_TRIGGER = 1  # one file per micro-batch

print(f"\nüì° Setting up streaming source...")
print(f"   Input directory: {STREAM_INPUT_DIR}")
print(f"   Max files per trigger: {MAX_FILES_PER_TRIGGER}")

# Read streaming data
sdf = spark.readStream \
    .schema(dataSchema) \
    .option("maxFilesPerTrigger", MAX_FILES_PER_TRIGGER) \
    .json(STREAM_INPUT_DIR)

print("‚úÖ Streaming source configured")

# Convert timestamp string to TimestampType.
# In Spark datetime patterns "MM" is month-of-year and "mm" is minute-of-hour;
# the minutes field must use "mm", otherwise the event time does not parse
# correctly and every windowed aggregation downstream is wrong.
sdf = sdf.withColumn("timestamp", to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss"))

print("\nüîó Joining streaming data with movies catalog...")

# Left join keeps events whose movie_id has no match in the catalog.
enriched = sdf.join(movies_static, "movie_id", "left")

print("‚úÖ Stream enriched with movie metadata")

# Trending content: per-movie metrics over 5-minute event-time windows.
print("\nüìä Creating 5-minute windowed aggregation for trending content...")

# Step 1 - bucket the enriched events by window and movie.
_trending_groups = enriched.groupBy(
    window(col("timestamp"), "5 minutes"),
    "movie_id",
    "title",
    "genre_primary",
)

# Step 2 - metrics per bucket.
# NOTE(review): total_sessions counts rows with a non-null user_id, not
# distinct session_id values - confirm this is the intended metric.
_trending_metrics = _trending_groups.agg(
    count("*").alias("view_count"),
    avg("progress_percentage").alias("avg_completion"),
    count(col("user_id")).alias("total_sessions"),
)

# Step 3 - flatten the window struct and rank by views.
trending = _trending_metrics.select(
    col("window.start").alias("window_start"),
    col("window.end").alias("window_end"),
    "movie_id",
    "title",
    "genre_primary",
    "view_count",
    "avg_completion",
    "total_sessions",
).orderBy(col("view_count").desc())

print("‚úÖ Trending content aggregation defined")

# The memory sink exposes the result as the table 'trending_content'.
print("\nüöÄ Starting streaming query (trending_content)...")

query_trending = (
    trending.writeStream
    .format("memory")
    .queryName("trending_content")
    .outputMode("complete")
    .start()
)

print("‚úÖ Streaming query 'trending_content' started successfully!")
print("   Query ID: {}".format(query_trending.id))
print("   Status: {}".format(query_trending.status))

print("\n" + "=" * 80, "Trending content query running!", "=" * 80, sep="\n")
print(
    "\nAnalytics:",
    "  - 5-minute tumbling windows",
    "  - Top movies by view count",
    "  - Average completion percentage",
    "  - Total viewing sessions per movie",
    sep="\n",
)
print("\nQuery results available in 'trending_content' table")


In [None]:
# Cell 5: Streaming Query - User Engagement (10-minute Windows)

from pyspark.sql.functions import countDistinct

# Exact distinct aggregations (countDistinct) are not supported on streaming
# DataFrames, so unique users are estimated with approx_count_distinct.
from pyspark.sql.functions import approx_count_distinct

print("="*80)
print("STREAMING QUERY 2: USER ENGAGEMENT")
print("="*80)

print("\nüìä Creating 10-minute windowed aggregation for user engagement...")

# User engagement per (10-minute window, device type, country):
#   active_users         - approximate number of distinct user_id values
#   total_watch_time     - sum of watch_duration_minutes
#   avg_session_duration - mean of watch_duration_minutes
#   total_sessions       - number of events in the group
engagement = enriched \
    .groupBy(
        window(col("timestamp"), "10 minutes"),
        "device_type",
        "location_country"
    ) \
    .agg(
        approx_count_distinct("user_id").alias("active_users"),
        spark_sum("watch_duration_minutes").alias("total_watch_time"),
        avg("watch_duration_minutes").alias("avg_session_duration"),
        count("*").alias("total_sessions")
    ) \
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "device_type",
        "location_country",
        "active_users",
        "total_watch_time",
        "avg_session_duration",
        "total_sessions"
    ) \
    .orderBy(col("active_users").desc())

print("‚úÖ User engagement aggregation defined")

# The memory sink exposes the result as the table 'user_engagement'.
print("\nüöÄ Starting streaming query (user_engagement)...")

query_engagement = engagement \
    .writeStream \
    .queryName("user_engagement") \
    .format("memory") \
    .outputMode("complete") \
    .start()

print("‚úÖ Streaming query 'user_engagement' started successfully!")
print(f"   Query ID: {query_engagement.id}")
print(f"   Status: {query_engagement.status}")

print("\n" + "="*80)
print("User engagement query running!")
print("="*80)
print("\nAnalytics:")
print("  - 10-minute tumbling windows")
print("  - Active users by device and country")
print("  - Total watch time per segment")
print("  - Average session duration")
print("  - Total viewing sessions")
print("\nQuery results available in 'user_engagement' table")


In [None]:
# Cell 6: Streaming Query - Action Monitoring

# Exact distinct aggregations (countDistinct) are not supported on streaming
# DataFrames, so unique users are estimated with approx_count_distinct.
from pyspark.sql.functions import approx_count_distinct

print("="*80)
print("STREAMING QUERY 3: ACTION MONITORING")
print("="*80)

print("\nüìä Creating 5-minute windowed aggregation for action monitoring...")

# Action monitoring per (5-minute window, action):
#   action_count - number of events with that action
#   unique_users - approximate number of distinct user_id values
#   avg_progress - mean progress_percentage
actions = enriched \
    .groupBy(
        window(col("timestamp"), "5 minutes"),
        "action"
    ) \
    .agg(
        count("*").alias("action_count"),
        approx_count_distinct("user_id").alias("unique_users"),
        avg("progress_percentage").alias("avg_progress")
    ) \
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "action",
        "action_count",
        "unique_users",
        "avg_progress"
    ) \
    .orderBy(col("action_count").desc())

print("‚úÖ Action monitoring aggregation defined")

# The memory sink exposes the result as the table 'action_monitoring'.
print("\nüöÄ Starting streaming query (action_monitoring)...")

query_actions = actions \
    .writeStream \
    .queryName("action_monitoring") \
    .format("memory") \
    .outputMode("complete") \
    .start()

print("‚úÖ Streaming query 'action_monitoring' started successfully!")
print(f"   Query ID: {query_actions.id}")
print(f"   Status: {query_actions.status}")

print("\n" + "="*80)
print("Action monitoring query running!")
print("="*80)
print("\nAnalytics:")
print("  - 5-minute tumbling windows")
print("  - Count of each action type (started, paused, stopped, completed)")
print("  - Unique users performing each action")
print("  - Average progress percentage per action")
print("\nQuery results available in 'action_monitoring' table")

# Additional: Genre Performance Query
# Exact distinct aggregations (countDistinct) are not supported on streaming
# DataFrames, so unique viewers are estimated with approx_count_distinct.
from pyspark.sql.functions import approx_count_distinct

print("\n" + "="*80)
print("STREAMING QUERY 4: GENRE PERFORMANCE")
print("="*80)

print("\nüìä Creating 5-minute windowed aggregation for genre performance...")

# Genre performance per (5-minute window, primary genre):
#   view_count       - number of events
#   unique_viewers   - approximate number of distinct user_id values
#   avg_completion   - mean progress_percentage
#   total_watch_time - sum of watch_duration_minutes
genre_performance = enriched \
    .groupBy(
        window(col("timestamp"), "5 minutes"),
        "genre_primary"
    ) \
    .agg(
        count("*").alias("view_count"),
        approx_count_distinct("user_id").alias("unique_viewers"),
        avg("progress_percentage").alias("avg_completion"),
        spark_sum("watch_duration_minutes").alias("total_watch_time")
    ) \
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "genre_primary",
        "view_count",
        "unique_viewers",
        "avg_completion",
        "total_watch_time"
    ) \
    .orderBy(col("view_count").desc())

print("‚úÖ Genre performance aggregation defined")

# The memory sink exposes the result as the table 'genre_performance'.
print("\nüöÄ Starting streaming query (genre_performance)...")

query_genre = genre_performance \
    .writeStream \
    .queryName("genre_performance") \
    .format("memory") \
    .outputMode("complete") \
    .start()

print("‚úÖ Streaming query 'genre_performance' started successfully!")
print(f"   Query ID: {query_genre.id}")
print(f"   Status: {query_genre.status}")

print("\n" + "="*80)
print("All streaming queries running!")
print("="*80)
print("\nActive Queries:")
print("  1. trending_content (5-min windows)")
print("  2. user_engagement (10-min windows)")
print("  3. action_monitoring (5-min windows)")
print("  4. genre_performance (5-min windows)")
print("\nNext: Run Cell 7 to display live results")


In [None]:
# Cell 7: Display Real-Time Results

from time import sleep
from datetime import datetime

# ---------------------------------------------------------------------------
# Live dashboard: periodically query the four in-memory result tables and
# print them together with the status of each streaming query.
# ---------------------------------------------------------------------------

# Dashboard pacing: NUM_UPDATES refreshes, REFRESH_SECONDS apart
# (about 8 minutes in total with the defaults).
NUM_UPDATES = 50
REFRESH_SECONDS = 10

# Display name -> running query handle, in the order they are reported.
DASHBOARD_QUERIES = {
    "trending_content": query_trending,
    "user_engagement": query_engagement,
    "action_monitoring": query_actions,
    "genre_performance": query_genre,
}


def _stop_dashboard_queries(announce):
    """Stop every dashboard query and return the names that failed to stop.

    Each stop() is attempted independently so that one failure cannot leave
    the remaining queries running.  When *announce* is true, a line is
    printed for every query that stopped cleanly; failures are always printed.
    """
    failed = []
    for name, query in DASHBOARD_QUERIES.items():
        try:
            query.stop()
        except Exception as stop_error:
            failed.append(name)
            print(f"   Could not stop {name}: {stop_error}")
        else:
            if announce:
                print(f"‚úÖ Stopped: {name}")
    return failed


print("="*80)
print("REAL-TIME STREAMING DASHBOARD")
print("="*80)
print("\nStarting live display loop...")
print("Press Ctrl+C (or interrupt kernel) to stop")
print(f"\nUpdates every {REFRESH_SECONDS} seconds")
print("="*80)

try:
    for i in range(NUM_UPDATES):
        print("\n\n")
        print("‚ñà" * 80)
        print(f"UPDATE #{i+1} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("‚ñà" * 80)

        # 1. Trending Content
        print("\nüìä TOP 10 TRENDING CONTENT (Last 5 minutes):")
        print("-" * 80)
        trending_df = spark.sql("""
            SELECT
                window_start,
                window_end,
                movie_id,
                title,
                genre_primary,
                view_count,
                ROUND(avg_completion, 2) as avg_completion_pct,
                total_sessions
            FROM trending_content
            ORDER BY view_count DESC
            LIMIT 10
        """)
        trending_df.show(10, truncate=False)

        # 2. User Engagement
        print("\nüë• USER ENGAGEMENT BY DEVICE & COUNTRY (Last 10 minutes):")
        print("-" * 80)
        engagement_df = spark.sql("""
            SELECT
                window_start,
                window_end,
                device_type,
                location_country,
                active_users,
                ROUND(total_watch_time, 2) as total_watch_time_min,
                ROUND(avg_session_duration, 2) as avg_session_min,
                total_sessions
            FROM user_engagement
            ORDER BY active_users DESC
            LIMIT 10
        """)
        engagement_df.show(10, truncate=False)

        # 3. Action Monitoring (no LIMIT; show() prints its default row count)
        print("\nüì± ACTION MONITORING (Last 5 minutes):")
        print("-" * 80)
        actions_df = spark.sql("""
            SELECT
                window_start,
                window_end,
                action,
                action_count,
                unique_users,
                ROUND(avg_progress, 2) as avg_progress_pct
            FROM action_monitoring
            ORDER BY action_count DESC
        """)
        actions_df.show(truncate=False)

        # 4. Genre Performance
        print("\nüé¨ TOP GENRES (Last 5 minutes):")
        print("-" * 80)
        genre_df = spark.sql("""
            SELECT
                window_start,
                window_end,
                genre_primary,
                view_count,
                unique_viewers,
                ROUND(avg_completion, 2) as avg_completion_pct,
                ROUND(total_watch_time, 2) as total_watch_time_min
            FROM genre_performance
            ORDER BY view_count DESC
            LIMIT 10
        """)
        genre_df.show(10, truncate=False)

        # Query Status
        print("\nüîÑ STREAMING QUERY STATUS:")
        print("-" * 80)
        for name, query in DASHBOARD_QUERIES.items():
            # Read recentProgress once so the emptiness check and the lookup
            # operate on the same snapshot.
            progress = query.recentProgress
            rows = progress[-1]['numInputRows'] if progress else 0
            print(f"  {name + ':':<20}{query.status['isDataAvailable']} | Recent progress: {rows} rows")

        print("\n" + "=" * 80)
        print(f"Next update in {REFRESH_SECONDS} seconds... (Iteration {i+1}/{NUM_UPDATES})")
        print("=" * 80)

        sleep(REFRESH_SECONDS)

except KeyboardInterrupt:
    print("\n\n" + "üõë" * 40)
    print("STOPPING STREAMING PIPELINE")
    print("üõë" * 40)
    print("\nStopping all queries gracefully...")

    if not _stop_dashboard_queries(announce=True):
        print("\n‚úÖ All streaming queries stopped successfully!")
    print("\nNote: Spark session is still active. Run the next cell to stop it completely.")

except Exception as e:
    print(f"\n‚ùå Error occurred: {str(e)}")
    print("\nStopping queries...")

    # Best-effort shutdown: failures are reported by the helper, not swallowed.
    _stop_dashboard_queries(announce=False)

    print("Queries stopped due to error")

print("\n" + "="*80)
print("Display loop completed or interrupted")
print("="*80)


In [None]:
# Cell 8: Stop Spark Session (Run after stopping queries)

# ---------------------------------------------------------------------------
# Cleanup: shut down the Spark session and print the closing summary.
# ---------------------------------------------------------------------------
print("=" * 80, "CLEANUP: STOPPING SPARK SESSION", "=" * 80, sep="\n")

# Stops the session created in the Spark setup cell.
spark.stop()

print("\n‚úÖ Spark session stopped successfully!")
print("\n" + "=" * 80, "Netflix Streaming Pipeline Complete!", "=" * 80, sep="\n")
print("\nSummary:")
print(
    *(
        "  - " + stage + ": ‚úÖ"
        for stage in (
            "Data preparation",
            "Streaming queries",
            "Real-time analytics",
            "Cleanup",
        )
    ),
    sep="\n",
)
print("\nThank you for using the Netflix Streaming Pipeline!")
