In [None]:
# TEST CELL 1: Verify Spark and GCS Connection

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Locations used by the test cells: the streaming data prefix and a scratch bucket.
GCS_PATH = "gs://data_netflix_2025/streaming"
GCS_TEMP_PATH = "gs://temp_netflix_2025"

print("Testing Spark connection...")

# Build the configuration in one fluent expression; each setter returns the
# same SparkConf instance, so chaining is equivalent to separate calls.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("StreamingTest")
    .set("spark.driver.memory", "2g")
)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Configure GCS
# NOTE(review): nothing in this cell reads `temporaryGcsBucket`; presumably it
# is consumed by a connector used in a later cell. The value carries a `gs://`
# prefix -- confirm whether the consumer expects a bare bucket name instead.
spark.conf.set("temporaryGcsBucket", GCS_TEMP_PATH)

# Map the `gs://` scheme to the GCS connector classes on the Hadoop
# configuration of the live context.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, hadoop_value)

# Report the session details as a quick smoke test of the connection.
print(f"âœ… Spark session created: {spark.version}")
print(f"âœ… Application ID: {spark.sparkContext.applicationId}")

In [None]:
# TEST CELL 2: Read Streaming Data as Batch (Static)
#
# Loads everything under GCS_PATH with the batch JSON reader, then prints the
# row/column totals, the inferred schema, a five-row preview and a few
# aggregate counts. Relies on `spark` and `GCS_PATH` from the previous cell.

print(f"Reading from: {GCS_PATH}")

# Read all JSON files as a static DataFrame
df_static = spark.read.json(GCS_PATH)

print("\nâœ… Successfully read data!")
row_total = df_static.count()
print(f"   Total rows: {row_total}")
column_total = len(df_static.columns)
print(f"   Columns: {column_total}")

print("\nðŸ“‹ Schema:")
df_static.printSchema()

print("\nðŸ“Š Sample data (first 5 rows):")
preview_columns = ("session_id", "user_id", "movie_id", "timestamp", "action")
df_static.select(*preview_columns).show(n=5, truncate=False)

print("\nðŸ“ˆ Quick stats:")
# One distinct-count job per identifier column, reported in the same order.
for stat_label, id_column in (("users", "user_id"), ("movies", "movie_id")):
    distinct_total = df_static.select(id_column).distinct().count()
    print(f"   Unique {stat_label}: {distinct_total}")

print("   Action distribution:")
# Rows per action value, most frequent first.
action_counts = df_static.groupBy("action").count()
action_counts.orderBy("count", ascending=False).show()