In [None]:
# batch_pipeline_netflix.ipynb

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count

# -------------------------------------------------------------------
# 1. Spark session with the GCS connector and service-account auth
# -------------------------------------------------------------------
# NOTE(review): only the GCS connector jar is registered here; the BigQuery
# write at the end of this script presumably relies on a BigQuery connector
# that is already available on the cluster classpath — confirm.
_session_settings = {
    # Memory allotted to the driver and to each executor.
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    # GCS connector jar located in Spark's jars directory.
    "spark.jars": "/usr/local/spark/jars/gcs-connector-hadoop3-latest.jar",
    # Authenticate against GCP with the service-account key file below.
    "spark.hadoop.google.cloud.auth.service.account.enable": "true",
    "spark.hadoop.google.cloud.auth.service.account.json.keyfile":
        "/home/jovyan/work/notebooks/keys/service-account.json",
}

_builder = (
    SparkSession.builder
        .appName("NetflixBatchPipeline")
        .master("spark://spark-master:7077")
)
# Apply the settings one by one, in the order they are listed above.
for _option, _setting in _session_settings.items():
    _builder = _builder.config(_option, _setting)

spark = _builder.getOrCreate()

# -------------------------------------------------------------------
# 2. Load the Netflix CSV extracts from the GCS bucket
# -------------------------------------------------------------------
netflix_bucket = "gs://netflix_data_25/"

# Every file carries a header row. No schema inference is requested, so the
# columns keep Spark's default CSV typing.
users_df = spark.read.csv(netflix_bucket + "users.csv", header=True)
movies_df = spark.read.csv(netflix_bucket + "movies.csv", header=True)
watch_df = spark.read.csv(netflix_bucket + "watch_history.csv", header=True)

# Sanity check: each count() is a Spark action, evaluated in this order.
_user_rows = users_df.count()
_movie_rows = movies_df.count()
_watch_rows = watch_df.count()
print(f"Users: {_user_rows}  Movies: {_movie_rows}  Watch: {_watch_rows}")

# -------------------------------------------------------------------
# 3. Join + aggregate (example transformation)
# -------------------------------------------------------------------
# Left joins keep every watch-history row, including rows whose user_id or
# movie_id has no match in the users / movies tables.
# NOTE(review): if user_id / movie_id are not unique in users.csv /
# movies.csv, these joins duplicate watch rows and inflate total_sessions —
# confirm the keys are unique or de-duplicate before joining.
joined_df = (
    watch_df.join(users_df, "user_id", "left")
             .join(movies_df, "movie_id", "left")
)

# One output row per (country, subscription_plan) pair.
agg_df = (
    joined_df.groupBy("country", "subscription_plan")
             .agg(
                 count("*").alias("total_sessions"),
                 # The CSVs are read without schema inference, so this column
                 # is a string. Cast it explicitly instead of depending on
                 # Spark's implicit string-to-double coercion inside avg().
                 avg(col("progress_percentage").cast("double")).alias("avg_progress")
             )
)

# Preview the first 10 aggregated rows without truncating cell contents.
agg_df.show(10, truncate=False)

# -------------------------------------------------------------------
# 4. Write result to BigQuery
# -------------------------------------------------------------------
# spark.stop() sits in `finally` so the session is shut down even when the
# write raises; the exception still propagates to the caller afterwards.
try:
    (
        agg_df.write.format("bigquery")
            .option("table", "de2025-471807.netflix.country_engagement_summary")
            # Temp bucket for the write; it is created once, ahead of time.
            .option("temporaryGcsBucket", "netflix-group5-temp")
            # Replace whatever the destination table currently holds.
            .mode("overwrite")
            .save()
    )
finally:
    spark.stop()
