# In [None]:
# batch_pipeline_netflix.ipynb

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, when

# 1. Start Spark session
# Configure the builder one setting at a time (application name, master URL,
# driver and executor memory), then obtain the session.
session_builder = SparkSession.builder.appName("NetflixBatchPipeline")
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.driver.memory", "2g")
session_builder = session_builder.config("spark.executor.memory", "2g")
spark = session_builder.getOrCreate()

# 2. Read data from GCS or local volume (adjust path)
# To read from the bucket instead of the local volume, switch to:
# netflix_path = "gs://de25-group5-raw/netflix/"
netflix_path = "/home/jovyan/work/data/"  # local directory, not a bucket URI

# A single header-aware reader is shared by the three loads. No schema is
# supplied and schema inference is not enabled on the reader.
csv_reader = spark.read.option("header", True)
users_df = csv_reader.csv(netflix_path + "users.csv")
movies_df = csv_reader.csv(netflix_path + "movies.csv")
watch_df = csv_reader.csv(netflix_path + "watch_history.csv")

print("Loaded datasets:")
# Each count() is a Spark action; they run in the order they are printed.
user_rows = users_df.count()
movie_rows = movies_df.count()
watch_rows = watch_df.count()
print(f"Users: {user_rows}  Movies: {movie_rows}  Watch: {watch_rows}")

# 3. Join watch history with users and movies
# Watch history is the left side of both joins, so every watch row is kept
# even when no matching user or movie exists.
# NOTE(review): no cleaning is performed in this step - add it here if the
# raw data needs it.
watch_with_users = watch_df.join(users_df, on="user_id", how="left")
joined_df = watch_with_users.join(movies_df, on="movie_id", how="left")

# 4. Simple aggregation example
# Number of sessions and mean progress per (country, subscription_plan).
# The CSV inputs are read without a schema, so progress_percentage arrives as
# a string; cast it to double explicitly instead of relying on avg() to
# coerce the string implicitly.
# NOTE(review): assumes "country" and "subscription_plan" each resolve to a
# single column after the joins - confirm against the CSV headers.
agg_df = (
    joined_df.groupBy("country", "subscription_plan")
             .agg(
                 count("*").alias("total_sessions"),
                 avg(col("progress_percentage").cast("double")).alias("avg_progress"),
             )
)

# Preview the first 10 aggregated rows without truncating column values.
agg_df.show(10, truncate=False)

# 5. Write result to BigQuery
# (you'll adjust project and dataset names)
# The target table is replaced on every run (mode "overwrite"); the write
# goes through the configured temporary GCS bucket.
try:
    (
        agg_df.write.format("bigquery")
        .option("table", "de25_group5.batch_country_progress")
        .option("temporaryGcsBucket", "de25-group5-temp")
        .mode("overwrite")
        .save()
    )
finally:
    # Always stop the session, even when the write raises, so a failed run
    # does not leave the Spark application running.
    spark.stop()