# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count

# -------------------------------------------------------------------
# 1. Spark session (no GCS, no BigQuery)
# -------------------------------------------------------------------
# Configure the builder one step at a time: application name, the
# spark:// master URL, then 2g of memory for the driver and for each executor.
_session_builder = SparkSession.builder.appName("NetflixBatchPipelineLocal")
_session_builder = _session_builder.master("spark://spark-master:7077")
_session_builder = _session_builder.config("spark.driver.memory", "2g")
_session_builder = _session_builder.config("spark.executor.memory", "2g")

# Reuse an already-active session if there is one; otherwise start a new one.
spark = _session_builder.getOrCreate()

# -------------------------------------------------------------------
# 2. Read Netflix data from local Docker-mounted /data folder
# -------------------------------------------------------------------
# NOTE(review): this comment previously said /home/jovyan/work/data/ maps to
# the host ~/data, but the code reads from /home/jovyan/data/ -- confirm which
# directory is actually mounted inside the Jupyter container.
netflix_path = "/home/jovyan/data/"

# Each CSV is loaded with its first row treated as the header. No schema is
# supplied and inferSchema is not enabled, so every column is read as a string.
users_df = spark.read.csv(netflix_path + "users.csv", header=True)
movies_df = spark.read.csv(netflix_path + "movies.csv", header=True)
watch_df = spark.read.csv(netflix_path + "watch_history.csv", header=True)

# Quick sanity check on the inputs; every count() triggers its own Spark job.
_n_users = users_df.count()
_n_movies = movies_df.count()
_n_watch = watch_df.count()
print(f"Users: {_n_users}  Movies: {_n_movies}  Watch: {_n_watch}")

# -------------------------------------------------------------------
# 3. Example transformation: join and aggregate
# -------------------------------------------------------------------
# Start from the watch history and attach user and movie attributes.
# Left joins keep every watch row even when no matching user/movie exists.
_watch_with_users = watch_df.join(users_df, on="user_id", how="left")
joined_df = _watch_with_users.join(movies_df, on="movie_id", how="left")

# Aggregates computed per (country, subscription_plan) group:
#   total_sessions - number of joined rows in the group
#   avg_progress   - mean of progress_percentage
_total_sessions = count("*").alias("total_sessions")
_avg_progress = avg(col("progress_percentage")).alias("avg_progress")

_by_country_and_plan = joined_df.groupBy("country", "subscription_plan")
agg_df = _by_country_and_plan.agg(_total_sessions, _avg_progress)

# Preview the first 10 result rows without truncating long cell values.
agg_df.show(10, truncate=False)

# -------------------------------------------------------------------
# 4. Optional: write results locally to CSV (for debugging)
# -------------------------------------------------------------------
output_path = "/home/jovyan/data/output_netflix_summary"

# coalesce(1) collapses the result to a single partition so only one part
# file is produced; mode("overwrite") replaces any previous output at the path.
# NOTE(review): the session targets spark://spark-master:7077, so the write is
# presumably carried out by an executor -- confirm that output_path is on a
# volume shared between the Jupyter container and the Spark workers.
agg_df.coalesce(1).write.mode("overwrite").option("header", True).csv(output_path)

# The original message contained mojibake ("âœ…"): the UTF-8 bytes of U+2705
# (white heavy check mark) decoded as cp1252. The escape keeps the literal
# safe from further re-encoding of this file.
print(f"\u2705 Results written to: {output_path}")