In [None]:
# Cell 1: Import Libraries & Configuration
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import time

# ------------------------------------------
# CONFIGURATION
# Notebook-wide settings; the cells below read these names directly.
# ------------------------------------------
project_id = "agile-producer-471907-s7"
bq_dataset = "netflix"
# Temp bucket for BigQuery (bare bucket name, no gs:// prefix)
temp_bucket = "temp_netflix_2025"
# GCS location of the input files (full gs:// URI ending in the /raw folder)
gcs_bucket = "gs://data_netflix_2025/raw"

# Echo the active settings so the cell output records what was used.
print(
    "✅ Configuration loaded",
    f"   Project: {project_id}",
    f"   Dataset: {bq_dataset}",
    f"   GCS Bucket: {gcs_bucket}",
    sep="\n",
)

In [None]:
# Cell 2: Initialize Spark Session
# Build the session (or reuse one that already exists) against the master at
# spark://spark-master:7077, with 2g of memory and one core for both the
# driver and each executor.
spark = (
    SparkSession.builder
    .appName("Netflix_Batch_Pipeline")
    .master("spark://spark-master:7077")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .config("spark.driver.cores", "1")
    .getOrCreate()
)

# Temp bucket for BigQuery, taken from the configuration cell.
spark.conf.set("temporaryGcsBucket", temp_bucket)

# Register the GCS connector classes on the Hadoop configuration so that
# gs:// paths can be resolved.
# NOTE(review): this goes through the private `_jsc` handle of the SparkContext.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Status banner; the UI host is a placeholder the reader fills in by hand.
print(
    "✅ Spark session created",
    "   Spark UI: http://<your-vm-ip>:4040",
    "   Master: spark://spark-master:7077",
    sep="\n",
)

In [None]:
# Cell 3: Test GCS Connection
# Verify GCS access by reading a small sample of users.csv from the raw folder.
# NOTE(review): check_output is not used anywhere in this cell; the import is
# kept only in case a later cell relies on it — confirm and remove if unused.
from subprocess import check_output

try:
    # Read the CSV with a header row and keep at most 5 rows as a sample.
    test_df = spark.read.option("header", True).csv(f"{gcs_bucket}/users.csv").limit(5)
    # Run the count *before* announcing success: if this action fails, only the
    # failure message is printed instead of "Successful" followed by "Failed".
    sample_count = test_df.count()
    print("✅ GCS Connection Successful!")
    print(f"   Read {sample_count} sample rows from users.csv")
    test_df.show()
except Exception as e:
    # Deliberately broad: this cell is a smoke test and should report any
    # failure rather than abort the notebook run.
    print("❌ GCS Connection Failed!")
    print(f"   Error: {e}")