In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col, count, when, isnan, isnull

# -------------------------------------------------------------------
# 1. Configuration
# -------------------------------------------------------------------
# Google Cloud project used to build the fully-qualified BigQuery table names.
project_id = "de2025-471807"
# Dataset for raw data
bq_dataset_raw = "netflix"
# Dataset for cleaned/processed data
bq_dataset_processed = "netflix_processed"
# Cloud Storage bucket handed to Spark as `temporaryGcsBucket` below.
temp_bucket = "netflix-group5-temp"
# Optional: also save to local CSV
processed_path = "/home/jovyan/data/processed/"

# -------------------------------------------------------------------
# 2. Spark session setup with BigQuery
# -------------------------------------------------------------------
# Every SparkConf setter returns the conf itself, so the settings can be
# chained; they are applied in the same order as before.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("DataQualityCheck")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Use the Cloud Storage bucket for temporary BigQuery export data
spark.conf.set('temporaryGcsBucket', temp_bucket)

# Register the Google Cloud Storage filesystem classes so that gs:// paths
# resolve through the Hadoop filesystem layer.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, hadoop_value)

print("‚úÖ Spark session created with BigQuery support")


PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:
# -------------------------------------------------------------------
# 3. Load all tables from BigQuery
# -------------------------------------------------------------------
# Notebook-side key -> table name inside the raw BigQuery dataset.
tables = dict(
    users="Users",
    movies="Movies",
    watch_history="WatchHistory",
    recommendation_logs="RecommendationLogs",
    reviews="Reviews",
    search_logs="SearchLogs",
)

# One Spark DataFrame per table, keyed by the notebook-side name.
dataframes = {}
for name, table_name in tables.items():
    table_ref = f"{project_id}.{bq_dataset_raw}.{table_name}"
    df = spark.read.format("bigquery").load(table_ref)
    dataframes[name] = df
    # count() is a Spark action, so this is where the table is actually scanned.
    row_total = df.count()
    column_total = len(df.columns)
    print(f"‚úÖ Loaded {name}: {row_total} rows, {column_total} columns")


NameError: name 'spark' is not defined

In [None]:
# -------------------------------------------------------------------
# 4. Check missing values and duplicates
# -------------------------------------------------------------------
from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType, DecimalType

def check_data_quality(df, name):
    """Print a data-quality report for one DataFrame and return its figures.

    The report shows the row and column totals, every column that contains
    missing values (null, plus NaN for float/double columns) and the number
    of fully duplicated rows.

    Args:
        df: Spark DataFrame to inspect.
        name: Label printed in the report heading.

    Returns:
        tuple: ``(missing_counts, duplicate_count)``. ``missing_counts`` maps
        column name -> number of missing values and only contains columns
        with at least one missing value; ``duplicate_count`` is the number of
        rows that ``dropDuplicates()`` would remove.
    """
    print(f"\nüìä Data Quality Report: {name}")
    total_rows = df.count()
    print(f"   Total rows: {total_rows}")
    print(f"   Total columns: {len(df.columns)}")

    # Missing values per column, computed for every column in a single
    # aggregation (one Spark job) instead of one filter/count job per column.
    missing_counts = {}
    if df.columns:
        missing_exprs = []
        for col_name, col_type in df.dtypes:
            col_expr = col(col_name)
            is_missing = col_expr.isNull()
            # Only floating-point columns can hold NaN; integer values never
            # are NaN, so the extra test is skipped for them.
            if col_type in ('double', 'float'):
                is_missing = is_missing | isnan(col_expr)
            # when() without otherwise() yields null for rows that do not
            # match, and count() ignores nulls -> number of missing rows.
            missing_exprs.append(count(when(is_missing, True)))

        # A Row is a tuple, so pair the results with the columns by position.
        missing_totals = df.agg(*missing_exprs).first()
        missing_counts = {
            col_name: missing
            for col_name, missing in zip(df.columns, missing_totals)
            if missing > 0
        }

    if missing_counts:
        print(f"   ‚ö†Ô∏è  Missing values found:")
        for col_name, missing in missing_counts.items():
            # total_rows > 0 here: a column can only have missing values if rows exist.
            pct = (missing / total_rows) * 100
            print(f"      - {col_name}: {missing} ({pct:.1f}%)")
    else:
        print(f"   ‚úÖ No missing values")

    # Duplicates
    duplicate_count = total_rows - df.dropDuplicates().count()
    if duplicate_count > 0:
        pct = (duplicate_count / total_rows) * 100
        print(f"   ‚ö†Ô∏è  Duplicates: {duplicate_count} rows ({pct:.1f}%)")
    else:
        print(f"   ‚úÖ No duplicates")

    return missing_counts, duplicate_count

# Run the quality report on every loaded table and keep the figures,
# keyed by table name, as {"missing": {...}, "duplicates": <int>}.
quality_reports = {}
for name, df in dataframes.items():
    missing, duplicates = check_data_quality(df, name)
    quality_reports[name] = dict(missing=missing, duplicates=duplicates)


In [None]:
# -------------------------------------------------------------------
# 5. Clean data: Remove missing values, empty columns, and duplicates
# -------------------------------------------------------------------
def clean_dataframe(df, critical_columns=None):
    """
    Clean a Spark DataFrame and return the cleaned copy.

    Steps, in order:
    1. Drop columns that are entirely null.
    2. Drop rows with a missing value (null, or NaN in float/double columns)
       in any of ``critical_columns``. When ``critical_columns`` is None or
       empty, rows with a missing value in ANY column are dropped.
    3. Drop duplicate rows.

    Args:
        df: Spark DataFrame to clean; it is not modified.
        critical_columns: Optional list of column names that must be present.
            Names that do not exist in the DataFrame (or that were removed in
            step 1) are ignored.

    Returns:
        A new Spark DataFrame with the steps above applied. An empty input
        keeps its full schema: with no rows there is no evidence that a
        column is "entirely null", so no column is removed.
    """
    # Remove columns that are entirely null
    total_rows = df.count()
    if total_rows == 0 or not df.columns:
        # Nothing to measure; dropping every column here would leave a
        # DataFrame without a schema.
        columns_to_keep = list(df.columns)
    else:
        # count(column) counts non-null values only, so one aggregation tells
        # us for every column whether it holds at least one value.
        non_null_totals = df.agg(*[count(col(c)) for c in df.columns]).first()
        columns_to_keep = [
            col_name
            for col_name, non_null in zip(df.columns, non_null_totals)
            if non_null > 0
        ]

    df_clean = df.select(columns_to_keep)

    # Remove rows with missing values
    if critical_columns:
        # Only the critical columns that survived step 1 can be checked.
        present_critical = [c for c in critical_columns if c in df_clean.columns]
        if present_critical:
            # dropna treats both null and (for float/double columns) NaN as
            # missing, which matches the check used in the quality report.
            df_clean = df_clean.dropna(subset=present_critical)
    else:
        # Remove rows with any missing values (original behavior)
        df_clean = df_clean.dropna()

    # Remove duplicate rows
    return df_clean.dropDuplicates()

# Columns that must not be missing, per table. Rows lacking any of them are
# removed by clean_dataframe().
critical_columns_map = dict(
    users=["user_id", "email"],  # User must have ID and email
    movies=["movie_id", "title"],  # Movie must have ID and title
    watch_history=["session_id", "user_id", "movie_id"],  # Watch session must have these
    recommendation_logs=["user_id", "movie_id"],  # Recommendation must have user and movie
    reviews=["user_id", "movie_id"],  # Review must have user and movie
    search_logs=["user_id"],  # Search must have user
)

# Cleaned DataFrame per table, keyed like `dataframes`.
cleaned_dataframes = {}
for name, df in dataframes.items():
    # Size before cleaning.
    original_count, original_cols = df.count(), len(df.columns)

    # Tables without an entry fall back to "no missing value in any column".
    critical_cols = critical_columns_map.get(name)
    df_clean = clean_dataframe(df, critical_columns=critical_cols)

    # Size after cleaning.
    cleaned_count, cleaned_cols = df_clean.count(), len(df_clean.columns)
    cleaned_dataframes[name] = df_clean

    removed_rows = original_count - cleaned_count
    removed_cols = original_cols - cleaned_cols
    print(f"‚úÖ {name}: {original_count} ‚Üí {cleaned_count} rows, {original_cols} ‚Üí {cleaned_cols} cols (removed {removed_rows} rows, {removed_cols} cols)")


In [None]:
# -------------------------------------------------------------------
# 6. Save cleaned data to BigQuery
# -------------------------------------------------------------------
print("\nüì§ Writing cleaned data to BigQuery...")

# Notebook-side key -> destination table name in the processed dataset
# (use same names as raw, or add suffix).
table_name_map = dict(
    users="Users",
    movies="Movies",
    watch_history="WatchHistory",
    recommendation_logs="RecommendationLogs",
    reviews="Reviews",
    search_logs="SearchLogs",
)

for name, df_clean in cleaned_dataframes.items():
    # Unmapped keys fall back to the capitalized key.
    table_name = table_name_map.get(name, name.capitalize())
    bq_table = ".".join((project_id, bq_dataset_processed, table_name))

    print(f"   Writing {name} to {bq_table}...")
    # "overwrite" replaces whatever the destination table currently holds.
    writer = df_clean.write.format('bigquery').option('table', bq_table).mode("overwrite")
    writer.save()
    # count() is a separate Spark action that runs after the save.
    print(f"   ‚úÖ {name} written successfully ({df_clean.count()} rows)")

print(f"\n‚úÖ All cleaned data written to BigQuery dataset: {bq_dataset_processed}")
print("\nüéâ Data quality check and cleaning completed!")
