In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as sql_max, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import datetime

# Create the Spark session for this job, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("Raw to Bronze ETL")
    .getOrCreate()
)

# Redshift JDBC connection settings.
# NOTE(review): the URL and credentials below look like placeholders; real
# values should presumably come from a secret store rather than source code.
redshift_jdbc_url = "jdbc:redshift://redshift-cluster-id.end-point.aws-region.redshift.amazonaws.com:port-number/redshift-database-name"
redshift_properties = dict(
    user="redshift_user_name",
    password="redshift-password",
    driver="com.amazon.redshift.jdbc.Driver",
)

# Key under which this job's runs are recorded in gold.etl_tracker.
table_name = "raw_to_bronze"

# Fetch latest ETL timestamp from Redshift
def get_latest_etl_timestamp():
    """Return the most recent ETL run timestamp recorded for ``table_name``.

    Reads ``MAX(timestamp)`` from ``gold.etl_tracker`` over JDBC, restricted
    to rows whose ``table_name`` equals the module-level ``table_name``.

    Returns:
        The ``last_run`` value of the aggregate row, or ``None`` when the
        tracker has no matching rows (the aggregate is NULL) or when the
        lookup fails for any reason. Failures are printed, never raised.
    """
    try:
        query = f"(SELECT MAX(timestamp) AS last_run FROM gold.etl_tracker WHERE table_name = '{table_name}') AS last_etl"
        etl_tracker_df = spark.read.jdbc(url=redshift_jdbc_url, table=query, properties=redshift_properties)

        # Pull the row exactly once: every count()/collect() is a separate
        # Spark action that would re-run the JDBC query against Redshift.
        row = etl_tracker_df.first()
        if row is None:
            return None
        # last_run is NULL (None) when no tracker row matches table_name.
        return row["last_run"]
    except Exception as e:
        # Deliberate best-effort: report the problem and signal "unknown".
        print(f"Error fetching latest ETL timestamp from Redshift: {str(e)}")
        return None

latest_etl_timestamp = get_latest_etl_timestamp()
print(f"✅ Latest ETL Timestamp from Redshift: {latest_etl_timestamp}")

# Define paths
source_path = "dbfs:/FileStore/tables/raw_data1/"
bronze_path = "dbfs:/FileStore/tables/bronze_layer1/"


def _utc_now_naive():
    """Return the current UTC time as a naive (tzinfo-free) datetime."""
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)


def _modified_at_utc(file_info):
    """Return ``file_info.modificationTime`` (epoch milliseconds) as naive UTC."""
    return datetime.datetime.fromtimestamp(
        file_info.modificationTime / 1000.0, tz=datetime.timezone.utc
    ).replace(tzinfo=None)


# Capture the watermark BEFORE listing the source directory. If it were taken
# after the copy, a file modified between the listing and the timestamp would
# be older than the stored watermark and skipped by every later run.
# It is kept as naive UTC, the same form the tracker rows were written in.
current_etl_timestamp = _utc_now_naive()

# List files in Raw Layer
raw_files = dbutils.fs.ls(source_path)

# Filter files based on modification time (for incremental load).
# Modification times are converted to UTC so they are compared on the same
# clock as the tracker timestamp; a local-time conversion would shift the
# cut-off by the driver's UTC offset.
if latest_etl_timestamp is not None:
    new_files = [f for f in raw_files if _modified_at_utc(f) > latest_etl_timestamp]
else:
    new_files = raw_files  # Full load if no previous ETL run found

if new_files:
    print(f"🚀 Processing {len(new_files)} new/modified files for Bronze Layer.")

    # Copy new files from Raw to Bronze and keep track of their paths
    copied_files = []

    for file in new_files:
        target_path = f"{bronze_path}{file.name}"
        dbutils.fs.cp(file.path, target_path)
        copied_files.append(target_path)

    print(f"✅ Current ETL Timestamp: {current_etl_timestamp}")

    # Update ETL Tracker in Redshift
    try:
        df_update = spark.createDataFrame(
            [(table_name, current_etl_timestamp, 'bronze', 'success')],
            ["table_name", "timestamp", "layer", "status"]
        )

        # Write to Redshift
        df_update.write.jdbc(
            url=redshift_jdbc_url,
            table="gold.etl_tracker",
            mode="append",
            properties=redshift_properties
        )

        print(f"✅ ETL Tracker updated in Redshift with timestamp: {current_etl_timestamp}")
    except Exception as e:
        # Best-effort: the copy already succeeded, so only report the failure.
        # The next run will then re-copy these files (watermark not advanced).
        print(f"❌ Error updating ETL tracker in Redshift: {str(e)}")
else:
    print("✅ No new files found. Nothing to copy.")


✅ Latest ETL Timestamp from Redshift: None
🚀 Processing 1 new/modified files for Bronze Layer.
✅ Current ETL Timestamp: 2025-02-27 07:36:39.256743
✅ ETL Tracker updated in Redshift with timestamp: 2025-02-27 07:36:39.256743
