# Synchronization Pipeline: S3 to Databricks Volumes

## Objective
This pipeline mirrors the "Golden" S3 data (the validated Silver and Gold Delta tables) into Databricks Volumes (`/Volumes/...`).

## Architectural Benefit
By syncing data to Unity Catalog Volumes, we decouple interactive querying from AWS key management. Users and dashboards can query the Volume-backed SQL views without requiring injected session tokens, enabling a seamless "serverless-like" experience.

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

# --- 1. CONFIGURATION & AUTH ---
# We need keys ONE LAST TIME to read the source S3 data.

# Credentials stored in the "ticker" secret scope. They are always loaded so
# they can serve as the fallback when no upstream task values are available.
ACCESS_KEY = dbutils.secrets.get(scope="ticker", key="access_key")
SECRET_KEY = dbutils.secrets.get(scope="ticker", key="secret_key")
SESSION_TOKEN = dbutils.secrets.get(scope="ticker", key="session_key")

# Or use TaskValues if running in a job chain: prefer the credentials
# published by the "Init_Auth" task.
try:
    temp_ak = dbutils.jobs.taskValues.get(taskKey="Init_Auth", key="temp_ak")
    temp_sk = dbutils.jobs.taskValues.get(taskKey="Init_Auth", key="temp_sk")
    temp_token = dbutils.jobs.taskValues.get(taskKey="Init_Auth", key="temp_token")
except Exception as err:
    # Fallback for manual run. `except Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt / SystemExit propagate, so cancelling
    # the cell is not mistaken for "no task values".
    print(
        f"Task values unavailable ({type(err).__name__}); "
        "falling back to secret-scope credentials."
    )
    temp_ak, temp_sk, temp_token = ACCESS_KEY, SECRET_KEY, SESSION_TOKEN

# --- 2. DEFINITIONS ---

# Source roots in S3 for the validated silver and gold layers.
S3_BASE_SILVER = "s3a://mzon-to-databricks-5482/silver/valid"
S3_BASE_GOLD = "s3a://mzon-to-databricks-5482/gold/valid"

# Destination root inside the Volume.
VOL_BASE = "/Volumes/workspace/default/storage"

# Silver tables as (display name, folder). The folder name is reused for the
# S3 sub-path, the Volume sub-path and the view-name suffix.
_SILVER_TABLES = (
    ("Silver Income Statement", "income_statement"),
    ("Silver Balance Sheet", "balance_sheet"),
    ("Silver Cash Flow", "cashflow_statement"),
)

# Each entry maps: source S3 path -> target Volume path -> view name.
sync_map = [
    {
        "name": label,
        "s3_path": f"{S3_BASE_SILVER}/{folder}",
        "vol_path": f"{VOL_BASE}/silver/{folder}",
        "view_name": f"silver_{folder}",
    }
    for label, folder in _SILVER_TABLES
]

# The gold table lives directly at the gold root (no nested folder).
sync_map.append(
    {
        "name": "Gold Financial Ratios",
        "s3_path": f"{S3_BASE_GOLD}",
        "vol_path": f"{VOL_BASE}/gold/financial_ratios_batch",
        "view_name": "gold_financial_ratios",
    }
)

# --- 3. EXECUTION ENGINE ---

def sync_and_register(meta):
    """Mirror one S3 Delta table into a Volume, register a view on it, and optimize it.

    Args:
        meta: Mapping with the keys ``name`` (label used in log output),
            ``s3_path`` (source Delta path), ``vol_path`` (target Delta path)
            and ``view_name`` (SQL view to create or replace).

    Returns:
        True when the table was read, written, registered and optimized;
        False when the S3 source could not be read and the entry was skipped.

    Raises:
        Errors raised by the write, CREATE VIEW or OPTIMIZE steps are not
        caught here and propagate to the caller.
    """
    print(f"--- Processing: {meta['name']} ---")

    # A. READ from S3 (With Keys)
    # The temporary credentials are passed as per-read options.
    try:
        df_source = (spark.read
            .format("delta")
            .option("fs.s3a.access.key", temp_ak)
            .option("fs.s3a.secret.key", temp_sk)
            .option("fs.s3a.session.token", temp_token)
            .option("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
            .load(meta['s3_path'])
        )
        # count() is an action, so it runs the read inside this try block.
        print(f"1. Read S3 Success. Rows: {df_source.count()}")
    except Exception as e:
        print(f"SKIP: Could not read S3 path {meta['s3_path']}. Error: {e}")
        # Report the skip to the caller instead of returning None, so the
        # run summary cannot claim success for a table that was never synced.
        return False

    # B. WRITE to Volume (No Keys Needed)
    # We use 'overwrite' to ensure the Volume is an exact mirror of S3
    (df_source.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(meta['vol_path'])
    )
    print(f"2. Wrote to Volume: {meta['vol_path']}")

    # C. CREATE VIEW (The Pointer)
    # We use backticks ` ` for the volume path
    spark.sql(f"""
        CREATE OR REPLACE VIEW {meta['view_name']} 
        AS SELECT * FROM delta.`{meta['vol_path']}`
    """)
    print(f"3. Registered View: {meta['view_name']}")

    # D. OPTIMIZE (Performance)
    spark.sql(f"OPTIMIZE delta.`{meta['vol_path']}`")
    print("4. Optimization Complete")
    print("-" * 30)
    return True

# --- 4. RUN ---

# Collect the names of entries whose S3 read failed so the final message
# reflects what actually happened.
skipped = []
for item in sync_map:
    if not sync_and_register(item):
        skipped.append(item["name"])

if skipped:
    print(
        f"\nWARNING: {len(skipped)} of {len(sync_map)} tables were skipped: "
        f"{', '.join(skipped)}"
    )
else:
    print("\nSUCCESS: All S3 tables synced to Volume and Views created.")