In [0]:
from datetime import datetime , timedelta
from pyspark.sql.functions import col, lit, input_file_name, current_timestamp,regexp_extract, to_date
def _parse_widget_date(widget_name, raw_value, fmt):
    """Parse a widget's text value into a ``datetime.date``.

    Args:
        widget_name: Name of the widget; used only to build the error message.
        raw_value: Text read from the widget.
        fmt: ``strptime`` format the text must match.

    Returns:
        The parsed ``datetime.date``.

    Raises:
        ValueError: If ``raw_value`` does not match ``fmt``. The message names
            the widget so a failed job run points at the bad parameter.
    """
    try:
        return datetime.strptime(raw_value, fmt).date()
    except ValueError as exc:
        raise ValueError(
            f"CRITICAL CONFIG ERROR: Widget '{widget_name}' has value {raw_value!r}, "
            f"which does not match the expected format {fmt}."
        ) from exc


# Notebook parameters. Both dates default to today's date.
dbutils.widgets.text("start_date", datetime.now().strftime("%Y-%m-%d"), "Start date")
dbutils.widgets.text("end_date", datetime.now().strftime("%Y-%m-%d"), "End date")

dbutils.widgets.text("mode", "INCREMENTAL", "mode")

# Strip stray whitespace so a padded value such as "2025-01-01 " still parses.
start_date_str = dbutils.widgets.get("start_date").strip()
end_date_str = dbutils.widgets.get("end_date").strip()

# The other cells compare `mode` to the exact string "INCREMENTAL" and treat
# every other value as a full reload. Normalising case and whitespace here
# keeps "incremental" or "INCREMENTAL " from silently selecting that branch.
mode = dbutils.widgets.get("mode").strip().upper()

date_format = "%Y-%m-%d"

start_date = _parse_widget_date("start_date", start_date_str, date_format)
end_date = _parse_widget_date("end_date", end_date_str, date_format)

# An inverted range is rejected up front rather than producing an empty path list.
if start_date > end_date:
    raise ValueError(f"CRITICAL CONFIG ERROR: Start Date ({start_date}) is after End Date ({end_date}). Please check your parameters.")


In [0]:


# AWS credentials read from the "ticker" secret scope.
# Note that the session token is stored under the key "session_key".
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    dbutils.secrets.get(scope="ticker", key=secret_name)
    for secret_name in ("access_key", "secret_key", "session_key")
)

# Values published by the "Init_Auth" task. `debugValue` is the fallback passed
# for each key.
# NOTE(review): none of the temp_* values are referenced by the cells visible
# in this section of the notebook - confirm whether they are still needed.
temp_ak, temp_sk, temp_token = (
    dbutils.jobs.taskValues.get(taskKey="Init_Auth", key=value_key, debugValue=fallback)
    for value_key, fallback in (
        ("temp_ak", "debug-key"),
        ("temp_sk", "debug-secret"),
        ("temp_token", "debug-token"),
    )
)



Schema Strategy: The "Wide Net" Policy
We replaced the strict StructType enforcement with a Permissive Read strategy to satisfy the "Bronze Policy". Hardcoding the schema at this stage is dangerous because a single unexpected data type (e.g., a string in an integer column) would cause the entire batch to fail or silently drop data. By enabling mode="PERMISSIVE" and capturing malformed records in _rescued_data, we ensure the pipeline never crashes on schema drift, allowing us to debug issues later without losing raw data.

In [0]:
# Schema of the master definition table. It is loaded here but not applied to
# the read below.
master_schema = spark.table("company_financials_master_def").schema

# Landing-zone root; the wildcard covers every ticker folder.
base_path = "s3a://mzon-to-databricks-5482/landing/source=fmp/ticker=*"

if mode != "INCREMENTAL":
    # Disaster Recovery: Full Reload
    s3_path = [f"{base_path}/**/**"]
else:
    # One path per calendar day. range(delta + 1) makes the list inclusive of
    # both start_date and end_date, and yields a single path when they are equal.
    delta = (end_date - start_date).days
    date_list = [start_date + timedelta(days=offset) for offset in range(delta + 1)]
    s3_path = [f"{base_path}/date={day}" for day in date_list]
print(s3_path)

# Reader options: `wholetext` loads each file as a single row, and the
# fs.s3a.* entries supply the temporary AWS credentials for this read.
s3a_read_options = {
    "wholetext": "true",
    "fs.s3a.access.key": ACCESS_KEY,
    "fs.s3a.secret.key": SECRET_KEY,
    "fs.s3a.session.token": SESSION_TOKEN,
    "fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider",
}

df = spark.read.format("text").options(**s3a_read_options).load(s3_path)

df.printSchema()
df.show()

Metadata Extraction: Enabling Partitioning
We introduced a transformation step to derive the date column directly from the source file path (using Regex). Since the raw JSON data inside the files does not guarantee a clean date field, relying on the folder structure (.../date=2025-12-21/...) is the only way to accurately map files to their correct partitions. This step is a prerequisite for "Protocol A," as the writer needs this column to know exactly where to place the data in the Delta table.

In [0]:
# Path of the file each row was read from; every derived column below is
# extracted from it.
source_path_col = col("_metadata.file_path")

df_final = (
    df
    # Time at which this notebook processed the row.
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", source_path_col)
    # Partition values are taken from the `date=`, `ticker=` and `statement=`
    # path segments.
    .withColumn(
        "date",
        to_date(regexp_extract(source_path_col, "date=(\\d{4}-\\d{2}-\\d{2})", 1), "yyyy-MM-dd"),
    )
    .withColumn("symbol", regexp_extract(source_path_col, "ticker=([^/]+)", 1))
    .withColumn("statement_type", regexp_extract(source_path_col, "statement=([^/]+)", 1))
)
df_final.printSchema()
df_final.show()

Write Strategy: Enforcing "Protocol A" (Idempotency)
We upgraded the write mode from a simple append to the "Surgical Backfill" standard. Using append is risky because re-running a job (e.g., after a fix) creates duplicate records. We now use `.mode("overwrite")` together with the `replaceWhere` option. This instructs Delta Lake to surgically delete and replace only the data for the specific dates being processed, guaranteeing that we can run the pipeline multiple times safely without creating duplicates.

In [0]:

destination_path = "s3a://mzon-to-databricks-5482/bronze/source=fmp/"

# Options shared by both write modes: schema merging plus the temporary AWS
# credentials for the destination bucket.
s3a_write_options = {
    "mergeSchema": "true",
    "fs.s3a.access.key": ACCESS_KEY,
    "fs.s3a.secret.key": SECRET_KEY,
    "fs.s3a.session.token": SESSION_TOKEN,
    "fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider",
}

write_writer = df_final.write.format("delta").options(**s3a_write_options)

# Both modes overwrite; they differ only in one extra writer option.
if mode == "INCREMENTAL":
    # PROTOCOL A: SURGICAL BACKFILL
    # replaceWhere limits the overwrite to the processed date range, so
    # re-running the same range replaces those days instead of duplicating them.
    condition = f"date >= '{start_date}' AND date <= '{end_date}'"
    overwrite_option = ("replaceWhere", condition)
else:
    # FULL RELOAD: replace the whole table, including its schema.
    overwrite_option = ("overwriteSchema", "true")

(write_writer
 .mode("overwrite")
 .option(*overwrite_option)
 .partitionBy("date", "symbol")  # the date partition is what replaceWhere filters on
 .save(destination_path))