In [0]:
from datetime import datetime , timedelta
from pyspark.sql import Window
from pyspark.sql.functions import col, lit, input_file_name, current_timestamp,regexp_extract, to_date , lag,explode , from_json , nullif
from pyspark.sql.types import ArrayType , StringType , StructField , StructType
# --- Job parameters ----------------------------------------------------------
# Both date widgets default to "today". The default is computed once so the two
# widgets cannot disagree if this cell happens to run across midnight.
date_format = "%Y-%m-%d"
_default_date_str = datetime.now().strftime(date_format)

dbutils.widgets.text("start_date", _default_date_str, "Start date")
dbutils.widgets.text("end_date", _default_date_str, "End date")
dbutils.widgets.text("mode", "INCREMENTAL", "mode")

# Widget values are free text. Surrounding whitespace would make strptime fail,
# so it is trimmed before parsing.
start_date_str = dbutils.widgets.get("start_date").strip()
end_date_str = dbutils.widgets.get("end_date").strip()

# Later cells compare `mode` against the exact string "INCREMENTAL" and treat
# every other value as a full reload / full overwrite. Normalising case and
# whitespace stops a value such as "incremental" from silently selecting the
# full-overwrite path.
mode = dbutils.widgets.get("mode").strip().upper()

start_date = datetime.strptime(start_date_str, date_format).date()
end_date = datetime.strptime(end_date_str, date_format).date()

# Fail fast on an inverted range instead of processing an empty window.
if start_date > end_date:
    raise ValueError(f"CRITICAL CONFIG ERROR: Start Date ({start_date}) is after End Date ({end_date}). Please check your parameters.")


In [0]:





# Long-lived credentials from the "ticker" secret scope, fetched in the order
# access key, secret key, session token.
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    dbutils.secrets.get(scope="ticker", key=secret_name)
    for secret_name in ("access_key", "secret_key", "session_key")
)

# Temporary credentials published by the upstream "Init_Auth" task. The second
# element of each pair is the debugValue passed to taskValues.get.
temp_ak, temp_sk, temp_token = (
    dbutils.jobs.taskValues.get(taskKey="Init_Auth", key=value_key, debugValue=debug_value)
    for value_key, debug_value in (
        ("temp_ak", "debug-key"),
        ("temp_sk", "debug-secret"),
        ("temp_token", "debug-token"),
    )
)




In [0]:
def load_df(path, statement, mode):
    """Read one silver Delta table and optionally restrict it to the run's date range.

    Args:
        path: Base location of the silver tables. The table is read from
            ``{path}/{statement}``.
        statement: Name of the table folder appended to ``path``.
        mode: When equal to ``"INCREMENTAL"`` only rows whose ``date`` lies in
            the inclusive range [``start_date``, ``end_date``] (notebook-level
            variables) are kept. Any other value returns the table unfiltered.

    Returns:
        The loaded (and possibly filtered) DataFrame.

    Side effects:
        Prints the schema and the row count; the count triggers a Spark job.
    """
    table_path = path + f"/{statement}"

    # The temporary S3 credentials are passed as per-read options instead of
    # being set in the session configuration.
    reader = (
        spark.read
        .format("delta")
        .option("fs.s3a.access.key", temp_ak)
        .option("fs.s3a.secret.key", temp_sk)
        .option("fs.s3a.session.token", temp_token)
        .option("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
        # NOTE(review): "mode" / "columnNameOfCorruptRecord" are parser options
        # of the text-based sources (JSON/CSV); presumably the Delta reader
        # ignores them - confirm and drop if so. The value was misspelled
        # ("PERMESSIVE") and is corrected to the documented "PERMISSIVE".
        .option("mode", "PERMISSIVE")
        .option("columnNameOfCorruptRecord", "_rescued_data")
    )

    # Load the table root directly: no wildcards and no per-date path loops.
    df_silver = reader.load(table_path)

    # Restrict to the requested date window (both bounds inclusive).
    if mode == "INCREMENTAL":
        print(f"Filtering for range: {start_date} to {end_date}")
        in_range = (col("date") >= lit(start_date)) & (col("date") <= lit(end_date))
        df_silver = df_silver.filter(in_range)

    # Verification output for the run log.
    df_silver.printSchema()
    print(f"Row Count: {df_silver.count()}")
    return df_silver


In [0]:
# Root folder of the validated silver tables; each statement type is read from
# its own sub-folder underneath it.
base_path = "s3a://mzon-to-databricks-5482/silver/valid"

df_income_statement_silver = load_df(path=base_path, statement="income_statement", mode=mode)
df_balance_sheet_silver = load_df(path=base_path, statement="balance_sheet", mode=mode)
df_cashflow_statement_silver = load_df(path=base_path, statement="cashflow_statement", mode=mode)

In [0]:
def schema_generator(schema):
    """Return ``schema`` extended with a nullable ``_corrupt_record`` string field.

    A new ``StructType`` is built, so the caller's ``schema`` object is left
    untouched (``StructType.add`` would append to it in place). If the field is
    already present it is not added a second time.

    Args:
        schema: The ``StructType`` to extend.

    Returns:
        A tuple ``(schema_modified, json_schema)`` where ``schema_modified`` is
        the extended ``StructType`` and ``json_schema`` is an ``ArrayType``
        whose element type is ``schema_modified``.
    """
    fields = list(schema.fields)
    if "_corrupt_record" not in schema.fieldNames():
        fields.append(StructField("_corrupt_record", StringType(), True))

    schema_modified = StructType(fields)
    json_schema = ArrayType(schema_modified)
    return schema_modified, json_schema

In [0]:

# The target gold layout is taken from the schema of an existing table.
gold_financial_statement_schema = spark.table("gold_financial_statement_schema_holder").schema

# Derive the validation schema (with the corrupt-record field) and its
# array-wrapped counterpart.
(
    gold_financial_statement_schema_modified,
    gold_financial_statement_schema_json,
) = schema_generator(gold_financial_statement_schema)






In [0]:
# Print a sample of the loaded income-statement rows for visual inspection.
df_income_statement_silver.show()

In [0]:

# Print a sample of the loaded balance-sheet rows for visual inspection.
df_balance_sheet_silver.show()

In [0]:
# Print a sample of the loaded cash-flow-statement rows for visual inspection.
df_cashflow_statement_silver.show()

In [0]:
# Combine the three statements into one row per (date, symbol). Inner joins
# keep only the keys that are present in all three inputs.
join_keys = ["date", "symbol"]
df_gold = (
    df_income_statement_silver
    .join(df_balance_sheet_silver, on=join_keys, how="inner")
    .join(df_cashflow_statement_silver, on=join_keys, how="inner")
)

df_gold.printSchema()
df_gold.show()

In [0]:
# Per-symbol window in ascending date order; used to fetch the previous row's
# working capital.
WindowSpec = Window.partitionBy("symbol").orderBy("date")

# Every ratio divides by nullif(x, 0), so a zero denominator yields NULL
# instead of a division error.
df_gold = (
    df_gold
    .withColumn("nopat", col("ebit") - col("incomeTaxExpense"))
    .withColumn("gross_margin", col("grossProfit") / nullif(col("revenue"), lit(0)))
    # FIX: the original added netReceivables twice. Working capital is computed
    # as receivables + inventory - payables.
    # NOTE(review): assumes the balance-sheet frame exposes an `inventory`
    # column - confirm against the silver schema.
    .withColumn("working_capital", col("netReceivables") + col("inventory") - col("accountPayables"))
    # lag() only sees rows present in this DataFrame. The first row of each
    # symbol therefore has a NULL previous value, and so NULL delta_wc and
    # calculated_fcf. With an INCREMENTAL load restricted to a narrow date
    # range this can apply to every row.
    .withColumn("prev_working_capital", lag("working_capital", 1).over(WindowSpec))
    .withColumn("delta_wc", col("working_capital") - col("prev_working_capital"))
    # NOTE(review): capex is taken as-is from capitalExpenditure; if the source
    # reports it as a negative cash outflow, the subtraction in calculated_fcf
    # adds it back - confirm the sign convention.
    .withColumn("capex", col("capitalExpenditure"))
    .withColumn("reinvestment_rate", col("capex") / nullif(col("ebit"), lit(0)))
    .withColumn("depreciation", col("depreciationAndAmortization"))
    # NOTE(review): net_debt is currently identical to totalDebt (no cash is
    # subtracted) - confirm whether this is intended.
    .withColumn("net_debt", col("totalDebt"))
    .withColumn("liquidity_ratio", col("TotalCurrentAssets") / nullif(col("TotalCurrentLiabilities"), lit(0)))
    .withColumn("interest_coverage_ratio", col("ebit") / nullif(col("interestExpense"), lit(0)))
    .withColumn("calculated_fcf", col("nopat") + col("depreciation") - col("delta_wc") - col("capex"))
)

# Keep only the gold output columns, renaming the raw source fields to
# snake_case.
df_gold = df_gold.select(
    "date",
    "symbol",
    "nopat",
    "gross_margin",
    "revenue",
    "ebit",
    "working_capital",
    "delta_wc",
    "capex",
    "reinvestment_rate",
    "depreciation",
    "net_debt",
    "liquidity_ratio",
    "interest_coverage_ratio",
    "calculated_fcf",
    col("weightedAverageShsOutDil").alias("shares_outstanding"),
    col("interestExpense").alias("interest_expense"),
    col("incomeTaxExpense").alias("tax_expense"),
    col("totalDebt").alias("total_debt"),
)
df_gold.show()

In [0]:
from pyspark.sql.functions import col, lit, when, concat_ws,row_number
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

def align_and_validate_strict(df: DataFrame, target_schema: StructType):
    """Project ``df`` onto ``target_schema`` and record which columns fail validation.

    The result contains exactly the fields of ``target_schema``, in schema
    order, plus a string column ``_failed_cols``:

    - A column missing from ``df`` is added as a NULL of the target type.
    - A column fails for a row when its value cannot be ``try_cast`` to the
      target type, or when the field is non-nullable and the value is NULL
      (this includes missing non-nullable columns).
    - A NULL in a nullable field passes, so a missing nullable column is not
      reported.

    Existing columns are selected unchanged; they are not cast to the target
    type. ``_failed_cols`` is the comma-separated list of failing column names
    and is the empty string when the row passes every check.

    Args:
        df: Input DataFrame.
        target_schema: Schema describing the expected columns.

    Returns:
        The aligned DataFrame with the extra ``_failed_cols`` column.
    """
    present = set(df.columns)

    # Existing columns are selected by name; absent ones become typed NULLs.
    projection = [
        field.name
        if field.name in present
        else lit(None).try_cast(field.dataType).alias(field.name)
        for field in target_schema
    ]
    aligned = df.select(*projection)

    def _failure_marker(field):
        # Yields the column name when the row violates this field, else NULL.
        castable = col(field.name).try_cast(field.dataType).isNotNull()
        if field.nullable:
            passes = (col(field.name).isNull()) | (castable)
        else:
            passes = castable
        return when(~passes, lit(field.name)).otherwise(lit(None))

    markers = [_failure_marker(field) for field in target_schema]

    # concat_ws skips NULL markers, so a clean row ends up with "".
    return aligned.withColumn("_failed_cols", concat_ws(",", *markers))




The function `DeltaTable.isDeltaTable(spark, path)` creates a fresh connection to S3 to check for the `_delta_log` folder.

**The Problem**: This new connection does not know about the AWS keys (ACCESS_KEY, etc.) injected into the reader. It attempts an anonymous connection and is rejected by AWS.

**The Constraint**: On a Shared Cluster, setting global keys via `spark.conf.set` is not permitted, which makes the `DeltaTable` utilities effectively unusable here.

In [0]:
def clean_df(df):
    """Keep a single row per (symbol, date) combination.

    Rows are numbered inside each (symbol, date) partition and only row
    number 1 is kept.

    NOTE(review): the window orders by ``date``, which is also a partition
    key, so every row in a partition ties. The ordering therefore does not
    decide which duplicate survives; order by a real tie-breaker (for example
    an ingestion timestamp) if a specific row must win.

    Args:
        df: DataFrame containing at least ``symbol`` and ``date`` columns.

    Returns:
        The de-duplicated DataFrame (the helper ``_rank`` column is removed).
    """
    window_spec = Window.partitionBy("symbol", "date").orderBy(col("date").desc())
    return (
        df.withColumn("_rank", row_number().over(window_spec))
        # row_number() is an integer; compare against an integer literal
        # instead of the string "1" to avoid an implicit cast.
        .filter(col("_rank") == 1)
        .drop("_rank")
    )


In [0]:
def validate_df(df,schema):
    """Split ``df`` into valid and invalid (quarantine) records.

    The frame is first aligned and scored with ``align_and_validate_strict``.
    A row is invalid when ``_failed_cols`` is non-empty or, if the scored frame
    has a ``_corrupt_record`` column, when that column is not NULL. The valid
    set is the exact complement, so no row can appear in both outputs.

    Args:
        df: DataFrame to validate.
        schema: Target ``StructType`` passed to ``align_and_validate_strict``.

    Returns:
        A tuple ``(valid_records, invalid_records)``. ``valid_records`` has the
        ``_failed_cols`` column removed; ``invalid_records`` keeps it.
    """
    df_evaluated = align_and_validate_strict(df, schema)
    # Caching is deliberately not used here: PERSIST TABLE is not supported on
    # serverless compute (SQLSTATE: 0A000).

    is_invalid = col("_failed_cols") != ""
    # Only consult _corrupt_record when the schema actually carries it, so the
    # function also works with a schema that lacks that field.
    if "_corrupt_record" in df_evaluated.columns:
        is_invalid = is_invalid | col("_corrupt_record").isNotNull()

    valid_records = df_evaluated.filter(~is_invalid).drop("_failed_cols")
    invalid_records = df_evaluated.filter(is_invalid)
    return valid_records, invalid_records


In [0]:


# De-duplicate the gold rows, then split them into valid and invalid records
# against the extended gold schema.
df_gold_cleaned = clean_df(df=df_gold)
df_gold_cleaned.show()

df_gold_valid_records, df_gold_invalid_records = validate_df(
    df=df_gold_cleaned,
    schema=gold_financial_statement_schema_modified,
)

# Preview both outputs: valid rows first, then the invalid ones.
df_gold_valid_records.show()
df_gold_invalid_records.show()


### Write Strategy: Credential-Injection Overwrite

We use **Overwrite by Partition** with a `replaceWhere` condition. This makes the write idempotent (re-running the same date range replaces the same partitions instead of appending duplicates) while allowing us to explicitly pass the `fs.s3a.access.key` credentials in the `.write` options, bypassing the need for an Instance Profile.

In [0]:
def write_df(df,label,path,mode):
    """Write ``df`` as a Delta table partitioned by ``date`` to ``{path}/{label}``.

    The write always uses save mode "overwrite". What gets replaced depends on
    ``mode``:

    - ``"INCREMENTAL"``: a ``replaceWhere`` condition limits the overwrite to
      the inclusive [``start_date``, ``end_date``] range (notebook-level
      variables).
    - any other value: the whole table is overwritten and ``overwriteSchema``
      is enabled.

    The temporary S3 credentials (notebook-level variables) are passed as
    writer options in both cases.

    Args:
        df: DataFrame to persist.
        label: Sub-folder appended to ``path``.
        path: Base output location.
        mode: Run mode, see above.
    """
    destination = path + f"/{label}"

    # Credentials have to be supplied to the writer as well as the reader.
    s3_credentials = (
        ("fs.s3a.access.key", temp_ak),
        ("fs.s3a.secret.key", temp_sk),
        ("fs.s3a.session.token", temp_token),
        ("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider"),
    )

    writer = df.write.format("delta").mode("overwrite").partitionBy("date")
    for option_name, option_value in s3_credentials:
        writer = writer.option(option_name, option_value)

    if mode == "INCREMENTAL":
        # Only the date range of the current batch is replaced.
        writer = writer.option("replaceWhere", f"date >= '{start_date}' AND date <= '{end_date}'")
    else:
        writer = writer.option("overwriteSchema", "true")

    writer.save(destination)



In [0]:
# Log the schema and a sample of the valid records before persisting them.
df_gold_valid_records.printSchema()
df_gold_valid_records.show()

# NOTE(review): despite its name, this variable holds the gold output root.
bronze_path_invalid = "s3a://mzon-to-databricks-5482/gold"

# Valid rows follow the run mode chosen through the notebook widget.
write_df(df=df_gold_valid_records, label="valid", path=bronze_path_invalid, mode=mode)




In [0]:
# Log the schema and a sample of the invalid records before persisting them.
df_gold_invalid_records.printSchema()
df_gold_invalid_records.show()

# NOTE(review): despite its name, this variable holds the gold output root.
bronze_path_invalid = "s3a://mzon-to-databricks-5482/gold"

# Invalid rows are always written with "FULL", i.e. write_df takes its
# full-overwrite branch regardless of the run mode.
# NOTE(review): this presumably replaces invalid rows written by earlier runs -
# confirm that this is intended.
write_df(df=df_gold_invalid_records, label="invalid", path=bronze_path_invalid, mode="FULL")

In [0]:
# Share of valid records in this batch. Each count() runs a Spark job, so both
# counts are taken exactly once and reused.
valid_count = df_gold_valid_records.count()
invalid_count = df_gold_invalid_records.count()
total_count = valid_count + invalid_count

if total_count:
    print(valid_count / total_count)
else:
    # An empty batch (e.g. no data in the requested date range) would otherwise
    # raise ZeroDivisionError here.
    print("No records processed; valid-record ratio is undefined.")
 