### Imports

In [None]:
import os
import sys
import json
from datetime import datetime
import pyspark.sql.functions as fun

In [None]:
sys.path.append(os.path.abspath('..'))  # adds the parent folder to sys.path
from src.utils import dataframe_utils
from src.utils.path_utils import find_project_root,Path

### Configurations and Parameters

In [None]:
# Notebook parameters and their default values.
# NOTE(review): presumably overridden by the notebook runner at execution time — confirm.
base_dir = '../'      # used as BASE_DIR only when config_path is supplied
config_path = None    # None -> the config file is located from the project root

In [None]:

# Resolve the base directory and the configuration file, then load the config.
if config_path is not None:
    # An explicit config file was supplied: anchor project paths at `base_dir`
    # and wrap the supplied value in Path, as the fallback branch also builds
    # its path with the `/` operator. (The original had a bare `config_path`
    # expression statement here, which did nothing.)
    BASE_DIR = Path(base_dir)
    config_path = Path(config_path)
else:
    # fallback: find project root and load config.json
    BASE_DIR = find_project_root()
    config_path = BASE_DIR / 'configurations' / 'config.json'

# Parse the JSON configuration into `config`.
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

print(f"Loaded config from {config_path}")

In [None]:
# The dataset folder name is the downloaded file name up to its first '.'.
downloaded_file_name = 'Rate_PUF.csv'
dataset_name = downloaded_file_name.split('.')[0]

# Input (silver) and output (gold) locations, both rooted at BASE_DIR and
# taken from the loaded configuration.
silver_path = os.path.join(BASE_DIR, config["silver_parquet_path"], dataset_name)
gold_path = os.path.join(BASE_DIR, config["gold_parquet_path"], dataset_name)

### Gold layer transformation
#### Combining all columns into a single column

In [None]:
# Load the parquet data at silver_path as the starting point for the gold
# DataFrame, then print its schema for inspection.
silver_read_options = dict(file_format='parquet', header=True, inferSchema=True)
gold_rates_df = dataframe_utils.read_data_spark(file_path=silver_path, **silver_read_options)
gold_rates_df.printSchema()

In [None]:
# Every column except "ImportDate" contributes to the combined text.
columns_to_use = [name for name in gold_rates_df.columns if name != "ImportDate"]

# Render each column as "<name>: <value>": the value is cast to string and a
# NULL becomes "", so the label is still emitted for missing values.
labelled_parts = [
    fun.concat(
        fun.lit(f"{name}: "),
        fun.coalesce(fun.col(name).cast("string"), fun.lit("")),
    )
    for name in columns_to_use
]

# Join the labelled parts into one " | "-separated string per row.
full_text_expr = fun.concat_ws(" | ", *labelled_parts)

# Attach the combined text and a generated row_id, then sort by that id.
gold_rates_df = (
    gold_rates_df
    .withColumn("full_text", full_text_expr)
    .withColumn("row_id", fun.monotonically_increasing_id())
    .orderBy("row_id")
)

In [None]:
# Save the Gold dataset as parquet, partitioned by ImportDate, StateCode and Age.

# Adjust 8 to a lower number if needed or any heap memory issues.
gold_output_df = gold_rates_df.coalesce(8)

dataframe_utils.write_data_spark(
    file_path=gold_path,
    file_format='parquet',
    df=gold_output_df,
    mode='overwrite',
    partition_by=['ImportDate', 'StateCode', 'Age'],
    header=True,
)

print(f"Saved Gold parquet at {gold_path}")