### Imports

In [None]:
import os
import sys
import json
from datetime import datetime
import pyspark.sql.functions as fun

In [None]:
# Put the parent folder on sys.path before importing from `src` (presumably
# the project package living one level up — confirm against the repo layout).
# NOTE: '..' is resolved against the current working directory at run time,
# so this depends on where the notebook is executed from.
sys.path.append(os.path.abspath('..'))  # adds the parent folder to sys.path
from src.utils import dataframe_utils
from src.utils.path_utils import find_project_root,Path
from src.utils.dataquality import data_quality_checks

### Configuration and Parameters

In [None]:
# parameters
# Defaults for this notebook's run-time parameters.
# base_dir: base directory used when an explicit config_path is supplied.
# config_path: path to the JSON config; None triggers the project-root fallback.
base_dir = '../'
config_path = None

In [None]:

# Resolve the base directory and the config file location, then load the config.
if config_path is None:
    # Fallback: no explicit config was supplied, so locate the project root
    # and use its configurations/config.json.
    BASE_DIR = find_project_root()
    config_path = BASE_DIR / 'configurations' / 'config.json'
else:
    # An explicit config path was supplied; it is used as-is and base_dir
    # becomes the anchor for the relative paths read from the config.
    BASE_DIR = Path(base_dir)

# Parse the JSON configuration into a dict-like `config` object.
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

print(f"Loaded config from {config_path}")

In [None]:
# Name of the raw file expected in the download directory.
downloaded_file_name = 'Rate_PUF.csv'

# Text before the first '.' in the file name; used as the silver dataset folder.
dataset_name = downloaded_file_name.split('.')[0]

# Input CSV location and output parquet location, both anchored at BASE_DIR.
raw_csv_path = os.path.join(
    BASE_DIR,
    config["download_dir"],
    downloaded_file_name,
)
silver_path = os.path.join(
    BASE_DIR,
    config["silver_parquet_path"],
    dataset_name,
)

### Cleaning/Transformation

In [None]:
# Load the raw CSV through the project's Spark read helper.
# header / inferSchema are presumably forwarded to Spark's CSV reader —
# confirm in dataframe_utils.read_data_spark.
# NOTE(review): every column is cast to string in the next cell, so schema
# inference here may be redundant work — confirm before changing, since
# inferred-then-cast values can differ textually from the raw CSV text.
read_df = dataframe_utils.read_data_spark(
    file_path=raw_csv_path,
    file_format="csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Convert all columns to string type so the silver layer has a uniform schema.
rates_df = read_df.select([fun.col(c).cast("string") for c in read_df.columns])

# Fill all null values with empty strings
silver_rates_df = rates_df.fillna("")

# Get today's date in YYYYMMDD format
today_str = datetime.today().strftime("%Y%m%d")

# Add column with the same date for all rows.
# This must chain off the null-filled frame: building it from `rates_df`
# (as the previous version did) silently discarded the fillna step above.
silver_rates_df = silver_rates_df.withColumn("ImportDate", fun.lit(today_str))

In [None]:
# Persist the cleaned frame as the Silver dataset.
# Reduce to 4 partitions before writing; lower this number if the write
# runs into heap memory issues.
silver_output_df = silver_rates_df.coalesce(4)

# Columns passed to the write helper as partition_by.
silver_partition_cols = ['ImportDate', 'StateCode', 'Age']

dataframe_utils.write_data_spark(
    file_path=silver_path,
    file_format='parquet',
    df=silver_output_df,
    mode='overwrite',
    partition_by=silver_partition_cols,
    header=True,
)

print(f"Saved silver parquet at {silver_path}")

### Data Quality check

In [None]:
# Run the project's data quality checks against the silver frame.
# NOTE(review): a falsy result prints nothing here — presumably
# data_quality_checks reports or raises on failure itself; confirm.
dq_passed = data_quality_checks(silver_rates_df)
if dq_passed:
    print("All Data Quality Checks Passed ✅")
    
