In [0]:
%run "/Users/ovidiumtoma@gmail.com/wind_turbine_project/src/wt_logger"

In [0]:
from pyspark.sql.functions import input_file_name, regexp_extract, lit

class RawDataIngestor:
    """Read raw wind-turbine CSV files with Spark and persist them as Bronze Delta tables."""

    def __init__(self, spark):
        """Keep a handle on the Spark session and set up logging.

        Args:
            spark: Spark session used for reading CSV files and running SQL.
        """
        self.spark = spark
        self.logger = LoggerUtility.setup_logging()
        self.logger.info("RawDataIngestor initialized.")

    def load_original_turbine_data(self, directory_path: str):
        """Read the original dataset files (``data_group_*.csv``) into a DataFrame.

        Args:
            directory_path: Directory that contains the ``data_group_*.csv`` files.

        Returns:
            DataFrame with the CSV header used for column names and an inferred schema.
        """
        self.logger.info(f"Loading original turbine data from {directory_path}")

        df = (self.spark.read.format("csv")
              .option("header", "true")
              .option("inferSchema", "true")
              .load(f"{directory_path}/data_group_*.csv"))

        # NOTE: count() is a Spark action, so this log line triggers a read of the files.
        self.logger.info(f"Successfully loaded {df.count()} original turbine dataset records.")

        return df

    def load_new_turbine_data(self, directory_path: str, original_df=None):
        """Read the new dataset files (``Location*.csv``) and derive a ``turbine_id`` column.

        The number embedded in each source file name (``Location<N>``) is offset by
        the largest ``turbine_id`` of the original dataset, so that the new ids are
        placed after the existing ones.

        Args:
            directory_path: Directory that contains the ``Location*.csv`` files.
            original_df: DataFrame of the original dataset; it must have a
                ``turbine_id`` column. When omitted, the notebook-level variable
                ``df_original_turbine_data`` is used (legacy behaviour).

        Returns:
            DataFrame of the new records with the added ``turbine_id`` column.

        Raises:
            NameError: If ``original_df`` is omitted and no global
                ``df_original_turbine_data`` exists.
        """
        self.logger.info(f"Loading new turbine data from {directory_path}")

        # Resolve the reference dataset before doing any Spark work, so that a
        # missing input fails immediately and with an actionable message.
        if original_df is None:
            try:
                original_df = df_original_turbine_data  # noqa: F821 - legacy notebook global
            except NameError as err:
                raise NameError(
                    "load_new_turbine_data needs the original turbine DataFrame: "
                    "pass it as original_df, or define the notebook global "
                    "df_original_turbine_data before calling."
                ) from err

        df = (self.spark.read.format("csv")
              .option("header", "true")
              .option("inferSchema", "true")
              .load(f"{directory_path}/Location*.csv"))

        self.logger.info(f"Successfully loaded {df.count()} new turbine dataset records.")

        # Get the maximum turbine_id from the original dataset
        max_existing_id = original_df.agg({"turbine_id": "max"}).collect()[0][0]
        if max_existing_id is None:
            max_existing_id = 0  # In case no existing IDs

        self.logger.info(f"Maximum ID number from original dataset: {max_existing_id}")

        # Assign turbine ID based on the file name. The pattern is a raw string so
        # that "\d" reaches the regex engine instead of being an invalid Python escape.
        # NOTE(review): a file path without "Location<digits>" presumably ends up
        # with a null turbine_id (empty match cast to int) - confirm.
        file_number = regexp_extract(input_file_name(), r"Location(\d+)", 1).cast("int")
        df = df.withColumn("turbine_id", file_number + lit(max_existing_id))

        self.logger.info("Successfully added turbine_id to new turbine dataset records.")

        return df

    def write_bronze(self, df, table_name: str):
        """Write the given DataFrame to the Delta table ``bronze_data.<table_name>``.

        The ``bronze_data`` schema is created if it does not exist, and the table
        is written in ``overwrite`` mode.

        Args:
            df: DataFrame to persist.
            table_name: Table name inside the ``bronze_data`` schema.
        """
        self.logger.info(f"Writing data to Delta table: bronze_data.{table_name}")
        self.spark.sql("CREATE SCHEMA IF NOT EXISTS bronze_data")
        df.write.mode("overwrite").format("delta").saveAsTable(f"bronze_data.{table_name}")
        self.logger.info(f"Successfully written data to bronze_data.{table_name}")
