In [0]:
%run "/Users/ovidiumtoma@gmail.com/wind_turbine_project/src/wt_logger"

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import regexp_extract, lit, col

class RawDataIngestor:
    """
    Reads raw wind-turbine CSV files with Spark and writes them to Bronze Delta tables.

    The class-level constants below hold the table/column names and the
    filename pattern used by the methods; subclasses may override them.
    """

    # Schema that receives every table written by write_bronze().
    BRONZE_SCHEMA = "bronze_data"

    # Table whose largest turbine_id is used as the offset for new turbine ids.
    ORIGINAL_BRONZE_TABLE = "bronze_data.original_turbine_bronze"

    # Name of the materialised source-file-path column. The name literally
    # contains a dot, so it must be back-quoted when referenced as a column;
    # unquoted, Spark would parse it as field `file_path` of `_metadata`.
    FILE_PATH_COLUMN = "_metadata.file_path"

    # Captures the digits that follow "Location" in the *file name*.
    # `[^/]*$` pins the match to the last path segment so that a directory
    # called e.g. "Location9_site" cannot be mistaken for the turbine number.
    LOCATION_ID_PATTERN = r"Location(\d+)[^/]*$"

    def __init__(self, spark, config=None):
        """
        Args:
            spark: Spark session used for all reads, writes and SQL statements.
            config: Optional configuration object; stored on the instance
                (an empty dict when omitted or falsy).
        """
        self.spark = spark
        # LoggerUtility is not imported in this file; presumably it is brought
        # into scope by the `%run .../wt_logger` notebook cell.
        self.logger = LoggerUtility.setup_logging()
        self.config = config or {}
        self.logger.info("RawDataIngestor initialized.")

    def _read_csv(self, path_glob: str) -> "DataFrame":
        """Return a DataFrame over the header-bearing CSV files matching `path_glob`, with inferred column types."""
        # NOTE: `cloudFiles.*` options belong to Auto Loader
        # (format("cloudFiles")) and are not CSV batch-reader options, so none
        # are set here; `inferSchema` already enables type inference.
        return (self.spark.read
                .format("csv")
                .option("header", "true")
                .option("inferSchema", "true")
                .load(path_glob))

    def load_original_turbine_data(self, directory_path: str) -> "DataFrame":
        """
        Loads the original dataset from `<directory_path>/data_group_*.csv`.

        The CSV files are expected to already contain a 'turbine_id' column;
        this method does not add or modify it.

        Args:
            directory_path: Directory containing the `data_group_*.csv` files.

        Returns:
            The cached DataFrame (already materialised by the record count).

        Raises:
            Exception: Any error raised while reading is logged and re-raised.
        """
        try:
            self.logger.info(f"Loading original turbine data from {directory_path} (with inferSchema)")
            df = self._read_csv(f"{directory_path}/data_group_*.csv").cache()

            # count() both feeds the log line and populates the cache.
            record_count = df.count()
            self.logger.info(f"Successfully loaded {record_count} original turbine dataset records.")
            self.logger.info("Using 'turbine_id' from the CSV itself.")
            return df
        except Exception as e:
            self.logger.error(f"Error loading original turbine data: {e}")
            raise

    def load_new_turbine_data(self, directory_path: str) -> "DataFrame":
        """
        Loads new turbine data from `<directory_path>/Location*.csv` and assigns turbine ids.

        Each row's turbine_id is computed as
        `<number in its file name> + max(turbine_id) of ORIGINAL_BRONZE_TABLE`
        (the maximum is treated as 0 when that table yields no value).
        The file number is taken from names like 'Location<number>.csv';
        the regex is case-sensitive. If a file name has no digits after
        'Location', the extraction yields an empty string, so the integer
        cast gives a null turbine_id (or an error if ANSI mode is enabled).

        Args:
            directory_path: Directory containing the `Location*.csv` files.

        Returns:
            The loaded DataFrame with the source file path materialised in
            FILE_PATH_COLUMN and a (re)computed 'turbine_id' column.

        Raises:
            Exception: Any error (including a missing ORIGINAL_BRONZE_TABLE)
                is logged and re-raised.
        """
        try:
            self.logger.info(f"Loading new turbine data from {directory_path} (with inferSchema)")
            df = (self._read_csv(f"{directory_path}/Location*.csv")
                  # Materialise the hidden file-source metadata field as a
                  # regular column so the path travels with the data.
                  .withColumn(self.FILE_PATH_COLUMN, col("_metadata.file_path"))
                  .cache())

            record_count = df.count()
            self.logger.info(f"Successfully loaded {record_count} new turbine dataset records.")

            # Get the maximum turbine_id from the original dataset.
            df_original_turbine_data = self.spark.read.table(self.ORIGINAL_BRONZE_TABLE)
            max_existing_id = df_original_turbine_data.agg({"turbine_id": "max"}).collect()[0][0]
            if max_existing_id is None:
                max_existing_id = 0

            self.logger.info(f"Using maximum existing turbine_id: {max_existing_id}")

            # Read the materialised column (back-quoted because of the dot in
            # its name) rather than relying on the hidden metadata struct
            # still being resolvable after the projection above.
            file_number = regexp_extract(
                col(f"`{self.FILE_PATH_COLUMN}`"), self.LOCATION_ID_PATTERN, 1
            )

            # Convert to integer, add the offset, and overwrite turbine_id column.
            df = df.withColumn("turbine_id", file_number.cast("int") + lit(max_existing_id))

            self.logger.info("Successfully added turbine_id to new turbine dataset records.")
            return df
        except Exception as e:
            self.logger.error(f"Error loading new turbine data: {e}")
            raise

    def write_bronze(self, df: "DataFrame", table_name: str) -> None:
        """
        Writes the given DataFrame to a Bronze Delta table, replacing its contents.

        Args:
            df: DataFrame to persist.
            table_name: Unqualified table name; it is created under BRONZE_SCHEMA.

        Raises:
            Exception: Any error raised by Spark is logged and re-raised.
        """
        target = f"{self.BRONZE_SCHEMA}.{table_name}"
        try:
            self.logger.info(f"Writing data to Delta table: {target}")
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.BRONZE_SCHEMA}")
            df.write.mode("overwrite").format("delta").saveAsTable(target)
            self.logger.info(f"Successfully written data to {target}")
        except Exception as e:
            self.logger.error(f"Error writing bronze data: {e}")
            raise
