In [0]:
%run "/Users/ovidiumtoma@gmail.com/wind_turbine_project/src/logger"

In [0]:
import os

# Module-level logger. LoggerUtility is not defined or imported in this cell;
# presumably it is brought into scope by the `%run` of the logger notebook
# above — TODO confirm.
# NOTE(review): nothing in the visible code reads this variable;
# RawDataIngestor builds its own logger in __init__.
logger = LoggerUtility.setup_logging()

class RawDataIngestor:
    """Read raw CSV files with Spark and store them in a bronze Delta table.

    Every ``*.csv`` file directly under a directory is loaded as a single
    DataFrame and written to ``<schema_name>.<table_name>`` in Delta format.

    Note:
        Creating an instance sets ``spark.sql.shuffle.partitions`` on the
        session that is passed in, so the value also affects any other work
        that runs on that same session afterwards.
    """

    def __init__(self, spark, shuffle_partitions: int = 4, schema_name: str = "bronze_data"):
        """Store the session, create a logger and tune shuffle partitions.

        Args:
            spark: Spark session used for all reads, writes and SQL statements.
            shuffle_partitions: Value written to ``spark.sql.shuffle.partitions``.
                The default (4) matches the previously hard-coded setting;
                adjust it to the data size.
            schema_name: Schema (database) that holds the bronze tables.
        """
        self.spark = spark
        self.schema_name = schema_name
        self.logger = LoggerUtility.setup_logging()

        # Performance optimization: adjust based on data size.
        self.spark.conf.set("spark.sql.shuffle.partitions", str(shuffle_partitions))

        # Read the value back so the log shows what the session actually holds.
        partition_count = self.spark.conf.get("spark.sql.shuffle.partitions")
        self.logger.info(f"RawDataIngestor initialized with shuffle partitions set to {partition_count}.")

    @staticmethod
    def _csv_glob(directory_path: str) -> str:
        """Return the glob pattern for the CSV files directly under *directory_path*.

        Trailing slashes are dropped so that ``"/data/"`` and ``"/data"`` give
        the same pattern instead of ``"/data//*.csv"``.

        Raises:
            ValueError: If *directory_path* is empty or blank; the pattern
                would otherwise silently become ``"/*.csv"``.
        """
        if not directory_path or not directory_path.strip():
            raise ValueError("directory_path must be a non-empty path")
        base_path = directory_path.rstrip("/")
        return f"{base_path}/*.csv"

    def read_all_csvs(self, directory_path: str, *, count_records: bool = True):
        """Read all CSV files under *directory_path* in one batch.

        Args:
            directory_path: Directory containing the CSV files.
            count_records: When True (default, the original behavior) the
                number of rows is included in the success log message. This
                runs ``df.count()``, i.e. an additional Spark job over all
                input files; pass False to skip it for large inputs.

        Returns:
            The DataFrame produced by the CSV reader (header row used for
            column names, column types inferred).

        Raises:
            ValueError: If *directory_path* is empty or blank.
            Exception: Any error raised by Spark is logged and re-raised.
        """
        self.logger.info(f"Attempting to read all CSV files from: {directory_path}")

        try:
            # Took the decision to handle all files together in order to improve performance.
            # IDs are unique among groups, so there should be no issue.
            df = (self.spark.read.format("csv")
                    .option("header", "true")
                    .option("inferSchema", "true")
                    .load(self._csv_glob(directory_path)))

            if count_records:
                self.logger.info(f"Successfully read CSV files from {directory_path}. Record count: {df.count()}")
            else:
                self.logger.info(f"Successfully read CSV files from {directory_path}.")
            return df

        except Exception as e:
            self.logger.error(f"Error reading CSV files from {directory_path}: {e}", exc_info=True)
            raise

    def write_bronze(self, df, table_name: str, mode: str = "append"):
        """Write *df* to ``<schema_name>.<table_name>`` in Delta format.

        The schema is created first if it does not exist yet.

        Args:
            df: DataFrame to persist.
            table_name: Table name inside the bronze schema.
            mode: Save mode handed to the DataFrame writer. The default
                ``"append"`` keeps the original behavior; note that re-running
                an ingestion in append mode adds the same rows again.

        Raises:
            Exception: Any error raised by Spark is logged and re-raised.
        """
        full_table_name = f"{self.schema_name}.{table_name}"
        self.logger.info(f"Writing data to Delta table: {full_table_name}")

        try:
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.schema_name}")
            (df.write
               .mode(mode)
               .format("delta")
               .saveAsTable(full_table_name))

            self.logger.info(f"Successfully written data to {full_table_name}")

        except Exception as e:
            self.logger.error(f"Error writing data to {full_table_name}: {e}", exc_info=True)
            raise

    def process_all_files(self, directory_path: str, table_name: str = "wind_turbine_bronze"):
        """Read all CSVs at once, then write them as one bronze table.

        Args:
            directory_path: Directory containing the CSV files.
            table_name: Target table inside the bronze schema; defaults to the
                previously hard-coded ``"wind_turbine_bronze"``.

        Raises:
            Exception: Any failure of the read or write step is re-raised.
        """
        self.logger.info(f"Starting data ingestion process for directory: {directory_path}")

        try:
            df = self.read_all_csvs(directory_path)
            self.write_bronze(df, table_name)

            self.logger.info(f"Successfully processed and stored data in {self.schema_name}.{table_name}")

        except Exception as e:
            # The failing step has already logged the full traceback, so only a
            # one-line summary is emitted here to avoid duplicate stack traces.
            self.logger.error(f"Data ingestion process failed: {e}")
            raise
