In [0]:
%run "/Users/ovidiumtoma@gmail.com/wind_turbine_project/src/wt_logger"

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

class DataCleaner:
    """Clean a DataFrame of turbine readings and append it to a silver table.

    The pipeline drops rows with nulls in the critical columns, keeps only
    rows whose ``power_output`` lies inside an inclusive range, and appends
    the result to a Delta table in the ``silver_data`` schema.

    The tunable values are class attributes so they can be overridden on a
    subclass or instance without editing the method bodies.
    """

    # Columns that must be populated for a row to be kept.
    CRITICAL_COLUMNS = ("turbine_id", "timestamp", "power_output")
    # Default inclusive bounds for an acceptable ``power_output`` value.
    MIN_POWER_OUTPUT = 0
    MAX_POWER_OUTPUT = 500
    # Schema that receives the cleaned tables.
    SILVER_SCHEMA = "silver_data"

    def __init__(self, spark):
        """Store the Spark session and set up logging.

        Args:
            spark: Session used to run SQL statements (schema creation).
        """
        self.spark = spark
        # LoggerUtility is not defined in this cell; presumably it is
        # provided by the wt_logger notebook run earlier -- TODO confirm.
        self.logger = LoggerUtility.setup_logging()

        self.logger.info("DataCleaner initialized.")

    def remove_missing_values(self, df: DataFrame) -> DataFrame:
        """Drop rows that have a null in any of ``CRITICAL_COLUMNS``.

        Args:
            df: Input readings.

        Returns:
            A DataFrame without the incomplete rows.
        """
        # NOTE: each count() is a separate Spark action, so the row-count
        # log lines re-evaluate the DataFrame's lineage every time.
        self.logger.info(f"Rows before starting cleanup process: {df.count()}")
        self.logger.info("Removing missing values...")
        df_cleaned = df.dropna(subset=list(self.CRITICAL_COLUMNS))
        self.logger.info(f"Rows after removing missing values: {df_cleaned.count()}")
        return df_cleaned

    def handle_outliers(
        self,
        df: DataFrame,
        *,
        min_power: float = MIN_POWER_OUTPUT,
        max_power: float = MAX_POWER_OUTPUT,
    ) -> DataFrame:
        """Keep only rows whose ``power_output`` is within an inclusive range.

        Args:
            df: Input readings.
            min_power: Lowest accepted ``power_output`` (inclusive).
            max_power: Highest accepted ``power_output`` (inclusive).

        Returns:
            A DataFrame restricted to ``min_power <= power_output <= max_power``.

        Raises:
            ValueError: If ``min_power`` is greater than ``max_power``.
        """
        # An inverted range would silently filter out every row; fail loudly.
        if min_power > max_power:
            raise ValueError(
                f"min_power ({min_power}) must not exceed max_power ({max_power})"
            )

        self.logger.info("Handling outliers...")
        # Column.between is inclusive on both ends, like the >= / <= pair.
        df_filtered = df.filter(col("power_output").between(min_power, max_power))
        self.logger.info(f"Rows after outlier removal: {df_filtered.count()}")
        return df_filtered

    def save_silver_table(self, df: DataFrame, table_name: str) -> None:
        """Append ``df`` to the Delta table ``silver_data.<table_name>``.

        The schema is created first if it does not exist. Rows are written in
        append mode, so calling this again with the same data adds the rows a
        second time.

        Args:
            df: Cleaned readings to persist.
            table_name: Unqualified name of the target table.

        Raises:
            Exception: Any error from schema creation or the write is logged
                with its traceback and re-raised unchanged.
        """
        full_name = f"{self.SILVER_SCHEMA}.{table_name}"
        self.logger.info(f"Saving cleaned data to {full_name}...")

        try:
            # Ensure the schema exists
            self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.SILVER_SCHEMA}")

            # Save the cleaned DataFrame as a Delta table
            df.write.mode("append").format("delta").saveAsTable(full_name)

            self.logger.info(f"Successfully saved cleaned data to {full_name}")

        except Exception as e:
            self.logger.error(
                f"Error saving cleaned data to {full_name}: {e}", exc_info=True
            )
            raise

    def clean_data(
        self,
        df: DataFrame,
        table_name: str,
        *,
        min_power: float = MIN_POWER_OUTPUT,
        max_power: float = MAX_POWER_OUTPUT,
    ) -> DataFrame:
        """Run the full cleaning pipeline and save the result.

        Steps: remove rows with missing critical values, filter outliers,
        then append the result to ``silver_data.<table_name>``.

        Args:
            df: Raw readings.
            table_name: Unqualified name of the target silver table.
            min_power: Lowest accepted ``power_output`` (inclusive).
            max_power: Highest accepted ``power_output`` (inclusive).

        Returns:
            The cleaned DataFrame that was written.

        Raises:
            ValueError: If ``min_power`` is greater than ``max_power``.
        """
        self.logger.info("Starting full cleaning process...")
        df_no_missing = self.remove_missing_values(df)
        df_cleaned = self.handle_outliers(
            df_no_missing, min_power=min_power, max_power=max_power
        )

        self.save_silver_table(df_cleaned, table_name)

        self.logger.info("Cleaning process complete.")
        return df_cleaned
