In [0]:
%run "/Users/ovidiumtoma@gmail.com/wind_turbine_project/src/wt_logger"

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, to_timestamp, when

class DataCleaner:
    """Clean, transform, merge and persist turbine DataFrames for the Silver layer.

    NOTE(review): ``LoggerUtility`` is not imported in this cell; it is
    presumably defined by the ``wt_logger`` notebook pulled in via ``%run``
    -- confirm.
    """

    # Columns that must be populated for a row to survive cleaning.
    ESSENTIAL_COLUMNS = ("timestamp", "power_output", "wind_speed")
    # Factor converting the normalized (0-1) "Power" column to MW (0-4.5).
    POWER_SCALE_MW = 4.5
    # Schema that save_silver_table() creates (if needed) and writes into.
    SILVER_SCHEMA = "silver_data"

    def __init__(self, spark) -> None:
        """Store the Spark session and set up the logger.

        Args:
            spark: Session object used for ``spark.sql(...)`` when saving.
        """
        self.spark = spark
        self.logger = LoggerUtility.setup_logging()
        self.logger.info("DataCleaner initialized.")

    def clean_turbine_data(self, df: DataFrame) -> DataFrame:
        """Remove invalid rows from a turbine dataset.

        Steps:
        - Drop rows where any of ``ESSENTIAL_COLUMNS`` is null.
        - Replace negative ``power_output`` values with null.
        - Drop every row that still contains a null.

        NOTE(review): the final ``dropna()`` has no ``subset``, so it also
        removes rows whose *non-essential* columns are null, not only the
        rows whose negative power output was just nulled. Behaviour is kept
        as-is; narrow it to ``subset=["power_output"]`` if that is not
        intended.

        Args:
            df: DataFrame with at least the ``ESSENTIAL_COLUMNS``.

        Returns:
            The filtered DataFrame.
        """
        self.logger.info("Cleaning turbine data...")

        power = col("power_output")
        df = (
            df
            .dropna(subset=list(self.ESSENTIAL_COLUMNS))
            # Negative readings become null so the next step removes them.
            .withColumn("power_output", when(power < 0, None).otherwise(power))
            # Applies to all columns, not just power_output (see docstring).
            .dropna()
        )

        # count() is a Spark action: it runs a full job just for this log line.
        self.logger.info(f"Cleaned dataset contains {df.count()} records after removing invalid data.")

        return df

    def transform_turbine_data(self, df: DataFrame) -> DataFrame:
        """Reshape the new turbine dataset to the original dataset's columns.

        - Scales ``Power`` from normalized (0-1) to MW (0-4.5) as
          ``power_output``.
        - Parses ``Time`` (``yyyy-MM-dd HH:mm:ss``) into ``timestamp``.
        - Renames the 100 m wind columns to ``wind_speed`` /
          ``wind_direction``.
        - Drops the source columns that are no longer needed.

        Args:
            df: New turbine DataFrame containing the columns referenced above.

        Returns:
            The transformed DataFrame.
        """
        self.logger.info("Transforming turbine data before merging.")

        df = (
            df
            .withColumn("power_output", col("Power") * self.POWER_SCALE_MW)
            .withColumn("timestamp", to_timestamp(col("Time"), "yyyy-MM-dd HH:mm:ss"))
            .withColumnRenamed("windspeed_100m", "wind_speed")
            .withColumnRenamed("winddirection_100m", "wind_direction")
            .drop("Time", "temperature_2m", "relativehumidity_2m", "dewpoint_2m",
                  "winddirection_10m", "windspeed_10m", "windgusts_10m", "Power")
        )

        self.logger.info("Turbine data transformation complete.")

        return df

    def merge_bronze_data(self, df_original_turbine_data: DataFrame, df_new_turbine_data: DataFrame) -> DataFrame:
        """Merge original and new turbine datasets into a unified Silver dataset.

        The two inputs are unioned by column name (both must have the same
        columns, since ``allowMissingColumns=False``) and the result is passed
        through ``clean_turbine_data``.

        Args:
            df_original_turbine_data: Original turbine records.
            df_new_turbine_data: New turbine records, already transformed to
                the same column set.

        Returns:
            The merged and cleaned DataFrame.
        """
        original_count = df_original_turbine_data.count()
        new_count = df_new_turbine_data.count()
        self.logger.info(f"Merging {original_count} original turbine records with {new_count} new turbine records.")

        # Merge datasets
        merged_df = df_original_turbine_data.unionByName(df_new_turbine_data, allowMissingColumns=False)

        # unionByName does not deduplicate, so the merged row count is the sum
        # of the input counts; reusing them avoids another full Spark job.
        self.logger.info(f"Total records after merging: {original_count + new_count}")

        # Apply cleaning after merging
        return self.clean_turbine_data(merged_df)

    def save_silver_table(self, df: DataFrame, table_name: str) -> None:
        """Save a DataFrame as a Delta table in the Silver schema.

        Creates the schema if it does not exist and overwrites any existing
        table with the same name.

        Args:
            df: DataFrame to persist.
            table_name: Table name without the schema prefix.
        """
        full_name = f"{self.SILVER_SCHEMA}.{table_name}"

        self.logger.info(f"Saving cleaned data to {full_name}")
        self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.SILVER_SCHEMA}")
        df.write.mode("overwrite").format("delta").saveAsTable(full_name)
        self.logger.info(f"Successfully saved cleaned data to {full_name}")
