In [0]:
%run "/Users/ovidiumtoma@gmail.com/wind_turbine_project/src/wt_logger"

In [0]:
class DataTransformer:
    """
    Silver-to-gold transformation steps for the wind turbine dataset.

    The pipeline is intended to be run in this order:
    compute_expected_power -> detect_record_anomalies ->
    detect_turbine_anomalies -> apply_smart_filtering -> save_gold_table.

    NOTE(review): DataFrame, the Spark SQL functions (lit, col, pow, when,
    count, spark_sum, monotonically_increasing_id), IsolationForest and
    LoggerUtility are not imported in this cell; they are assumed to be
    brought into scope by the notebook's %run include - confirm.
    """

    # Constants of the wind power equation P = 0.5 * rho * A * V^3 * Cp.
    AIR_DENSITY = 1.2           # rho, kg/m^3
    # A, m^2. 5024 = 3.14 * 40^2, i.e. a 40 m blade radius; the original
    # comment described it as the swept area "for a 100m turbine" - TODO confirm.
    SWEPT_AREA = 5024
    POWER_COEFFICIENT = 0.45    # Cp
    WATTS_PER_MEGAWATT = 1_000_000

    # Columns fed to the Isolation Forest.
    ANOMALY_FEATURES = ["wind_speed", "expected_power", "power_output"]

    # Turbine-level anomaly-rate thresholds (strictly greater-than).
    FAULTY_SENSOR_RATE = 0.6
    REVIEW_REQUIRED_RATE = 0.3

    def __init__(self, spark):
        """ Initializes the DataTransformer class with a Spark session and logger. """
        self.spark = spark
        self.logger = LoggerUtility.setup_logging()
        self.logger.info("DataTransformer initialized.")

    def compute_expected_power(self, df: DataFrame) -> DataFrame:
        """
        Compute expected power output using the wind power equation.

        Formula: P = 0.5 * ρ * A * V³ * Cp
        Where:
        - ρ = AIR_DENSITY (1.2 kg/m³)
        - A = SWEPT_AREA (5024 m²)
        - Cp = POWER_COEFFICIENT (0.45)
        - V = the "wind_speed" column of df

        The result is divided by 1,000,000 (watts -> MW) and stored in a new
        "expected_power" column.

        Returns:
            df with the additional "expected_power" column.
        """

        self.logger.info("Computing expected power output...")

        df = df.withColumn("expected_power",
                           0.5 * lit(self.AIR_DENSITY) * lit(self.SWEPT_AREA) *
                           pow(col("wind_speed"), 3) * lit(self.POWER_COEFFICIENT)
                           / self.WATTS_PER_MEGAWATT
                          )

        self.logger.info("Expected power computation complete.")
        return df

    def detect_record_anomalies(self, df: DataFrame, contamination: float = 0.15) -> DataFrame:
        """
        Detects anomalies at the record level using Isolation Forest.

        - Takes wind speed, expected power, and actual power as input.
        - Collects those columns to the driver as a Pandas DataFrame.
        - Fits an Isolation Forest (200 estimators, random_state=42) and
          labels each record.
        - Joins the labels back onto the PySpark DataFrame via a generated
          "record_id" column.

        Args:
            df: DataFrame containing "wind_speed", "expected_power" and
                "power_output".
            contamination: Expected share of anomalous records passed to
                IsolationForest. Defaults to 0.15 (15%).

        Returns:
            df with two extra columns: "record_id" and "is_anomaly".
            If is_anomaly = 1, the record is considered an anomaly; 0 means
            normal. For an empty input, is_anomaly is a NULL column.
        """

        self.logger.info("Detecting anomalies using Isolation Forest...")

        # Adding unique ID per record.
        # NOTE(review): monotonically_increasing_id() depends on partitioning
        # and df is evaluated twice below (toPandas and the join). If the
        # upstream plan is not stable between those evaluations the IDs may
        # not line up - consider persisting df upstream.
        df = df.withColumn("record_id", monotonically_increasing_id())

        # Convert to Pandas (Isolation Forest requires NumPy)
        pdf = df.select("record_id", *self.ANOMALY_FEATURES).toPandas()

        if pdf.empty:
            # The model cannot be fitted on zero samples; return the frame
            # with an all-NULL label column instead of raising.
            self.logger.info("No records to score; skipping Isolation Forest.")
            return df.withColumn("is_anomaly", lit(None).cast("long"))

        iso_forest = IsolationForest(
            contamination=contamination,
            n_estimators=200,
            max_samples='auto',
            random_state=42
        )

        predictions = iso_forest.fit_predict(pdf[self.ANOMALY_FEATURES])

        # fit_predict yields -1 for anomalies and 1 for normal records;
        # map that to 1 (anomaly) / 0 (normal) in one vectorized step.
        pdf["is_anomaly"] = (predictions == -1).astype("int64")

        # Only the ID and the label need to go back to Spark.
        labels_sdf = self.spark.createDataFrame(pdf[["record_id", "is_anomaly"]])

        # Join on record_id, not on power_output
        df = df.join(labels_sdf, on="record_id", how="left")

        self.logger.info("Anomaly detection complete.")
        return df

    def detect_turbine_anomalies(self, df: DataFrame) -> DataFrame:
        """
        Detects turbines with an unusually high number of anomalies.

        - Groups by turbine_id and calculates the anomaly rate
          (sum of is_anomaly / number of records).
        - Classifies turbines based on their anomaly rates:
          > 60% anomalies → FAULTY_SENSOR (Exclude)
          > 30% and <= 60% anomalies → REVIEW_REQUIRED (Flag for review)
          <= 30% anomalies → NORMAL (Keep)

        Returns a DataFrame with turbine_id, total_anomalies, total_records,
        anomaly_rate and turbine_status.
        """

        self.logger.info("Detecting turbines with high anomaly rates...")

        turbine_anomaly_df = (df.groupBy("turbine_id")
                               .agg(spark_sum("is_anomaly").alias("total_anomalies"),
                                    count("*").alias("total_records"))
                               .withColumn("anomaly_rate", col("total_anomalies") / col("total_records"))
                              )

        # Order matters: the stricter threshold is tested first.
        turbine_anomaly_df = turbine_anomaly_df.withColumn(
            "turbine_status",
            when(col("anomaly_rate") > self.FAULTY_SENSOR_RATE, "FAULTY_SENSOR")
            .when(col("anomaly_rate") > self.REVIEW_REQUIRED_RATE, "REVIEW_REQUIRED")
            .otherwise("NORMAL")
        )

        self.logger.info("Turbine anomaly detection complete.")
        return turbine_anomaly_df

    def apply_smart_filtering(self, df: DataFrame, turbine_anomaly_df: DataFrame) -> DataFrame:
        """
        Applies filtering rules based on turbine anomaly classification:

        - FAULTY_SENSOR turbines are removed.
        - REVIEW_REQUIRED turbines are kept; the joined "turbine_status"
          column serves as the review flag.
        - NORMAL turbines remain unchanged.
        - Records with no matching status (NULL after the left join) are
          kept rather than silently dropped.

        Returns:
            df with an extra "turbine_status" column and without the records
            of FAULTY_SENSOR turbines.
        """

        self.logger.info("Applying smart filtering...")

        df = df.join(turbine_anomaly_df.select("turbine_id", "turbine_status"), on="turbine_id", how="left")

        # A plain `!=` evaluates to NULL for NULL statuses and would drop
        # those rows; the null-safe comparison removes only FAULTY_SENSOR.
        df = df.filter(~col("turbine_status").eqNullSafe("FAULTY_SENSOR"))

        self.logger.info("Smart filtering complete.")
        return df

    def save_gold_table(self, df: DataFrame, table_name: str = "gold_turbine_analysis"):
        """
        Saves the final turbine dataset to the Gold layer.

        - Ensures the gold_data schema exists.
        - Saves the dataset as a Delta Table, overwriting any existing
          table of the same name.

        Args:
            df: DataFrame to persist.
            table_name: Table name inside the gold_data schema.
        """

        self.logger.info(f"Saving results to gold_data.{table_name}")
        self.spark.sql("CREATE SCHEMA IF NOT EXISTS gold_data")
        df.write.mode("overwrite").format("delta").saveAsTable(f"gold_data.{table_name}")
        self.logger.info(f"Successfully saved to gold_data.{table_name}")