In [0]:
# from pyspark.sql.functions import mean, stddev, min as spark_min, max as spark_max

# class DataTransformer:
#     def __init__(self, spark):
#         self.spark = spark

#     def summarize_power_output(self, df):
#         return (df.groupBy("turbine_id")
#                   .agg(
#                       spark_min("power_output").alias("min_power"),
#                       spark_max("power_output").alias("max_power"),
#                       mean("power_output").alias("avg_power"),
#                       stddev("power_output").alias("stddev_power")
#                   ))

#     def identify_anomalies(self, summary_df):
#         # We assume the "expected" power output = the average, 
#         # and anomalies are >2 std dev away from mean
#         from pyspark.sql.functions import col
#         # We might join the summary stats back to the original or handle it directly here
#         # For demonstration, let's just produce a DF of anomalies from the summary.
#         return (summary_df
#                 .withColumn("is_anomaly", 
#                     (col("max_power") - col("avg_power") > 2*col("stddev_power")) |
#                     (col("min_power") - col("avg_power") < -2*col("stddev_power"))
#                 )
#         )


In [0]:
%run "/Users/ovidiumtoma@gmail.com/wind_turbine_project/src/wt_logger"

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import mean, stddev, min as spark_min, max as spark_max, col

class DataTransformer:
    """Summarizes turbine power output, flags anomalous turbines, and persists
    the results as Delta tables in the ``gold_data`` schema.
    """

    def __init__(self, spark):
        """Store the Spark session and set up logging.

        Args:
            spark: Spark session; used to issue SQL (schema creation) when saving.
        """
        self.spark = spark
        # NOTE(review): LoggerUtility is not defined in this cell; presumably it
        # comes from the `wt_logger` notebook pulled in via %run -- confirm.
        self.logger = LoggerUtility.setup_logging()

        self.logger.info("DataTransformer initialized.")

    def summarize_power_output(self, df: DataFrame) -> DataFrame:
        """Calculate per-turbine summary statistics of ``power_output``.

        Args:
            df: Input data containing ``turbine_id`` and ``power_output`` columns.

        Returns:
            One row per ``turbine_id`` with ``min_power``, ``max_power``,
            ``avg_power`` and ``stddev_power`` columns.
        """
        self.logger.info("Summarizing power output...")

        summary_df = df.groupBy("turbine_id").agg(
            spark_min("power_output").alias("min_power"),
            spark_max("power_output").alias("max_power"),
            mean("power_output").alias("avg_power"),
            stddev("power_output").alias("stddev_power"),
        )

        # count() is a Spark action: it runs the aggregation once just to
        # report the number of summary rows in the log.
        self.logger.info(f"Summary statistics calculated. Row count: {summary_df.count()}")
        return summary_df

    def identify_anomalies(self, summary_df: DataFrame, threshold: float = 2.0) -> DataFrame:
        """Flag turbines whose extreme readings lie far from their own mean.

        A summary row is flagged when ``max_power`` is more than ``threshold``
        standard deviations above ``avg_power``, or ``min_power`` is more than
        ``threshold`` standard deviations below it.

        Args:
            summary_df: Output of :meth:`summarize_power_output` (needs the
                ``min_power``, ``max_power``, ``avg_power`` and ``stddev_power``
                columns).
            threshold: Number of standard deviations that counts as anomalous.
                Defaults to 2.0, the previously hard-coded value.

        Returns:
            ``summary_df`` with an added boolean ``is_anomaly`` column that is
            never NULL.
        """
        self.logger.info("Identifying anomalies...")

        deviation_limit = threshold * col("stddev_power")
        above_limit = (col("max_power") - col("avg_power")) > deviation_limit
        below_limit = (col("min_power") - col("avg_power")) < -deviation_limit

        anomalies_df = (
            summary_df
            .withColumn("is_anomaly", above_limit | below_limit)
            # When stddev_power is NULL (e.g. a turbine with a single reading,
            # for which the sample standard deviation is undefined) the
            # comparisons evaluate to NULL. There is no measurable deviation in
            # that case, so record the turbine as "not anomalous" instead of
            # leaving a three-valued flag in the output.
            .fillna(False, subset=["is_anomaly"])
        )

        # Extra Spark action, used only to report the number of flagged rows.
        anomaly_count = anomalies_df.filter(col("is_anomaly")).count()
        self.logger.info(f"Anomaly detection complete. Anomalies found: {anomaly_count}")
        return anomalies_df

    def save_gold_table(self, df: DataFrame, table_name: str):
        """Save ``df`` as the Delta table ``gold_data.<table_name>``.

        The ``gold_data`` schema is created if it does not exist, and an
        existing table of the same name is overwritten.

        Args:
            df: Transformed (summary/anomaly) data to persist.
            table_name: Unqualified table name inside the ``gold_data`` schema.

        Raises:
            Exception: Any error raised while creating the schema or writing the
                table is logged and re-raised unchanged.
        """
        self.logger.info(f"Saving transformed data to gold_data.{table_name}...")

        try:
            # Ensure the schema exists
            self.spark.sql("CREATE SCHEMA IF NOT EXISTS gold_data")

            # Save the DataFrame as a Delta table
            df.write.mode("overwrite").format("delta").saveAsTable(f"gold_data.{table_name}")

            self.logger.info(f"Successfully saved transformed data to gold_data.{table_name}")

        except Exception as e:
            self.logger.error(f"Error saving transformed data to gold_data.{table_name}: {str(e)}", exc_info=True)
            raise

    def transform_and_save(self, df: DataFrame, anomaly_threshold: float = 2.0):
        """Run the full transformation pipeline.

        1. Summarizes power output per turbine.
        2. Identifies anomalies on the summary.
        3. Saves both results to the ``gold_data.turbine_summary`` and
           ``gold_data.turbine_anomalies`` tables.

        Args:
            df: Input data containing ``turbine_id`` and ``power_output`` columns.
            anomaly_threshold: Standard-deviation multiple passed to
                :meth:`identify_anomalies`. Defaults to 2.0.

        Returns:
            Tuple ``(summary_df, anomalies_df)``.
        """
        self.logger.info("Starting transformation process...")

        summary_df = self.summarize_power_output(df)
        anomalies_df = self.identify_anomalies(summary_df, threshold=anomaly_threshold)

        # Save both summary and anomalies
        self.save_gold_table(summary_df, "turbine_summary")
        self.save_gold_table(anomalies_df, "turbine_anomalies")

        self.logger.info("Transformation process complete.")
        return summary_df, anomalies_df
