In [0]:
import sys
import os

# Make the project importable from the current working directory.
#
# Two entries are registered on sys.path:
#   * project_root -- needed so the package-qualified import
#     `from src.wt_transformations import ...` can resolve the `src` package
#     even when the working directory is not already on sys.path.
#   * src_path     -- kept so modules inside src/ stay importable by bare name.
project_root = os.getcwd()
src_path = os.path.join(project_root, "src")

# project_root is inserted first so that src_path, inserted afterwards at
# index 0, keeps the higher priority it had before.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Import the DataTransformer class
from src.wt_transformations import DataTransformer

# Load the Silver-layer wind turbine table that feeds the whole pipeline
df_silver = spark.table("hive_metastore.silver_data.wind_turbine_silver")

# A single DataTransformer instance (bound to the active Spark session)
# provides every step used below
transformer = DataTransformer(spark)

# Record-level steps, applied to the Silver data strictly in this order.
# NOTE(review): the original notes describe detect_record_anomalies as an
# Isolation Forest step -- confirm against wt_transformations.
record_level_steps = (
    transformer.compute_expected_power,
    transformer.detect_zscore_anomalies,
    transformer.detect_record_anomalies,
    transformer.combine_anomalies,
)
df_transformed = df_silver
for step in record_level_steps:
    df_transformed = df_transformed.transform(step)

# Turbine-level anomaly detection; result is persisted as "gold_turbine_stats"
df_turbine_anomalies = transformer.detect_turbine_anomalies(df_transformed)
transformer.save_turbine_analysis(df_turbine_anomalies, "gold_turbine_stats")

# Smart filtering combines the record-level and turbine-level results; the
# filtered output is persisted as "gold_turbine_analysis".
# NOTE(review): the original notes say this drops FAULTY_SENSOR turbines --
# verify in apply_smart_filtering.
df_filtered = transformer.apply_smart_filtering(
    df_transformed,
    df_turbine_anomalies,
)
transformer.save_turbine_analysis(df_filtered, "gold_turbine_analysis")

# Summary statistics over the filtered data, persisted as
# "gold_turbine_summary".
# NOTE(review): the original notes mention a 24-hour window -- confirm in
# calculate_summary_statistics.
df_summary = transformer.calculate_summary_statistics(df_filtered)
transformer.save_summary_table(df_summary, "gold_turbine_summary")

print("Data transformation & anomaly detection pipeline completed successfully!")
