In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import mean, stddev, col, abs

In [2]:
# Initialize Spark Session
# getOrCreate() hands back an existing session when one is already running,
# otherwise it starts a new one under the given application name.
spark = (
    SparkSession.builder
    .appName("WindTurbineDataPipeline")
    .getOrCreate()
)

# Read the CSV file into a DataFrame
# The first line is treated as the header and column types are inferred
# from the data rather than declared up front.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/raw_data/data_group_1.csv')
)

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:
# Data Cleaning
# Discard every row that has a missing value in any column.
df_clean = df.dropna()

# Global mean and standard deviation of power_output over the cleaned rows.
# The aggregate yields exactly one Row whose fields are, in order,
# 'mean' and 'stddev'.
stats = df_clean.select(
    mean(col('power_output')).alias('mean'),
    stddev(col('power_output')).alias('stddev'),
).collect()

# Unpack that single Row positionally (same order as the select above).
mean_power, stddev_power = stats[0]

# Anomaly Detection
# A row is an anomaly when its power_output lies more than
# Z_SCORE_THRESHOLD standard deviations away from the global mean
# (mean_power / stddev_power computed from df_clean).
# NOTE: `abs` below is pyspark.sql.functions.abs (imported at the top of the
# notebook), not the Python built-in.
# NOTE(review): df_anomalies is lazy and nothing in the visible cell writes or
# displays it - confirm whether it should be persisted alongside the summary.
Z_SCORE_THRESHOLD = 2

if stddev_power is None or stddev_power == 0:
    # Degenerate spread: the standard deviation is null (presumably fewer than
    # two rows survived cleaning) or zero (constant power_output). A z-score
    # is undefined here, so report no anomalies instead of dividing by
    # null/zero. The empty frame keeps the same columns as the normal path.
    df_anomalies = df_clean.withColumn('z_score', F.lit(None).cast('double')).limit(0)
else:
    df_anomalies = (
        df_clean
        .withColumn('z_score', (col('power_output') - mean_power) / stddev_power)
        .filter(abs(col('z_score')) > Z_SCORE_THRESHOLD)
    )

# Summary Statistics
# Per-turbine average, minimum and maximum power_output over the cleaned rows.
# F.min / F.max are Spark's column aggregates. The bare names min()/max() would
# resolve to the Python built-ins here (they are not among the imported
# pyspark functions), and those try to iterate the Column and raise TypeError.
df_summary = df_clean.groupBy('turbine_id').agg(
    F.mean(col('power_output')).alias('average_power'),
    F.min(col('power_output')).alias('min_power'),
    F.max(col('power_output')).alias('max_power')
)

# Store Processed Data
# Replace `your_table` with your actual table name and configure the database settings
def _save_via_jdbc(frame, table_name):
    """Write `frame` to `table_name` through Spark's JDBC data source.

    Only the URL and target table are configured; no save mode is set, so
    Spark's default save mode applies.
    """
    writer = frame.write.format('jdbc')
    writer.options(url='jdbc:postgresql://dbserver', dbtable=table_name).save()


_save_via_jdbc(df_clean, 'your_table')

# Store Summary Statistics
_save_via_jdbc(df_summary, 'summary_table')

# Close Spark Session
# After this call `spark` can no longer be used by later cells.
spark.stop()
