In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=a31b1328b576d2e3fff103e234b0dbec670a5683eaa1c209b1950fb499dd9b74
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, to_timestamp

# Obtain the Spark session used by the rest of the notebook
# (getOrCreate returns an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("EnergyConsumptionMonitoring")
    .getOrCreate()
)

In [None]:
# Read the energy readings from CSV: the first row provides the column names
# and the column types are inferred from the data.
csv_file_path = '/content/energy_data.csv'
energy_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(csv_file_path)
)

# Display the loaded rows without truncating long cell values
energy_data.show(truncate=False)


+-------+---------+-------------------+---------------+-----------+
|user_id|device_id|reading_time       |energy_consumed|device_type|
+-------+---------+-------------------+---------------+-----------+
|101    |abc123   |2023-09-01 08:15:00|15.2           |smart_meter|
|102    |def456   |2023-09-01 09:30:00|20.5           |smart_meter|
|101    |abc123   |2023-09-02 08:10:00|14.8           |smart_meter|
|103    |ghi789   |2023-09-02 07:50:00|18.3           |thermostat |
|104    |jkl012   |2023-09-03 10:00:00|22.0           |solar_panel|
|102    |def456   |2023-09-03 11:30:00|19.6           |smart_meter|
+-------+---------+-------------------+---------------+-----------+



In [None]:
# Relative band around the overall average: a reading is flagged as an anomaly
# when it is above ANOMALY_UPPER_FACTOR x average or below ANOMALY_LOWER_FACTOR x average.
ANOMALY_UPPER_FACTOR = 1.1
ANOMALY_LOWER_FACTOR = 0.9

# Convert reading_time to TimestampType
energy_data = energy_data.withColumn("reading_time", to_timestamp(col("reading_time")))

# Calculate the overall average energy consumption
overall_average_consumption = (
    energy_data
    .agg(avg("energy_consumed").alias("overall_average"))
    .first()["overall_average"]
)
print("Overall Average Energy Consumption:", overall_average_consumption)

# avg() yields None when there are no non-null readings; fail with a clear
# message instead of an opaque TypeError from multiplying None below.
if overall_average_consumption is None:
    raise ValueError("No energy_consumed values available; cannot compute anomaly thresholds")

# Detect anomalies: identify readings that deviate significantly from the overall average
upper_threshold = overall_average_consumption * ANOMALY_UPPER_FACTOR
lower_threshold = overall_average_consumption * ANOMALY_LOWER_FACTOR
anomalies = energy_data.filter(
    (col("energy_consumed") > upper_threshold)
    | (col("energy_consumed") < lower_threshold)
)

anomalies.show(truncate=False)

# Save the anomalies as CSV. Spark writes a directory of part files at this path.
# "overwrite" mode keeps the cell re-runnable: the default mode raises an error
# when the output path already exists from a previous run.
output_file_path = 'anomalies_output.csv'
anomalies.write.mode("overwrite").csv(output_file_path, header=True)

print("Anomalies detected and saved to:", output_file_path)

# Read the saved output back from the same path to verify what was written
anomalies_df = spark.read.csv(output_file_path, header=True, inferSchema=True)
anomalies_df.show(truncate=False)


Overall Average Energy Consumption: 18.400000000000002
+-------+---------+-------------------+---------------+-----------+
|user_id|device_id|reading_time       |energy_consumed|device_type|
+-------+---------+-------------------+---------------+-----------+
|101    |abc123   |2023-09-01 08:15:00|15.2           |smart_meter|
|102    |def456   |2023-09-01 09:30:00|20.5           |smart_meter|
|101    |abc123   |2023-09-02 08:10:00|14.8           |smart_meter|
|104    |jkl012   |2023-09-03 10:00:00|22.0           |solar_panel|
+-------+---------+-------------------+---------------+-----------+

Anomalies detected and saved to: anomalies_output.csv
+-------+---------+-------------------+---------------+-----------+
|user_id|device_id|reading_time       |energy_consumed|device_type|
+-------+---------+-------------------+---------------+-----------+
|101    |abc123   |2023-09-01 08:15:00|15.2           |smart_meter|
|102    |def456   |2023-09-01 09:30:00|20.5           |smart_meter|
|101  