In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=d36c22c07189f3759fd6d46e7821b692736431e09ca2418e8a3df7ac54ad899a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Task 1: Set Up Spark Streaming

In [4]:
# Obtain the Spark session used throughout this notebook, registering it
# under the "EnvironmentalMonitoring" application name (an already-active
# session is reused rather than recreated)
spark = (
    SparkSession.builder
    .appName("EnvironmentalMonitoring")
    .getOrCreate()
)

In [5]:
# Set log level to reduce verbosity: at "WARN" Spark reports only warnings
# and errors, keeping INFO/DEBUG chatter out of the cell output
spark.sparkContext.setLogLevel("WARN")

In [18]:
# Batch-load the environmental readings from a CSV file (this stands in for a
# real-time stream). The first row supplies the column names; no schema is
# inferred here, so the numeric columns are cast in the next step.
# Adjust the path to wherever your copy of the CSV file lives.
environment_stream = spark.read.csv("environmental_data_anomalies.csv", header=True)


In [19]:
# Cast every numeric sensor reading to float in a single pass. Existing
# columns are replaced in place, so column order is unchanged; the remaining
# columns (e.g. sensor_id, timestamp) are left untouched.
environment_stream = environment_stream.withColumns(
    {
        name: col(name).cast("float")
        for name in ("pm25", "pm10", "co2", "temperature", "humidity", "wind_speed")
    }
)

# Task 2: Anomaly Detection

In [20]:
# Flag a reading as anomalous when at least one threshold is breached
# (pollution spike or extreme weather conditions)
anomalies = environment_stream.where(
    (col("pm25") > 150)                     # PM2.5 pollution spike
    | ~col("temperature").between(0, 40)    # temperature below 0 or above 40
    | (col("humidity") > 90)                # extremely high humidity
    | (col("wind_speed") > 20)              # high wind speed
)

In [21]:
# Print the flagged readings. By default show() prints at most 20 rows and
# shortens long cell values, which is why most timestamps end in "..."
anomalies.show()

+---------+-----+-----+-----+-----------+--------+----------+--------------------+
|sensor_id| pm25| pm10|  co2|temperature|humidity|wind_speed|           timestamp|
+---------+-----+-----+-----+-----------+--------+----------+--------------------+
|        2|162.0| 80.2|450.0|       42.0|    55.0|       6.0|2024-09-01 12:05:...|
|        5|140.2|100.0|420.0|       28.0|    90.5|       8.0|2024-09-01 12:20:...|
|        6| 85.0|120.0|480.0|       35.5|    60.0|      22.5|2024-09-01 12:25:...|
|        7| 10.5| 15.0|350.0|       40.5|    40.0|      15.0| 2024-09-01 12:30:00|
|        8|  5.0| 10.0|340.0|       38.0|    95.0|      10.0|2024-09-01 12:35:...|
|        9| 12.0| 20.0|360.0|       -5.0|    30.0|       5.0|2024-09-01 12:40:...|
|       10|155.0| 75.0|410.0|       45.0|    85.0|      25.0|2024-09-01 12:45:...|
+---------+-----+-----+-----+-----------+--------+----------+--------------------+

