In [0]:
dbutils.fs.cp("file:/Workspace/Shared/environment_data.csv","dbfs/FileStore/environment_data.csv")

True

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# Initialize Spark session
spark = SparkSession.builder.appName("EnvironmentalPrediction").getOrCreate()

# Load the environmental data
env_df = spark.read.csv("/dbfs/FileStore/environment_data.csv", header=True, inferSchema=True)

# Feature engineering: prepare features for the model
assembler = VectorAssembler(inputCols=[ 'PM25', 'PM10', 'CO2', 'Temperature', 'Humidity', 'WindSpeed'], outputCol="features", handleInvalid="skip")
env_df = assembler.transform(env_df)

train_data, test_data = env_df.randomSplit([0.8, 0.2])
# Train a decision tree model to predict air quality levels
dt = DecisionTreeRegressor(featuresCol="features", labelCol="PM25")
model = dt.fit(env_df)

# Make predictions on test data
predictions = model.transform(test_data)

# Show predictions
predictions.select("features", "PM25", "prediction").show()

+--------------------+----+------------------+
|            features|PM25|        prediction|
+--------------------+----+------------------+
|[15.2,20.1,410.0,...|15.2|15.200000000000003|
+--------------------+----+------------------+



In [0]:
# Save the trained model
model.save("/dbfs/FileStore/environment_data")

In [0]:
from pyspark.ml.regression import DecisionTreeRegressionModel

loaded_model = DecisionTreeRegressionModel.load("/dbfs/FileStore/environmental_model")

new_data = spark.read.csv("dbfs:/FileStore/new_environment_data.csv", header=True, inferSchema=True)
new_data = assembler.transform(new_data)

future_predictions = loaded_model.transform(new_data)
future_predictions.select("features", "prediction").show()


+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[40.1,55.0,420.0,...|      23.5|
|[42.3,52.5,425.0,...|      23.5|
|[43.0,56.0,430.0,...|      23.5|
|[45.2,60.0,440.0,...|      23.5|
|[46.5,61.5,445.0,...|      23.5|
|[48.0,63.0,450.0,...|      23.5|
|[49.5,65.5,455.0,...|      23.5|
|[50.0,67.0,460.0,...|      23.5|
+--------------------+----------+



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
from pyspark.sql.functions import col
# Define the schema for the CSV data
schema = StructType([
    StructField("DateTime", StringType(), True),
    StructField("PM25", DoubleType(), True),
    StructField("PM10", DoubleType(), True),
    StructField("CO2", DoubleType(), True),
    StructField("Temperature", DoubleType(), True),
    StructField("Humidity", DoubleType(), True),
    StructField("WindSpeed", DoubleType(), True)
])


streaming_df = (spark
    .readStream
    .option("header", "true")
    .schema(schema)  # Use the defined schema
    .csv("dbfs:/FileStore/")
  ) 
processed_stream = streaming_df.select("DateTime", "PM25", "Temperature", "Humidity", "WindSpeed")

query = (processed_stream
    .writeStream
    .outputMode("append")
    .format("console")  
    .start())

anomalies = streaming_df.filter(
    (col("PM25") > 45) | (col("Temperature") > 35)
)

anomalies.show()
# query.awaitTermination()
