In [0]:
# Copy the training CSV from the workspace files area into DBFS.
# NOTE(review): the destination has no "dbfs:/" scheme; the load cell reads
# "/dbfs/FileStore/energy_consumption_data.csv", so change both together or not at all.
dbutils.fs.cp(
    "file:/Workspace/Shared/energy_consumption_data.csv",
    "dbfs/FileStore/energy_consumption_data.csv",
)

True

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Get the active Spark session, or create one labelled for this job
spark = (
    SparkSession.builder
    .appName("EnergyConsumptionPrediction")
    .getOrCreate()
)

# Read the staged training CSV: the first row supplies the column names and
# Spark infers each column's type from the data
df = spark.read.csv(
    path="/dbfs/FileStore/energy_consumption_data.csv",
    inferSchema=True,
    header=True,
)

# Print the leading rows as a quick sanity check of the load
df.show()

+----------------+-----------+-----------+-----------+---------------+
|       timestamp|temperature|time_of_day|device_type|energy_consumed|
+----------------+-----------+-----------+-----------+---------------+
|10-09-2024 08:00|       22.5|          8|      meter|            100|
|10-09-2024 09:00|       23.0|          9|     sensor|            110|
|10-09-2024 10:00|       24.1|         10|      meter|            120|
|10-09-2024 11:00|       25.3|         11|     sensor|            115|
|10-09-2024 12:00|       26.4|         12|  appliance|            130|
|10-09-2024 13:00|       27.0|         13|      meter|            125|
+----------------+-----------+-----------+-----------+---------------+



In [0]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable for the same input DataFrame
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=123)

# Stage 1: map each 'device_type' string to a numeric index
device_type_indexer = StringIndexer(
    outputCol="device_type_index",
    inputCol="device_type",
)

# Stage 2: pack the model inputs into a single 'features' vector
# NOTE(review): device_type_index is passed to the regression as a plain number,
# so the model treats the index order as meaningful — confirm this is intended.
feature_columns = ["temperature", "time_of_day", "device_type_index"]
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_columns,
)

# Stage 3: linear regression predicting 'energy_consumed' from 'features'
lr = LinearRegression(
    labelCol="energy_consumed",
    featuresCol="features",
)

# Chain the three stages so they are fitted and applied together
pipeline = Pipeline(stages=[device_type_indexer, assembler, lr])

# Fit every stage on the training split only
pipeline_model = pipeline.fit(train_df)

# Score the held-out split and show inputs, true label, and prediction
predictions = pipeline_model.transform(test_df)
predictions.select(
    "features",
    "energy_consumed",
    "prediction",
).show()


+---------------+---------------+------------------+
|       features|energy_consumed|        prediction|
+---------------+---------------+------------------+
|[25.3,11.0,2.0]|            115|123.41059602649246|
+---------------+---------------+------------------+



In [0]:

# Score the held-out predictions with root mean squared error
# (same units as 'energy_consumed'; lower is better)
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="energy_consumed",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 8.410596026492456


In [0]:
import math

# Compute the coefficient of determination (R-squared) on the held-out predictions
r2_evaluator = RegressionEvaluator(labelCol="energy_consumed", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(predictions)
print(f"R-squared: {r2}")

# R-squared is 1 - SS_res / SS_tot. When the evaluated rows have no variance in
# 'energy_consumed' (for example, a test split with a single row), SS_tot is 0
# and the metric comes out as -inf or NaN, which says nothing about model quality.
if not math.isfinite(r2):
    print(
        "Warning: R-squared is not finite; the evaluated rows have zero label variance "
        "(e.g. a single-row test split). Use a larger test set before interpreting this metric."
    )

R-squared: -inf


In [0]:
# Copy the future (unlabelled) CSV from the workspace files area into DBFS.
# NOTE(review): the destination has no "dbfs:/" scheme; the load cell reads
# "/dbfs/FileStore/future_energy_data.csv", so change both together or not at all.
dbutils.fs.cp(
    "file:/Workspace/Shared/future_energy_data.csv",
    "dbfs/FileStore/future_energy_data.csv",
)

True

In [0]:
# Load new data for prediction (future data)
new_data = spark.read.csv("/dbfs/FileStore/future_energy_data.csv", header=True, inferSchema=True)

# Run the fitted pipeline (indexing, feature assembly, regression) over the
# future rows once. Both names are kept bound to the same DataFrame so any
# cell that refers to either one keeps working.
new_data_transformed = pipeline_model.transform(new_data)
predictions = new_data_transformed

# Display the predictions
predictions.select("timestamp", "features", "prediction").show()

# Save the predictions as CSV. mode="overwrite" replaces output from an earlier
# run; the default save mode raises an error if the path already exists, which
# would break re-running this cell.
predictions.select("timestamp", "prediction").write.csv(
    "/dbfs/FileStore/future_energy_predictions.csv",
    header=True,
    mode="overwrite",
)

print("Predictions saved to CSV file at /dbfs/FileStore/future_energy_predictions.csv")

+----------------+---------------+------------------+
|       timestamp|       features|        prediction|
+----------------+---------------+------------------+
|11-09-2024 08:00| [24.3,8.0,0.0]| 109.3832781457157|
|11-09-2024 09:00| [25.7,9.0,2.0]|120.17384105962915|
|11-09-2024 10:00|[26.5,10.0,1.0]|123.09188741724259|
|11-09-2024 11:00|[27.8,11.0,0.0]|127.64486754969568|
|11-09-2024 12:00|[29.0,12.0,2.0]|137.78145695367326|
|11-09-2024 13:00|[28.4,13.0,1.0]| 136.1216887417357|
|11-09-2024 14:00|[22.7,14.0,0.0]|117.78559602643394|
|11-09-2024 15:00|[23.6,15.0,2.0]|126.94122516550773|
|11-09-2024 16:00|[25.0,16.0,1.0]|131.82119205292872|
|11-09-2024 17:00|[27.1,17.0,0.0]|138.99006622512522|
|11-09-2024 18:00|[24.9,18.0,2.0]| 138.0091059601933|
|11-09-2024 19:00|[28.3,19.0,1.0]|149.42880794697282|
|11-09-2024 20:00|[26.7,20.0,0.0]|144.49917218535603|
|11-09-2024 21:00|[23.4,21.0,2.0]| 139.9213576157769|
|11-09-2024 22:00|[24.5,22.0,1.0]|143.82036423829413|
|11-09-2024 23:00|[29.2,23.0

In [0]:
# Read the saved predictions back in; the header row names the columns and
# Spark infers the column types
predicted_df = spark.read.csv(
    path="/dbfs/FileStore/future_energy_predictions.csv",
    inferSchema=True,
    header=True,
)

# Print the reloaded rows to confirm the save round-trips
predicted_df.show()


+----------------+------------------+
|       timestamp|        prediction|
+----------------+------------------+
|11-09-2024 08:00| 109.3832781457157|
|11-09-2024 09:00|120.17384105962915|
|11-09-2024 10:00|123.09188741724259|
|11-09-2024 11:00|127.64486754969568|
|11-09-2024 12:00|137.78145695367326|
|11-09-2024 13:00| 136.1216887417357|
|11-09-2024 14:00|117.78559602643394|
|11-09-2024 15:00|126.94122516550773|
|11-09-2024 16:00|131.82119205292872|
|11-09-2024 17:00|138.99006622512522|
|11-09-2024 18:00| 138.0091059601933|
|11-09-2024 19:00|149.42880794697282|
|11-09-2024 20:00|144.49917218535603|
|11-09-2024 21:00| 139.9213576157769|
|11-09-2024 22:00|143.82036423829413|
|11-09-2024 23:00|159.49089403965672|
|12-09-2024 00:00| 87.29718543054037|
|12-09-2024 01:00|100.35182119215953|
|12-09-2024 02:00|106.53973509945226|
|12-09-2024 03:00| 97.05711920535424|
+----------------+------------------+



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# Explicit schema for the streamed CSV: the timestamp plus the three model
# input columns. A schema is supplied up front rather than inferred.
schema = StructType([
    StructField("timestamp", StringType(), True),
    StructField("temperature", DoubleType(), True),
    StructField("time_of_day", IntegerType(), True),
    StructField("device_type", StringType(), True)
])

# Stream the future input data, whose columns are the ones this schema describes.
# The saved predictions output only holds (timestamp, prediction), so reading it
# with this schema would put prediction values in 'temperature' and leave the
# remaining columns null.
# NOTE(review): assumes future_energy_data.csv has exactly these four columns in
# this order — confirm against the file.
streaming_df = (spark
    .readStream
    .option("header", "true")
    .schema(schema)  # Use the defined schema
    .csv("/dbfs/FileStore/future_energy_data.csv"))

# Keep only the columns of interest, in a fixed order
processed_stream = streaming_df.select("timestamp", "temperature", "time_of_day", "device_type")

# Echo newly arrived rows to the console; "append" emits only new rows per batch
query = (processed_stream
    .writeStream
    .outputMode("append")
    .format("console")
    .start())

# query.awaitTermination()


In [0]:
# Persist the stream as CSV files.
# Output and checkpoint each get their own directory: the checkpoint holds the
# query's progress state and should not be mixed with the data files, and
# neither belongs in the shared FileStore root next to the input and
# prediction files.
output_query = (processed_stream
    .writeStream
    .outputMode("append")
    .format("csv")  # Write to CSV
    .option("path", "/dbfs/FileStore/energy_stream_output/")
    .option("checkpointLocation", "/dbfs/FileStore/energy_stream_checkpoint/")
    .start())

# output_query.awaitTermination()