In [1]:
import socket
import random
import json
from datetime import datetime
import time
import threading

# Address the demo sensor server listens on; the Spark socket source connects here.
host = "127.0.0.1"
port = 9999

server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# Allow the port to be re-bound right after a kernel restart, while connections
# from the previous run may still be in TIME_WAIT; without this, bind() can fail
# with "Address already in use" on Restart & Run All.
server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_socket.bind((host, port))
server_socket.listen(1)

def publish_message(client_socket, interval_seconds=1):
    """Stream synthetic sensor readings to one client until it disconnects.

    Each reading is a JSON object terminated by a newline and encoded as UTF-8,
    with the keys ``sensorId``, ``value``, ``quality`` and ``timestamp``.

    Args:
        client_socket: Connected socket-like object providing ``sendall`` and
            ``close``.
        interval_seconds: Pause between consecutive readings, in seconds.

    Returns:
        None. The function returns once sending fails (for example because the
        client closed the connection); the socket is always closed on exit.
    """
    try:
        while True:
            random_value = round(random.uniform(60.0, 100.0), 2)
            quality = "Good" if random_value > 70 else "Fair"

            message = {
                "sensorId": "temp-sensor-001",
                "value": random_value,
                "quality": quality,
                # NOTE(review): datetime.now() is local time although the "Z"
                # suffix denotes UTC. Left unchanged because the consumer parses
                # "Z" as a literal character; confirm the intended time zone.
                "timestamp": datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
            }

            message_str = json.dumps(message) + "\n"
            client_socket.sendall(message_str.encode('utf-8'))
            time.sleep(interval_seconds)
    except OSError:
        # The peer closed or reset the connection (BrokenPipeError,
        # ConnectionResetError, ...). That is the normal way for this loop to
        # end, so finish quietly instead of killing the thread with a traceback.
        pass
    finally:
        # Release the file descriptor no matter how the loop ended.
        client_socket.close()

def start_socket_server():
    """Accept clients forever, handing each one to its own publisher thread.

    Uses the module-level ``server_socket``; every accepted connection is served
    by ``publish_message`` on a daemon thread, so this loop never returns.
    """
    print(f"Server started on {host}:{port}")
    while True:
        conn, client_address = server_socket.accept()
        print(f"Connection established with {client_address}")

        # Daemon thread: it must not keep the interpreter alive on shutdown.
        threading.Thread(
            target=publish_message,
            args=(conn,),
            daemon=True,
        ).start()

# Run the accept loop on a daemon thread so this cell returns immediately and
# the remaining cells can connect to the server.
socket_server_thread = threading.Thread(target=start_socket_server, daemon=True)
socket_server_thread.start()

print("Socket server is running in the background.")


Server started on 127.0.0.1:9999Socket server is running in the background.



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, to_timestamp
from pyspark.sql.types import StringType, StructType, StructField, DoubleType

# Spark session running in local mode for this demo.
spark = (
    SparkSession.builder
    .appName("SocketStreamExample")
    .master("local")
    .getOrCreate()
)

# Layout of each JSON message; the timestamp arrives as text and is converted
# to a real timestamp when the stream is parsed below.
schema = StructType([
    StructField("sensorId", StringType(), nullable=True),
    StructField("value", DoubleType(), nullable=True),
    StructField("quality", StringType(), nullable=True),
    StructField("timestamp", StringType(), nullable=True),
])

# Streaming read from the TCP socket at 127.0.0.1:9999.
streaming_df = (
    spark.readStream
    .format("socket")
    .option("host", "127.0.0.1")
    .option("port", 9999)
    .load()
)

# Decode the `value` column as JSON into a `data` struct, then flatten it into
# top-level columns with `timestamp` parsed from its string form.
parsed_df = (
    streaming_df
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select(
        col("data.sensorId"),
        col("data.value"),
        col("data.quality"),
        to_timestamp(col("data.timestamp"), "yyyy-MM-dd'T'HH:mm:ss'Z'").alias("timestamp"),
    )
)


In [3]:
def print_stream(streaming_df, output_mode="append"):
    """Show every micro-batch of a streaming DataFrame until interrupted.

    Args:
        streaming_df: Streaming DataFrame whose batches should be displayed.
        output_mode: Value handed to ``outputMode()``; defaults to ``"append"``.

    The call blocks in ``awaitTermination()``. A ``KeyboardInterrupt`` (for
    example interrupting the cell) stops the query.
    """

    def show_batch(batch_df, batch_id):
        # foreachBatch passes the batch and its id; only the batch is used.
        batch_df.show()

    writer = streaming_df.writeStream.foreachBatch(show_batch).outputMode(output_mode)
    query = writer.start()

    try:
        query.awaitTermination()
    except KeyboardInterrupt:
        query.stop()

In [None]:
from pyspark.sql.functions import col, window, avg, when

# Window averages strictly above this value are flagged as anomalies.
threshold_value = 80.0

# Average reading per sensor over 5-second windows of the parsed timestamp,
# plus a 0/1 flag marking windows whose average exceeds the threshold.
windowed_df = (
    parsed_df
    .groupBy(window(col("timestamp"), "5 seconds"), "sensorId")
    .agg(avg("value").alias("average_value"))
    .withColumn(
        "anomaly_flag",
        when(col("average_value") > threshold_value, 1).otherwise(0),
    )
)

# Blocks until interrupted, printing the aggregated result for each batch.
print_stream(windowed_df, output_mode="complete")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|{2025-04-10 23:50...|temp-sensor-001|            80.82|           1|
|{2025-04-10 23:51...|temp-sensor-001|            85.85|           1|
|{2025-04-10 23:52...|temp-sensor-001|            80.69|           1|
|{2025-04-10 23:53...|temp-sensor-001|87.94000000000001|           1|
|{2025-04-10 23:52...|temp-sensor-001|81.31800000000001|           1|
+--------------------+---------------+-----------------+------------+
only showing top 20 rows

+--------------------+---------------+-----------------+------------+
|              window|       sensorId|    average_value|anomaly_flag|
+--------------------+---------------+-----------------+------------+
|{2025-04-10 23:52...|temp-sensor-001|           75.748|           0|
|{2025-04-10 23:52...|temp-sensor-001|83.10999999999999|           1|
|{2025-04-10 23:53...|temp-sensor-001|80.59200000000001|           1|
|{2025-04-10 23:54...|temp-sensor-001|           78.024|           0|