The first cell starts a TCP socket server in a background thread on 127.0.0.1:9999; every client that connects is sent one random sensor reading per second as a newline-terminated JSON message.

In [1]:
import socket
import random
import json
from datetime import datetime
import time
import threading

# Loopback address and port the demo sensor server listens on; the Spark
# socket source in a later cell connects to the same endpoint.
host = "127.0.0.1"
port = 9999

server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# Must be set before bind(): lets the port be rebound right after a kernel
# restart instead of failing with "Address already in use" while connections
# from the previous run are still in TIME_WAIT.
server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_socket.bind((host, port))
server_socket.listen(1)

def publish_message(client_socket, interval=1.0):
    """Stream random sensor readings to one client until it disconnects.

    Each reading is sent as a single UTF-8 encoded JSON object followed by a
    newline, with the keys ``sensorId``, ``value``, ``quality`` and
    ``timestamp``.

    Args:
        client_socket: Connected socket-like object; only ``sendall()`` and
            ``close()`` are used.
        interval: Seconds to sleep between readings. Defaults to 1.0.

    Returns:
        None. The function returns once sending fails with an ``OSError``
        (for example the peer closed the connection); the socket is closed
        on every exit path.
    """
    try:
        while True:
            random_value = round(random.uniform(60.0, 100.0), 2)
            quality = "Good" if random_value > 70 else "Fair"

            message = {
                "sensorId": "temp-sensor-001",
                "value": random_value,
                "quality": quality,
                # The literal "Z" suffix means UTC, so format the UTC clock
                # rather than the host's local time.
                "timestamp": time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
            }

            message_str = json.dumps(message) + "\n"
            client_socket.sendall(message_str.encode('utf-8'))
            time.sleep(interval)
    except OSError:
        # BrokenPipeError / ConnectionResetError etc.: the reader went away
        # (e.g. the streaming query was stopped). End this publisher quietly
        # instead of dumping a traceback from the background thread.
        pass
    finally:
        client_socket.close()

def start_socket_server():
    """Accept clients forever, handing each one to its own publisher thread.

    Relies on the module-level ``server_socket``, ``host`` and ``port``.
    The loop has no exit condition, so this is meant to run on a background
    thread.
    """
    print(f"Server started on {host}:{port}")
    while True:
        connection, client_address = server_socket.accept()
        print(f"Connection established with {client_address}")

        # One daemon worker per client; daemon threads do not block
        # interpreter shutdown.
        threading.Thread(
            target=publish_message,
            args=(connection,),
            daemon=True,
        ).start()

# Run the accept loop on a daemon thread so this cell returns immediately
# and the server keeps running behind the following cells.
socket_server_thread = threading.Thread(target=start_socket_server, daemon=True)
socket_server_thread.start()

print("Socket server is running in the background.")


Server started on 127.0.0.1:9999Socket server is running in the background.



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StringType, StructType, StructField, DoubleType

# Spark session for the demo, using the "local" master.
spark = (
    SparkSession.builder
    .appName("SocketStreamExample")
    .master("local")
    .getOrCreate()
)

# Fields expected in every JSON line sent by the sensor server; all nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("sensorId", StringType()),
        ("value", DoubleType()),
        ("quality", StringType()),
        ("timestamp", StringType()),
    )
])

# Streaming source reading from the socket server at 127.0.0.1:9999.
streaming_df = (
    spark.readStream
    .format("socket")
    .option("host", "127.0.0.1")
    .option("port", 9999)
    .load()
)

# Parse the "value" column as JSON into a struct named "data", then lift the
# struct's fields to top-level columns.
parsed_df = (
    streaming_df
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.sensorId", "data.value", "data.quality", "data.timestamp")
)


In [3]:
def print_stream(streaming_df, output_mode="append"):
    """Start a streaming query that prints every micro-batch, and block on it.

    Args:
        streaming_df: Streaming DataFrame; its ``writeStream`` is used to
            build and start the query.
        output_mode: Output mode handed to the stream writer. Defaults to
            "append".

    Returns:
        None. The call blocks until the query terminates. A
        ``KeyboardInterrupt`` (interrupting the cell) ends the wait without
        re-raising; any other exception propagates to the caller. In every
        case the query is stopped before the function exits, so a failed or
        interrupted run does not leave a stream running in the background.
    """

    def process_batch(df, epoch_id):
        # epoch_id is part of the foreachBatch callback signature; unused here.
        df.show()

    query = (
        streaming_df.writeStream
        .foreachBatch(process_batch)
        .outputMode(output_mode)
        .start()
    )

    try:
        query.awaitTermination()
    except KeyboardInterrupt:
        # Interrupting the cell is the expected way to end the demo.
        pass
    finally:
        # Always release the query, not only on KeyboardInterrupt.
        query.stop()

In [4]:
# Print every parsed sensor record as it arrives; no output mode is passed,
# so print_stream's default ("append") is used. Interrupt the cell to stop.
print_stream(parsed_df)

Connection established with ('127.0.0.1', 33430)
+--------+-----+-------+---------+
|sensorId|value|quality|timestamp|
+--------+-----+-------+---------+
+--------+-----+-------+---------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|92.25|   Good|2025-04-10T23:42:03Z|
|temp-sensor-001|94.78|   Good|2025-04-10T23:42:04Z|
|temp-sensor-001| 87.0|   Good|2025-04-10T23:42:05Z|
|temp-sensor-001|64.35|   Fair|2025-04-10T23:42:06Z|
|temp-sensor-001|68.61|   Fair|2025-04-10T23:42:07Z|
+---------------+-----+-------+--------------------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|77.85|   Good|2025-04-10T23:42:08Z|
|temp-sensor-001|65.21|   Fair|2025-04-10T23:42:09Z|
|temp-sensor-001|73.97|   Good|2025-04-10T23:42:10Z|
+-------------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/us

In [None]:
# Projection: narrow the parsed stream to just the sensorId and value columns.
selected_columns_df = parsed_df.select("sensorId", "value")

print_stream(selected_columns_df)

Connection established with ('127.0.0.1', 44374)
+--------+-----+
|sensorId|value|
+--------+-----+
+--------+-----+

+---------------+-----+
|       sensorId|value|
+---------------+-----+
|temp-sensor-001|70.78|
+---------------+-----+

+---------------+-----+
|       sensorId|value|
+---------------+-----+
|temp-sensor-001|70.77|
+---------------+-----+

+---------------+-----+
|       sensorId|value|
+---------------+-----+
|temp-sensor-001|63.43|
+---------------+-----+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


In [5]:
# Derived column: value_celsius = (value - 32) * 5 / 9, i.e. the
# Fahrenheit-to-Celsius formula applied to `value`.
# NOTE(review): this assumes `value` is a Fahrenheit reading — confirm.
with_column_df = parsed_df.withColumn("value_celsius", (col("value") - 32) * 5 / 9)

print_stream(with_column_df)

Connection established with ('127.0.0.1', 40492)
+--------+-----+-------+---------+-------------+
|sensorId|value|quality|timestamp|value_celsius|
+--------+-----+-------+---------+-------------+
+--------+-----+-------+---------+-------------+

+---------------+-----+-------+--------------------+-----------------+
|       sensorId|value|quality|           timestamp|    value_celsius|
+---------------+-----+-------+--------------------+-----------------+
|temp-sensor-001|99.37|   Good|2025-04-10T23:42:27Z|37.42777777777778|
+---------------+-----+-------+--------------------+-----------------+

+---------------+-----+-------+--------------------+----------------+
|       sensorId|value|quality|           timestamp|   value_celsius|
+---------------+-----+-------+--------------------+----------------+
|temp-sensor-001|91.29|   Good|2025-04-10T23:42:28Z|32.9388888888889|
+---------------+-----+-------+--------------------+----------------+

+---------------+-----+-------+----------------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


In [None]:
# Row filter: keep only readings whose value is strictly greater than 70.0
# (the same threshold the sensor server uses to label a reading "Good").
filtered_df = parsed_df.filter(col("value") > 70.0)

print_stream(filtered_df)


Connection established with ('127.0.0.1', 44720)
+--------+-----+-------+---------+
|sensorId|value|quality|timestamp|
+--------+-----+-------+---------+
+--------+-----+-------+---------+

+--------+-----+-------+---------+
|sensorId|value|quality|timestamp|
+--------+-----+-------+---------+
+--------+-----+-------+---------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|80.95|   Good|2024-11-13T01:40:20Z|
+---------------+-----+-------+--------------------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|95.16|   Good|2024-11-13T01:40:21Z|
+---------------+-----+-------+--------------------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


In [None]:
# Average of `value` per sensorId. With the dict form of agg() the result
# column is named "avg(value)", as the printed header shows.
# "complete" is passed instead of print_stream's default "append".
# NOTE(review): presumably because append mode is not supported for a
# streaming aggregation without a watermark — confirm against the Structured
# Streaming guide.
aggregated_df = parsed_df.groupBy("sensorId").agg({"value": "avg"})
print_stream(aggregated_df, "complete")

Connection established with ('127.0.0.1', 56934)
+--------+----------+
|sensorId|avg(value)|
+--------+----------+
+--------+----------+

+---------------+----------+
|       sensorId|avg(value)|
+---------------+----------+
|temp-sensor-001|   81.3376|
+---------------+----------+

+---------------+-----------------+
|       sensorId|       avg(value)|
+---------------+-----------------+
|temp-sensor-001|81.36638297872341|
+---------------+-----------------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/utils.py", line 117, in call
    self.func(Da

In [None]:
# Order the per-sensor averages from highest to lowest. The column is
# referenced by its generated name "avg(value)", so this cell needs
# aggregated_df from the aggregation cell to exist in the kernel.
sorted_df = aggregated_df.orderBy(col("avg(value)").desc())
print_stream(sorted_df, "complete")

Connection established with ('127.0.0.1', 47786)
+--------+----------+
|sensorId|avg(value)|
+--------+----------+
+--------+----------+

+---------------+-----------------+
|       sensorId|       avg(value)|
+---------------+-----------------+
|temp-sensor-001|82.89699999999998|
+---------------+-----------------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


In [None]:
from pyspark.sql.functions import window

# Maximum reading per sensor within each 5-second window of `timestamp`,
# flattened to sensorId / window start / window end / max_value.
# NOTE(review): `timestamp` is declared as a string in the schema, so the
# window() call appears to rely on Spark coercing it to a timestamp —
# consider an explicit cast.
windowed_df = (
    parsed_df
    .groupBy("sensorId", window(col("timestamp"), "5 seconds"))
    .agg({"value": "max"})
    .select(
        "sensorId",
        "window.start",
        "window.end",
        col("max(value)").alias("max_value"),
    )
)

print_stream(windowed_df, "complete")

Connection established with ('127.0.0.1', 48854)
+--------+-----+---+---------+
|sensorId|start|end|max_value|
+--------+-----+---+---------+
+--------+-----+---+---------+

+---------------+-------------------+-------------------+---------+
|       sensorId|              start|                end|max_value|
+---------------+-------------------+-------------------+---------+
|temp-sensor-001|2025-04-10 20:28:40|2025-04-10 20:28:45|    87.12|
|temp-sensor-001|2025-04-10 20:28:30|2025-04-10 20:28:35|    98.73|
|temp-sensor-001|2025-04-10 20:28:25|2025-04-10 20:28:30|    94.74|
|temp-sensor-001|2025-04-10 20:28:35|2025-04-10 20:28:40|    79.14|
|temp-sensor-001|2025-04-10 20:28:20|2025-04-10 20:28:25|    85.04|
+---------------+-------------------+-------------------+---------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/us

In [None]:
# Repartition the parsed stream into 3 partitions, then print it with the
# default output mode; no columns or rows are added or removed here.
repartitioned_df = parsed_df.repartition(3)
print_stream(repartitioned_df)

Connection established with ('127.0.0.1', 60816)
+--------+-----+-------+---------+
|sensorId|value|quality|timestamp|
+--------+-----+-------+---------+
+--------+-----+-------+---------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|93.83|   Good|2024-11-12T17:08:55Z|
+---------------+-----+-------+--------------------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|86.41|   Good|2024-11-12T17:08:56Z|
+---------------+-----+-------+--------------------+

+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|71.83|   Good|2024-11-12T17:08:57Z|
+---------------+-----+-------+--------------------+

+-----------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


+---------------+-----+-------+--------------------+
|       sensorId|value|quality|           timestamp|
+---------------+-----+-------+--------------------+
|temp-sensor-001|61.76|   Fair|2024-11-12T17:08:59Z|
+---------------+-----+-------+--------------------+

