<a href="https://colab.research.google.com/github/vaniamv/dataprocessing/blob/main/spark_streaming/examples/1-read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [None]:
%pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('Test streaming')  # name given to this application
    .master('local')            # run Spark locally inside the notebook process
    .getOrCreate()
)

# readStream with format "rate"
- readStream
- format("rate")

In [None]:
import pyspark.sql.functions as F

# Build a streaming DataFrame from the built-in "rate" source.
# Nothing is executed yet: this only describes where the stream comes from.
stream = (
    spark.readStream
    .format("rate")
    .load()
)

In [None]:
# Inspect the Python type of the object returned by readStream...load()
type(stream)

In [None]:
# isStreaming tells whether a DataFrame is backed by a streaming source;
# it is True for the DataFrame created with readStream
stream.isStreaming

True

In [None]:
# For comparison: a DataFrame built from an in-memory list is a regular
# (batch) DataFrame, so isStreaming should be False.
columns = ["col1", "col2"]
data = [
    ("c1", "v1"),
    ("c2", "v2"),
]
df = spark.createDataFrame(data, schema=columns)
df.isStreaming

False

In [None]:
# Normal DataFrame operations that do not trigger execution, such as schema
# inspection, work on a streaming DataFrame too. The "rate" source exposes
# two columns: timestamp and value.
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [None]:
# Batch actions such as show() or count() cannot run on a streaming DataFrame:
# queries with streaming sources must be executed with writeStream.start().
# The failure is the point of this cell, so the exception is caught and its
# message printed instead of letting a traceback abort "Restart & Run All".
from pyspark.sql.utils import AnalysisException

try:
    # stream.show() is rejected in the same way
    stream.count()
except AnalysisException as error:
    print(f"AnalysisException: {error}")

# Transform streaming dataframe

In [None]:
# Add a derived column: value2 is the generated counter multiplied by two.
transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

In [None]:
# A transformation applied to a streaming DataFrame yields another streaming DataFrame
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [None]:
# Start the streaming query. start() returns immediately with a handle to the
# running query; the results are read from a table named after the query.
query = (
    transformed.writeStream
    .outputMode('append')      # output mode for the sink: append
    .format('memory')          # sink: in-memory table
    .queryName('rate_report')  # name of the query and of the result table
    .start()
)

# Checking result table

In [None]:
# writeStream...start() hands back a handle to the running query
# (expected to be a StreamingQuery)
type(query)

In [None]:
# The memory sink is queried through a table named after the streaming query.
# Re-run this cell to watch the row count grow while the query is active.
result_table = "rate_report"
print(spark.table(result_table).count())
spark.table(result_table).show(n=20, truncate=False)

# one line per second: the timestamps of consecutive rows are one second apart

304
+----------------------+-----+------+
|timestamp             |value|value2|
+----------------------+-----+------+
|2024-11-22 15:04:15.04|0    |0     |
|2024-11-22 15:04:16.04|1    |2     |
|2024-11-22 15:04:17.04|2    |4     |
|2024-11-22 15:04:18.04|3    |6     |
|2024-11-22 15:04:19.04|4    |8     |
|2024-11-22 15:04:20.04|5    |10    |
|2024-11-22 15:04:21.04|6    |12    |
|2024-11-22 15:04:22.04|7    |14    |
|2024-11-22 15:04:23.04|8    |16    |
|2024-11-22 15:04:24.04|9    |18    |
|2024-11-22 15:04:25.04|10   |20    |
|2024-11-22 15:04:26.04|11   |22    |
|2024-11-22 15:04:27.04|12   |24    |
|2024-11-22 15:04:28.04|13   |26    |
|2024-11-22 15:04:29.04|14   |28    |
|2024-11-22 15:04:30.04|15   |30    |
|2024-11-22 15:04:31.04|16   |32    |
|2024-11-22 15:04:32.04|17   |34    |
|2024-11-22 15:04:33.04|18   |36    |
|2024-11-22 15:04:34.04|19   |38    |
+----------------------+-----+------+
only showing top 20 rows



In [None]:
# Current state of the query: a status message plus flags telling whether
# data is available and whether a trigger is currently active
query.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [None]:
# True here: the query has been started and has not been stopped yet
query.isActive

True

In [None]:
# List of the most recent progress reports, one dict per batch
# (batchId, row counts, durations, source offsets, sink description)
query.recentProgress

[{'id': 'de7ef201-4f96-4a74-a300-f6b9e03fa6e0',
  'runId': '1881ae5a-7787-4ef3-a4f5-57926f6161b3',
  'name': 'rate_report',
  'timestamp': '2024-11-22T15:06:44.044Z',
  'batchId': 149,
  'numInputRows': 1,
  'inputRowsPerSecond': 100.0,
  'processedRowsPerSecond': 12.658227848101266,
  'durationMs': {'addBatch': 22,
   'commitOffsets': 28,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 5,
   'triggerExecution': 79,
   'walCommit': 24},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': 148,
    'endOffset': 149,
    'latestOffset': 149,
    'numInputRows': 1,
    'inputRowsPerSecond': 100.0,
    'processedRowsPerSecond': 12.658227848101266}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 1}},
 {'id': 'de7ef201-4f96-4a74-a300-f6b9e03fa6e0',
  'runId': '1881ae5a-7787-4ef3-a4f5-57926f6161b3',
  'name': 'rate_report',
  'timestamp': '2024-11-22T15:06:45.047Z',
  'batchI

In [None]:
# lastProgress holds the most recent progress report. It is None until the
# first batch has been reported, so guard before indexing; otherwise the cell
# fails with a TypeError when it is run right after start().
last_progress = query.lastProgress
last_progress['batchId'] if last_progress is not None else None

304

# Stop streaming

In [None]:
# Stop the running streaming query (rate_report)
query.stop()

In [None]:
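# awaitTermination: query.awaitTermination() blocks the caller until the query
# is stopped or fails. It is not called here because the query was already stopped.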


# Increase rows per second (rate)


In [None]:

# Same pipeline as before, but the rate source is asked for 20 rows per second.
stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 20)
    .load()
)

# Same derived column as in the first pipeline.
transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

# Write to a separate in-memory table so the first result table is left untouched.
query = (
    transformed.writeStream
    .outputMode('append')
    .format('memory')
    .queryName('rate_report_2')
    .start()
)


In [None]:
# Row count and up to the first 100 rows of the second result table.
result_table = "rate_report_2"
print(spark.table(result_table).count())
spark.table(result_table).show(n=100, truncate=False)

740
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2024-11-22 15:16:27.485|0    |0     |
|2024-11-22 15:16:27.535|1    |2     |
|2024-11-22 15:16:27.585|2    |4     |
|2024-11-22 15:16:27.635|3    |6     |
|2024-11-22 15:16:27.685|4    |8     |
|2024-11-22 15:16:27.735|5    |10    |
|2024-11-22 15:16:27.785|6    |12    |
|2024-11-22 15:16:27.835|7    |14    |
|2024-11-22 15:16:27.885|8    |16    |
|2024-11-22 15:16:27.935|9    |18    |
|2024-11-22 15:16:27.985|10   |20    |
|2024-11-22 15:16:28.035|11   |22    |
|2024-11-22 15:16:28.085|12   |24    |
|2024-11-22 15:16:28.135|13   |26    |
|2024-11-22 15:16:28.185|14   |28    |
|2024-11-22 15:16:28.235|15   |30    |
|2024-11-22 15:16:28.285|16   |32    |
|2024-11-22 15:16:28.335|17   |34    |
|2024-11-22 15:16:28.385|18   |36    |
|2024-11-22 15:16:28.435|19   |38    |
|2024-11-22 15:16:28.485|20   |40    |
|2024-11-22 15:16:28.535|21   |42    |
|2024-11-22 15:16:28.

In [None]:
# Number of rows the (single) source contributed to the latest batch.
# lastProgress is None until the first batch has been reported, so guard
# before indexing; otherwise the cell fails with a TypeError when it is run
# right after start().
last_progress = query.lastProgress
last_progress['sources'][0]['numInputRows'] if last_progress is not None else None

20

In [None]:
# Print a short summary (timestamp, batch id, input row count) for every
# progress report currently kept by the query, separated by "--".
for progress in query.recentProgress:
    print(
        f"timestamp - {progress['timestamp']}",
        f"batchId - {progress['batchId']}",
        f"numInputRows - {progress['numInputRows']}",
        "--",
        sep="\n",
    )

In [None]:
# Stop the second streaming query (rate_report_2)
query.stop()