<a href="https://colab.research.google.com/github/vaniamv/dataprocessing/blob/main/spark_streaming/examples/1-read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [1]:
# Install PySpark into the kernel's environment (needed on a fresh Colab runtime).
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session that runs locally for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Test streaming")
    .getOrCreate()
)

# readStream with format "rate"
- readStream
- format("rate")

In [3]:
import pyspark.sql.functions as F

# Open a streaming source using the built-in "rate" format.
stream = (
    spark.readStream
    .format("rate")
    .load()
)

In [4]:
# Inspect the Python type of the object returned by readStream ... load().
type(stream)

In [5]:
# Check whether this is a streaming DataFrame (isStreaming is True for one).
stream.isStreaming

True

In [6]:
# A DataFrame built from in-memory rows is a batch DataFrame, so
# isStreaming is expected to be False here.
data = [
    ("c1", "v1"),
    ("c2", "v2"),
]
columns = ["col1", "col2"]
df = spark.createDataFrame(data, schema=columns)
df.isStreaming

False

In [7]:
# Regular DataFrame operations such as printSchema() also work on a streaming
# DataFrame; this one has a timestamp column and a long value column.
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [8]:
# Batch actions such as count() or show() cannot run on a streaming DataFrame:
# Spark raises AnalysisException because queries with streaming sources must
# be executed with writeStream.start(). The error is the point of this cell,
# so it is caught and displayed instead of aborting a "Restart & Run All".
from pyspark.sql.utils import AnalysisException

try:
    stream.count()
except AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
rate

# Transform streaming dataframe

In [10]:
# Derive a new column, value2, holding twice the generated value.
transformed = stream.withColumn(
    colName="value2",
    col=F.col("value") * 2,
)

In [11]:
# The transformed DataFrame is still a streaming DataFrame.
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [12]:
# Start the streaming query: new rows are appended to an in-memory table
# that takes its name from queryName.
query = (
    transformed.writeStream
    .format("memory")
    .queryName("rate_report")
    .outputMode("append")
    .start()
)

# Checking result table

In [13]:
# Inspect the type of the handle returned by start() (a StreamingQuery).
type(query)

In [16]:
# The memory sink's rows can be read back as a table named after the
# queryName; print how many rows have arrived so far.
print(spark.table("rate_report").count())
# Show up to 20 rows without truncating the column values.
spark.table("rate_report").show(n=20, truncate=False)

# With the default settings the rate source produces one row per second.

26
+----------------------+-----+------+
|timestamp             |value|value2|
+----------------------+-----+------+
|2024-11-23 14:12:59.84|0    |0     |
|2024-11-23 14:13:00.84|1    |2     |
|2024-11-23 14:13:01.84|2    |4     |
|2024-11-23 14:13:02.84|3    |6     |
|2024-11-23 14:13:03.84|4    |8     |
|2024-11-23 14:13:04.84|5    |10    |
|2024-11-23 14:13:05.84|6    |12    |
|2024-11-23 14:13:06.84|7    |14    |
|2024-11-23 14:13:07.84|8    |16    |
|2024-11-23 14:13:08.84|9    |18    |
|2024-11-23 14:13:09.84|10   |20    |
|2024-11-23 14:13:10.84|11   |22    |
|2024-11-23 14:13:11.84|12   |24    |
|2024-11-23 14:13:12.84|13   |26    |
|2024-11-23 14:13:13.84|14   |28    |
|2024-11-23 14:13:14.84|15   |30    |
|2024-11-23 14:13:15.84|16   |32    |
|2024-11-23 14:13:16.84|17   |34    |
|2024-11-23 14:13:17.84|18   |36    |
|2024-11-23 14:13:18.84|19   |38    |
+----------------------+-----+------+
only showing top 20 rows



In [17]:
# Current status of the query: a message plus data-availability and
# trigger-activity flags.
query.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [18]:
# Whether the query is still running (it has not been stopped yet at this point).
query.isActive

True

In [19]:
# List of progress reports (one dict per recent batch) with timings,
# row counts, and source/sink details.
query.recentProgress

[{'id': '5876aac2-5324-41d4-8b65-02a5bafb56c4',
  'runId': '388fd28e-3663-47bc-b696-12a290f4097e',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:12:59.973Z',
  'batchId': 0,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'addBatch': 1461,
   'commitOffsets': 72,
   'getBatch': 4,
   'latestOffset': 0,
   'queryPlanning': 84,
   'triggerExecution': 1717,
   'walCommit': 61},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': None,
    'endOffset': 0,
    'latestOffset': 0,
    'numInputRows': 0,
    'inputRowsPerSecond': 0.0,
    'processedRowsPerSecond': 0.0}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 0}},
 {'id': '5876aac2-5324-41d4-8b65-02a5bafb56c4',
  'runId': '388fd28e-3663-47bc-b696-12a290f4097e',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:13:01.747Z',
  'batchId': 1,
  'numInputRows': 1,
  'inp

In [20]:
# batchId taken from the most recent progress report.
query.lastProgress['batchId']

46

# Stop streaming

In [21]:
# Stop the running streaming query.
query.stop()

In [26]:
# The in-memory table can still be queried with SQL after the query is stopped.
spark.sql("select * from rate_report").count()

50

In [None]:
# awaitTermination


# Increase rows per second (rate)


In [27]:

# Read the rate source again, this time generating 20 rows every second.
stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 20)
    .load()
)

# Same derived column as before: value2 is twice the generated value.
transformed = stream.withColumn("value2", F.col("value") * 2)

# Write to a differently named in-memory table than the first query used.
query = (
    transformed.writeStream
    .format("memory")
    .queryName("rate_report_2")
    .outputMode("append")
    .start()
)


In [28]:
# Print how many rows the second query has written to its table so far.
print(spark.table("rate_report_2").count())
# Show up to 100 rows without truncating the column values.
spark.table("rate_report_2").show(n=100, truncate=False)

880
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2024-11-23 14:17:53.948|0    |0     |
|2024-11-23 14:17:53.998|1    |2     |
|2024-11-23 14:17:54.048|2    |4     |
|2024-11-23 14:17:54.098|3    |6     |
|2024-11-23 14:17:54.148|4    |8     |
|2024-11-23 14:17:54.198|5    |10    |
|2024-11-23 14:17:54.248|6    |12    |
|2024-11-23 14:17:54.298|7    |14    |
|2024-11-23 14:17:54.348|8    |16    |
|2024-11-23 14:17:54.398|9    |18    |
|2024-11-23 14:17:54.448|10   |20    |
|2024-11-23 14:17:54.498|11   |22    |
|2024-11-23 14:17:54.548|12   |24    |
|2024-11-23 14:17:54.598|13   |26    |
|2024-11-23 14:17:54.648|14   |28    |
|2024-11-23 14:17:54.698|15   |30    |
|2024-11-23 14:17:54.748|16   |32    |
|2024-11-23 14:17:54.798|17   |34    |
|2024-11-23 14:17:54.848|18   |36    |
|2024-11-23 14:17:54.898|19   |38    |
|2024-11-23 14:17:54.948|20   |40    |
|2024-11-23 14:17:54.998|21   |42    |
|2024-11-23 14:17:55.

In [30]:
# Details of the first (only) source in the latest progress report:
# description, offsets, and row-rate figures.
query.lastProgress['sources'][0]

{'description': 'RateStreamV2[rowsPerSecond=20, rampUpTimeSeconds=0, numPartitions=default',
 'startOffset': 400,
 'endOffset': 401,
 'latestOffset': 401,
 'numInputRows': 20,
 'inputRowsPerSecond': 2000.0,
 'processedRowsPerSecond': 246.91358024691357}

In [29]:
# Number of rows the source contributed to the latest batch.
query.lastProgress['sources'][0]['numInputRows']

20

In [31]:
# Summarise every recent progress report: when the batch ran, its id, and
# how many rows it read. Each report is followed by a "--" separator line.
for batch in query.recentProgress:
    for field in ("timestamp", "batchId", "numInputRows"):
        print(f"{field} - {batch[field]}")
    print("--")

timestamp - 2024-11-23T14:23:13.958Z
batchId - 320
numInputRows - 20
--
timestamp - 2024-11-23T14:23:14.954Z
batchId - 321
numInputRows - 20
--
timestamp - 2024-11-23T14:23:15.951Z
batchId - 322
numInputRows - 20
--
timestamp - 2024-11-23T14:23:16.953Z
batchId - 323
numInputRows - 20
--
timestamp - 2024-11-23T14:23:17.955Z
batchId - 324
numInputRows - 20
--
timestamp - 2024-11-23T14:23:18.958Z
batchId - 325
numInputRows - 20
--
timestamp - 2024-11-23T14:23:19.956Z
batchId - 326
numInputRows - 20
--
timestamp - 2024-11-23T14:23:20.958Z
batchId - 327
numInputRows - 20
--
timestamp - 2024-11-23T14:23:21.950Z
batchId - 328
numInputRows - 20
--
timestamp - 2024-11-23T14:23:22.954Z
batchId - 329
numInputRows - 20
--
timestamp - 2024-11-23T14:23:23.953Z
batchId - 330
numInputRows - 20
--
timestamp - 2024-11-23T14:23:24.950Z
batchId - 331
numInputRows - 20
--
timestamp - 2024-11-23T14:23:25.950Z
batchId - 332
numInputRows - 20
--
timestamp - 2024-11-23T14:23:26.954Z
batchId - 333
numInputRows 

In [32]:
# Stop the second streaming query.
query.stop()