<a href="https://colab.research.google.com/github/vaniamv/dataprocessing/blob/main/spark_streaming/examples/example_2_rate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Usecase 2
- Reading data from "rate"
- Aggregating data by window time
- Checking results from query in memory

# Setting up PySpark

In [None]:
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()

# Write output in memory

In [2]:
import pyspark.sql.functions as F

# read stream
stream1 = spark.readStream.format("rate").option("rowsPerSecond", 10).load() #gera 10 linhas por seg

# transform
transformed = stream1.withColumn("minute", F.minute("timestamp")) #cria coluna min
agg = transformed.groupBy(F.window(transformed.timestamp, "5 seconds")).count() #agrega tempo de seg

# write stream in memory
query = (agg.writeStream
.format('memory')
.queryName('my_query')
.outputMode('complete')
.start()
)

In [9]:
spark.sql("select * from my_query order by window desc").show(10,False) #10 linhas por segunto em 5 seg = 50 count, as que não são 50 serão completadas

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2024-11-23 15:46:15, 2024-11-23 15:46:20}|14   |
|{2024-11-23 15:46:10, 2024-11-23 15:46:15}|50   |
|{2024-11-23 15:46:05, 2024-11-23 15:46:10}|50   |
|{2024-11-23 15:46:00, 2024-11-23 15:46:05}|50   |
|{2024-11-23 15:45:55, 2024-11-23 15:46:00}|50   |
|{2024-11-23 15:45:50, 2024-11-23 15:45:55}|50   |
|{2024-11-23 15:45:45, 2024-11-23 15:45:50}|26   |
+------------------------------------------+-----+



In [11]:
query.lastProgress

{'id': '2544777a-2822-4527-9894-1cc9b85d1477',
 'runId': 'b6291df0-1cb1-4092-be94-0a0a04401775',
 'name': 'my_query',
 'timestamp': '2024-11-23T15:49:29.539Z',
 'batchId': 13,
 'numInputRows': 170,
 'inputRowsPerSecond': 10.298661174047373,
 'processedRowsPerSecond': 11.210762331838565,
 'durationMs': {'addBatch': 15052,
  'commitOffsets': 61,
  'getBatch': 0,
  'latestOffset': 0,
  'queryPlanning': 23,
  'triggerExecution': 15164,
  'walCommit': 27},
 'stateOperators': [{'operatorName': 'stateStoreSave',
   'numRowsTotal': 45,
   'numRowsUpdated': 4,
   'allUpdatesTimeMs': 293,
   'numRowsRemoved': 0,
   'allRemovalsTimeMs': 0,
   'commitTimeMs': 9797,
   'memoryUsedBytes': 97136,
   'numRowsDroppedByWatermark': 0,
   'numShufflePartitions': 200,
   'numStateStoreInstances': 200,
   'customMetrics': {'loadedMapCacheHitCount': 5200,
    'loadedMapCacheMissCount': 0,
    'stateOnCurrentVersionSizeBytes': 29968}}],
 'sources': [{'description': 'RateStreamV2[rowsPerSecond=10, rampUpTimeSe

In [12]:
query.stop()

# Write output as json

In [None]:
!rm -rf content/output

In [13]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame

def save_parquet(df, batch_id):
  (df
   .withColumn("batch_id",F.lit(batch_id))
   .withColumn("load_time",F.current_timestamp())
   .write.mode("append")
   .parquet("content/output/rate_parquet")
  )

# read stream
stream1 = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# transform
transformed = stream1.withWatermark("timestamp", "5 seconds").withColumn("minute", F.minute("timestamp"))
agg = transformed.groupBy(F.window(transformed.timestamp, "5 seconds")).count()

# write stream as parquet with foreachBatch
query = (agg.writeStream
.option('checkpointLocation', 'content/output/checkpoint')
.trigger(processingTime='20 seconds') #tem de ser sempre superior a janela de agregação
.outputMode('append')
.foreachBatch(save_parquet) #chama o método com a lógica de escrita
.start()
) #não poderia ter format parquet, daria erro pk tem uma agregação!


In [17]:
result = spark.read.format("parquet").load("content/output/rate_parquet/")
result.sort(F.asc("window")).show(100, False)

+------------------------------------------+-----+--------+--------------------------+
|window                                    |count|batch_id|load_time                 |
+------------------------------------------+-----+--------+--------------------------+
|{2024-11-23 15:54:40, 2024-11-23 15:54:45}|16   |2       |2024-11-23 15:55:20.204062|
|{2024-11-23 15:54:45, 2024-11-23 15:54:50}|50   |2       |2024-11-23 15:55:20.204062|
|{2024-11-23 15:54:50, 2024-11-23 15:54:55}|50   |2       |2024-11-23 15:55:20.204062|
|{2024-11-23 15:54:55, 2024-11-23 15:55:00}|50   |3       |2024-11-23 15:55:40.299437|
|{2024-11-23 15:55:00, 2024-11-23 15:55:05}|50   |3       |2024-11-23 15:55:40.299437|
|{2024-11-23 15:55:05, 2024-11-23 15:55:10}|50   |3       |2024-11-23 15:55:40.299437|
|{2024-11-23 15:55:10, 2024-11-23 15:55:15}|50   |4       |2024-11-23 15:56:00.271563|
|{2024-11-23 15:55:15, 2024-11-23 15:55:20}|50   |4       |2024-11-23 15:56:00.271563|
|{2024-11-23 15:55:20, 2024-11-23 15:55:25}

In [None]:
#o mesmo batch id pode criar mais de uma linha

In [18]:
query.stop()

ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/utils.py", line 117, in call
    self.func(DataFrame(jdf, wrapped_session_jdf), batch_id)
  File "<ipython-input-13-695a23616afa>", line 9, in save_parquet
    .parquet("content/output/rate_parquet")
  File "/usr/local/lib/python3.10/dist-packages/pyspark/sql/readwriter.py", line 1721, in parquet
    self._jwrite.parquet(path)
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/usr/local/lib/python3.10/dist-packages/pyspark/errors/exceptions/captured.py

# Enrich data with faker

In [20]:
!pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.9 MB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m1.3/1.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.0.0


In [None]:
!rm -rf content/output/events

In [21]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker

def insert_into_table(df, batch_id):
  fake = Faker()
  new_columns = {
      'name': F.lit(fake.name()),
      'address': F.lit(fake.address()),
      'email': F.lit(fake.email()),
      'dob': F.lit(fake.date_of_birth()),
      'phone': F.lit(fake.phone_number())
  }
  df = df.withColumns(new_columns)
  df.write.mode("append").format("parquet").save("content/output/events")

# read stream
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load() #uma linha por segundo

# write stream
query = (df_stream.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds') #pode acontecer de duplicar os dados desta forma
.foreachBatch(insert_into_table)
.start()
)

In [26]:
query.stop()


In [25]:
spark.read.parquet("content/output/events").show(100, False)

+-----------------------+-----+-----------------------------+----------------------------------------------------------+-----------------------------+----------+----------------------+
|timestamp              |value|name                         |address                                                   |email                        |dob       |phone                 |
+-----------------------+-----+-----------------------------+----------------------------------------------------------+-----------------------------+----------+----------------------+
|2024-11-23 16:03:26.669|56   |Matthew Peterson             |5142 Rodriguez Pines Suite 699\nSteelechester, VA 38115   |oneillkathleen@example.org   |1915-08-03|001-906-248-5806x4125 |
|2024-11-23 16:03:52.669|82   |Nicholas Davis               |396 Mclaughlin Islands Suite 914\nBakerchester, OH 32412  |powelljennifer@example.com   |1950-09-12|(271)982-7944x5133    |
|2024-11-23 16:02:34.669|4    |William Williams             |41259 James Gl

In [27]:
query.stop()