<a href="https://colab.research.google.com/github/vaniamv/dataprocessing/blob/main/spark_streaming/examples/example_3_api_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [None]:
# Install PySpark into the environment used by this notebook's kernel.
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise create a local
# one named 'Test streaming'.
spark = (
    SparkSession.builder
    .appName('Test streaming')
    .master('local')
    .getOrCreate()
)

# Handle to the underlying SparkContext of that session.
sc = spark.sparkContext

In [3]:
# Start from a clean slate: delete any previous landing and bronze data,
# then recreate an empty landing directory for the producer to write into.
!rm -rf /content/landing
!rm -rf /content/bronze
!mkdir -p /content/landing

## Simulate producer:
- extract data from API
- store data as json in the lake
- run the task asynchronously, so the notebook does not have to wait for it to finish

In [4]:
import requests
from pyspark.sql.types import *
import json
import datetime
import asyncio

async def ingest_from_api(url: str, table: str, schema: StructType = None):
  """Fetch a JSON payload from `url` and land it as a timestamped file.

  On an HTTP 200 response the decoded payload is written to
  `/content/landing/{table}_{YYYYmmddHHMMSS}.json`. Any other status code is
  reported and nothing is written.

  Args:
    url: endpoint to call with an HTTP GET.
    table: logical name used as the file-name prefix.
    schema: currently unused; kept for interface compatibility.

  Raises:
    requests.RequestException: on connection errors or when the request
      exceeds the timeout.
    ValueError: if the response body is not valid JSON.
  """
  import os

  request_timeout_s = 30  # never wait forever on a stalled connection

  # requests is a blocking library: run the call in the default executor so
  # the notebook's event loop is not frozen while the request is in flight.
  loop = asyncio.get_running_loop()
  response = await loop.run_in_executor(
      None, lambda: requests.get(url, timeout=request_timeout_s))

  if response.status_code != 200:
    print(f"[ingest_from_api] {url} returned HTTP {response.status_code}; nothing written")
    return

  timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  data = response.json()

  # Write to a hidden temporary file first and rename it into place, so a
  # streaming reader watching the directory never sees a half-written file.
  final_path = f"/content/landing/{table}_{timestamp}.json"
  tmp_path = f"/content/landing/.{table}_{timestamp}.json.tmp"
  with open(tmp_path, "w") as f:
    json.dump(data, f)
  os.replace(tmp_path, final_path)

async def producer(loop: int, interval_time: int):
  """Repeatedly pull the vehicles and lines endpoints into the landing area.

  Args:
    loop: number of fetch rounds to run.
    interval_time: seconds to wait between two consecutive rounds.

  A failure while ingesting one endpoint is reported and does not stop the
  remaining endpoints or rounds; this matters because the producer runs as a
  background task whose exceptions nobody is waiting for.
  """
  sources = (
      ("https://api.carrismetropolitana.pt/vehicles", "vehicles"),
      ("https://api.carrismetropolitana.pt/lines", "lines"),
  )
  for i in range(loop):
    for url, table in sources:
      try:
        await ingest_from_api(url, table)
      except Exception as exc:  # task cancellation is not an Exception and still propagates
        print(f"[producer] round {i + 1}/{loop}: ingesting '{table}' failed: {exc!r}")
    # No need to wait after the final round.
    if i < loop - 1:
      await asyncio.sleep(interval_time)

async def main():
  asyncio.create_task(producer(10, 30)) #chama 10 vezes com 30 seg de intervalo

await main()

In [15]:
# Count the entries in the landing directory; wc prints line, word and byte
# counts, and the first number is the number of files listed.
!ls /content/landing | wc #word-count

     20      20     550


- Read from /content/landing as streaming
- store data in memory (for testing)
- store data in the bronze layer

In [7]:
from pyspark.sql.types import *

# Explicit schema for the vehicles payload; every field is nullable.
vehicle_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('bearing', IntegerType()),
        ('block_id', StringType()),
        ('current_status', StringType()),
        ('id', StringType()),
        ('lat', FloatType()),
        ('line_id', StringType()),
        ('lon', FloatType()),
        ('pattern_id', StringType()),
        ('route_id', StringType()),
        ('schedule_relationship', StringType()),
        ('shift_id', StringType()),
        ('speed', FloatType()),
        ('stop_id', StringType()),
        ('timestamp', TimestampType()),
        ('trip_id', StringType()),
    ]
])

# Read the landing directory as a stream of JSON files; only paths matching
# the "vehicles*" pattern are picked up.
stream = (
    spark.readStream
    .schema(vehicle_schema)
    .format("json")
    .load("/content/landing/vehicles*")
)

# Drop rows that are identical in every column.
# NOTE(review): no watermark is set on this stream, so the deduplication
# state presumably keeps growing for as long as the query runs - confirm
# this is acceptable outside of a short test.
dedup = stream.dropDuplicates()

In [14]:
# Confirm that the deduplicated DataFrame is a streaming DataFrame.
dedup.isStreaming

True

In [8]:
# In-memory sink, for testing only.
# Re-running this cell must not leave a previous run of the query active,
# so stop it first if it exists.
try:
  if query.isActive:
    query.stop()
except NameError:
  # First run of the cell: `query` has not been defined yet.
  pass
except Exception as exc:
  # Stopping is best effort; report the problem instead of hiding it.
  print(f"Could not stop the previous 'vehicles' query: {exc!r}")

# Results are exposed as the in-memory table "vehicles".
query = (dedup.writeStream.format("memory").option("queryName", "vehicles").start())

In [16]:
# Inspect the current status of the in-memory streaming query.
query.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [22]:
# Poll the status of the in-memory streaming query again.
query.status

{'message': 'Getting offsets from FileStreamSource[file:/content/landing/vehicles*]',
 'isDataAvailable': False,
 'isTriggerActive': True}

In [23]:
# Show the most recent progress report of the streaming query
# (batch id, row counts, state-store metrics, source offsets).
query.lastProgress

{'id': '3ab564c0-2711-46b7-ab67-3820e2dfd5f2',
 'runId': 'dc9600d4-348d-4ebe-9839-3cc5334b5524',
 'name': 'vehicles',
 'timestamp': '2024-11-30T10:11:47.359Z',
 'batchId': 4,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 4, 'triggerExecution': 4},
 'stateOperators': [{'operatorName': 'dedupe',
   'numRowsTotal': 3780,
   'numRowsUpdated': 0,
   'allUpdatesTimeMs': 3531,
   'numRowsRemoved': 0,
   'allRemovalsTimeMs': 2,
   'commitTimeMs': 6443,
   'memoryUsedBytes': 1875984,
   'numRowsDroppedByWatermark': 0,
   'numShufflePartitions': 200,
   'numStateStoreInstances': 200,
   'customMetrics': {'loadedMapCacheHitCount': 600,
    'loadedMapCacheMissCount': 0,
    'numDroppedDuplicateRows': 54,
    'stateOnCurrentVersionSizeBytes': 1652176}}],
 'sources': [{'description': 'FileStreamSource[file:/content/landing/vehicles*]',
   'startOffset': {'logOffset': 3},
   'endOffset': {'logOffset': 3},
   'latestOffset': None,
   'n

In [24]:
# Stop the in-memory test query.
query.stop()

In [13]:
# Display the first rows of the in-memory table "vehicles" that the
# memory-sink streaming query populates.
spark.table("vehicles").show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|    132|20241130-64020054...| IN_TRANSIT_TO|44|12068| 38.71411|   4203|-8.966258|  4203_0_2|  4203_0|            SCHEDULED|121160000007| 6.388889| 100105|2024-11-30 09:47:27|4203_0_2|3000|093...|
|     63|             4514-21|    STOPPED_AT| 42|2377|38.786182|   2822|-9.183305|  2822_0_2|  2822_0|            SCHEDULED|        4514|      0.0| 110085|2024-11-30 09:47:33|2822_0_2|2|1|0930...|
|    306|      

In [None]:
# Remove any previous bronze output (Parquet data and streaming checkpoint)
# before starting the bronze query.
!rm -rf /content/bronze

In [25]:
from pyspark.sql.functions import *

# The aggregation below is written in append mode, which requires a
# watermark on the event-time column.
# Note: window, min and count are the PySpark SQL functions brought in by the
# wildcard import above (min is not the Python built-in here).
transformed = stream.withWatermark("timestamp", "60 seconds")

# Per 5-minute window and vehicle status: earliest timestamp and row count.
agg = (
    transformed
    .groupBy(window("timestamp", "5 minutes"), "current_status")
    .agg(
        min("timestamp").alias("init_timestamp"),
        count("*").alias("count"),
    )
)

def insert_vehicles(df, batch_id):
  """Append one micro-batch to the bronze vehicles dataset as Parquet.

  Args:
    df: the micro-batch DataFrame handed over by foreachBatch.
    batch_id: identifier of the micro-batch; not used here.
  """
  parquet_writer = df.write.format("parquet").mode("append")
  parquet_writer.save("/content/bronze/vehicles")

# Bronze sink: every 20 seconds a micro-batch of finalized windows is handed
# to insert_vehicles, which appends it to the Parquet dataset. Progress is
# tracked in the checkpoint directory.
query2 = (
    agg.writeStream
    .foreachBatch(insert_vehicles)
    .outputMode("append")
    .trigger(processingTime='20 seconds')
    .option("checkpointLocation", "/content/bronze/checkpoint")
    .start()
)

In [50]:
# Inspect the current status of the bronze streaming query.
query2.status

{'message': 'Waiting for next trigger',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [49]:
# Check whether the bronze streaming query is still running.
query2.isActive

True

In [46]:
# Read back what the bronze query has written so far and print up to
# 100 rows without truncating the column values.
spark.read.parquet("/content/bronze/vehicles/*").show(n=100, truncate=False)

+------------------------------------------+--------------+-------------------+-----+
|window                                    |current_status|init_timestamp     |count|
+------------------------------------------+--------------+-------------------+-----+
|{2024-11-30 09:40:00, 2024-11-30 09:45:00}|IN_TRANSIT_TO |2024-11-30 09:44:41|3    |
|{2024-11-30 09:40:00, 2024-11-30 09:45:00}|INCOMING_AT   |2024-11-30 09:44:59|1    |
|{2024-11-30 09:40:00, 2024-11-30 09:45:00}|STOPPED_AT    |2024-11-30 09:44:41|3    |
+------------------------------------------+--------------+-------------------+-----+



In [None]:
# Read the bronze Parquet output again and print up to 100 rows without
# truncating the column values.
spark.read.parquet("/content/bronze/vehicles/*").show(n=100, truncate=False)

+------------------------------------------+--------------+-------------------+-----+
|window                                    |current_status|init_timestamp     |count|
+------------------------------------------+--------------+-------------------+-----+
|{2024-11-29 00:25:00, 2024-11-29 00:30:00}|IN_TRANSIT_TO |2024-11-29 00:29:47|5    |
|{2024-11-29 00:10:00, 2024-11-29 00:15:00}|IN_TRANSIT_TO |2024-11-29 00:14:31|18   |
|{2024-11-29 00:15:00, 2024-11-29 00:20:00}|IN_TRANSIT_TO |2024-11-29 00:15:02|916  |
|{2024-11-29 00:20:00, 2024-11-29 00:25:00}|IN_TRANSIT_TO |2024-11-29 00:20:00|2    |
|{2024-11-29 00:15:00, 2024-11-29 00:20:00}|INCOMING_AT   |2024-11-29 00:15:01|308  |
|{2024-11-29 00:25:00, 2024-11-29 00:30:00}|INCOMING_AT   |2024-11-29 00:29:13|6    |
|{2024-11-29 00:10:00, 2024-11-29 00:15:00}|INCOMING_AT   |2024-11-29 00:14:29|4    |
|{2024-11-29 00:20:00, 2024-11-29 00:25:00}|INCOMING_AT   |2024-11-29 00:20:00|2    |
|{2024-11-29 00:15:00, 2024-11-29 00:20:00}|STOPPED_AT

## Report
- show the number of vehicles per status in each 5-minute window
- one row per time window

In [48]:
def pivot_data(df: "DataFrame"):
  """Print one row per time window with one column per vehicle status.

  Args:
    df: a pyspark.sql.DataFrame with at least the columns `window`,
      `current_status` and `count`. The annotation is a string so the
      function does not depend on `DataFrame` having been imported.

  Returns:
    None. The pivoted table is printed (up to 100 rows, untruncated).
  """
  # Sort after the pivot: a sort applied before groupBy does not determine
  # the order of the aggregated rows.
  result = (df.groupBy("window")
              .pivot("current_status")
              .sum("count")
              .orderBy("window"))
  result.show(100, False)

# Load the aggregated bronze data and print the per-window status report.
df = spark.read.parquet("/content/bronze/vehicles/*")
pivot_data(df)

+------------------------------------------+-----------+-------------+----------+
|window                                    |INCOMING_AT|IN_TRANSIT_TO|STOPPED_AT|
+------------------------------------------+-----------+-------------+----------+
|{2024-11-30 09:40:00, 2024-11-30 09:45:00}|1          |3            |3         |
+------------------------------------------+-----------+-------------+----------+



In [51]:
# Stop the bronze streaming query.
query2.stop()