<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/read_from_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/read_from_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read from API
- Fetch JSON records from a REST API with `requests` and load them into a Spark DataFrame

# Setting up PySpark

In [None]:
# Install PySpark into the environment the notebook kernel runs in.
%pip install pyspark



In [18]:
from pyspark.sql import SparkSession
# Build (or reuse, if one already exists) a Spark session running on a local
# master, with the Spark UI moved to port 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# SparkContext behind the session; used further down to create RDDs.
sc = spark.sparkContext

In [84]:
# NOTE(review): no cell shown above assigns `schema`, so this lookup appears to
# depend on state left over in the kernel — confirm it is defined before running
# the notebook top to bottom on a fresh kernel.
schema

StructType([StructField('bearing', IntegerType(), True), StructField('block_id', StringType(), True), StructField('current_status', StringType(), True), StructField('id', StringType(), True), StructField('lat', FloatType(), True), StructField('line_id', StringType(), True), StructField('lon', FloatType(), True), StructField('pattern_id', StringType(), True), StructField('route_id', StringType(), True), StructField('schedule_relationship', StringType(), True), StructField('shift_id', StringType(), True), StructField('speed', FloatType(), True), StructField('stop_id', StringType(), True), StructField('timestamp', TimestampType(), True), StructField('trip_id', StringType(), True)])

# Get data from API - Vehicles

In [85]:
import requests
import json

def readFromAPI(url: str, schema: "StructType | None" = None, timeout: float = 30.0):
  """Fetch a JSON payload over HTTP and load it into a Spark DataFrame.

  Args:
    url: Endpoint to GET. Its body is parsed as JSON; a JSON array yields one
      row per element, any other JSON value is treated as a single record.
    schema: Optional schema to apply while reading. When omitted (None), Spark
      infers the column types from the data.
    timeout: Seconds to wait for the server before the request is aborted.

  Returns:
    The DataFrame produced by ``spark.read.json`` over the fetched records.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If the server does not answer within ``timeout`` seconds.
  """
  # The annotation on `schema` is a string on purpose: it is never evaluated, so
  # this cell can run before the cell that imports the pyspark.sql.types names.
  response = requests.get(url, timeout=timeout)
  # Fail loudly on an error response instead of trying to parse an error body.
  response.raise_for_status()

  payload = response.json()
  records = payload if isinstance(payload, list) else [payload]

  # spark.read.json expects an RDD of JSON *strings*. Handing it parsed Python
  # objects relies on their Python repr (single quotes, None/True/False), which
  # is not valid JSON, so each record is serialised back to JSON text first.
  rdd = sc.parallelize([json.dumps(record) for record in records])

  reader = spark.read
  if schema is not None:
    reader = reader.schema(schema)
  return reader.json(rdd)

In [89]:
from pyspark.sql.types import *
# Explicit schema for the "vehicles" endpoint: (column name, Spark type) pairs,
# every column nullable. Declaring `timestamp` as TimestampType makes Spark
# read it as a timestamp column instead of leaving the type to inference.
vehicle_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('bearing', IntegerType()),
        ('block_id', StringType()),
        ('current_status', StringType()),
        ('id', StringType()),
        ('lat', FloatType()),
        ('line_id', StringType()),
        ('lon', FloatType()),
        ('pattern_id', StringType()),
        ('route_id', StringType()),
        ('schedule_relationship', StringType()),
        ('shift_id', StringType()),
        ('speed', FloatType()),
        ('stop_id', StringType()),
        ('timestamp', TimestampType()),
        ('trip_id', StringType()),
    )
])

# Load the current vehicle positions with the schema above, then report the
# row count and preview the first rows.
vehicles = readFromAPI("https://api.carrismetropolitana.pt/vehicles", vehicle_schema)
print(vehicles.count())
vehicles.show()

1163
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|     71|20241111-64010025...| IN_TRANSIT_TO|44|12709|38.649673|   4701|-8.991487|  4701_0_2|  4701_0|            SCHEDULED|123230234560|3.8888888| 090117|2024-11-11 17:56:40|4701_0_2|1100|173...|
|     11|20241111-64010094...|    STOPPED_AT|44|12637| 38.73084|   4002|-8.972685|  4002_0_3|  4002_0|            SCHEDULED|121520234560|      0.0| 010066|2024-11-11 17:56:47|4002_0_3|1100|173...|
|     20|2

In [None]:
# Reload the vehicles endpoint without passing a schema and count the rows.
# This rebinds `vehicles`, replacing the DataFrame loaded with the explicit schema.
# NOTE(review): this call only works if readFromAPI's `schema` parameter is
# optional — confirm against the function definition.
vehicles = readFromAPI("https://api.carrismetropolitana.pt/vehicles")
vehicles.count()

In [76]:
# Print the first 20 rows of the current `vehicles` DataFrame.
vehicles.show(20)

+-------+--------------------+--------------+--------+------------------+-------+------------------+----------+--------+---------------------+------------+------------------+-------+----------+--------------------+
|bearing|            block_id|current_status|      id|               lat|line_id|               lon|pattern_id|route_id|schedule_relationship|    shift_id|             speed|stop_id| timestamp|             trip_id|
+-------+--------------------+--------------+--------+------------------+-------+------------------+----------+--------+---------------------+------------+------------------+-------+----------+--------------------+
|    238|20241111-64010007...| IN_TRANSIT_TO|44|12665| 38.67307662963867|   4701|-8.973311424255371|  4701_0_1|  4701_0|            SCHEDULED|123410234560|15.277777777777777| 090177|1731346586|4701_0_1|1100|171...|
|     79|20241111-64010263...|   INCOMING_AT|44|12578| 38.63596725463867|   4322| -8.87992000579834|  4322_0_2|  4322_0|            SCHEDULE

### API - https://github.com/carrismetropolitana/api

### Exercises

- Create an ETL process to monitor vehicles from Carris Metropolitana
  - Read data from the "vehicles" endpoint and write it into "/content/output/vehicles" as parquet
  - Convert the timestamp column to datetime (hh24:mi:ss)

- Read data from the "stops" endpoint and write it into "/content/output/stops" as parquet
- Convert the timestamp column to datetime (hh24:mi:ss)

In [82]:
from pyspark.sql.functions import *
# Show only the row(s) whose `id` equals "42|2345".
(
    vehicles
    .where(col("id") == lit("42|2345"))
    .show()
)

+-------+--------+--------------+-------+----------------+-------+------------------+----------+--------+---------------------+--------+-----+-------+----------+--------------------+
|bearing|block_id|current_status|     id|             lat|line_id|               lon|pattern_id|route_id|schedule_relationship|shift_id|speed|stop_id| timestamp|             trip_id|
+-------+--------+--------------+-------+----------------+-------+------------------+----------+--------+---------------------+--------+-----+-------+----------+--------------------+
|      0| 1059-11|    STOPPED_AT|42|2345|38.7675666809082|   2727|-9.100337028503418|  2727_0_1|  2727_0|            SCHEDULED|    1064|  0.0| 060207|1731346571|2727_0_1|1|1|1735...|
+-------+--------+--------------+-------+----------------+-------+------------------+----------+--------+---------------------+--------+-----+-------+----------+--------------------+

