<a href="https://colab.research.google.com/github/vaniamv/dataprocessing/blob/main/spark/challenges/challenge_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 1
##  Implement INGESTION process
- Set up path in the "lake"
  - !mkdir -p /content/lake/bronze

- Read data from API https://api.carrismetropolitana.pt/
  - Endpoints:
    - vehicles
    - lines
    - municipalities
  - Use StructFields to enforce schema

- Transformations
  - vehicles
    - create "date" extracted from "timestamp" column (format: hh24miss)

- Write data as PARQUET into the BRONZE layer (/content/lake/bronze)
  - Partition "vehicles" by "date" column
  - Paths:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities
  - Make sure there is only 1 single parquet created
  - Use overwrite as write mode

# Setting up PySpark

In [None]:
%pip install pyspark



In [1]:
!mkdir -p /content/lake/bronze

In [22]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests

class ETLFlow:
    def __init__(self, spark: SparkSession) -> None:
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        df = self.spark.read.format(format).load(path)
        return df

    def extract_from_api(self, url: str, schema: StructType = None):
      response = requests.get(url)
      rdd = spark.sparkContext.parallelize(response.json())
      if schema:
        df = spark.read.schema(schema).json(rdd)
      else:
        df = spark.read.json(rdd)
      return df

    def load(self, df: DataFrame, format: str, path: str, **kwargs) -> None:
        df.write.mode("overwrite").format(format).save(path)

class ETLTask(ETLFlow):

    def __init__(self, spark: SparkSession) -> None:
        self.spark = spark

    def ingestion_vehicles(self):
      vehicle_schema = StructType([StructField('bearing', IntegerType(), True),
                                  StructField('block_id', StringType(), True),
                                  StructField('current_status', StringType(), True),
                                  StructField('id', StringType(), True),
                                  StructField('lat', FloatType(), True),
                                  StructField('line_id', StringType(), True),
                                  StructField('lon', FloatType(), True),
                                  StructField('pattern_id', StringType(), True),
                                  StructField('route_id', StringType(), True),
                                  StructField('schedule_relationship', StringType(), True),
                                  StructField('shift_id', StringType(), True),
                                  StructField('speed', FloatType(), True),
                                  StructField('stop_id', StringType(), True),
                                  StructField('timestamp', TimestampType(), True),
                                  StructField('trip_id', StringType(), True)])
      df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)
      #self.load(df=df, format="parquet", path="/content/lake/bronze/vehicles")
      self.write_partitioned_parquet(df=df, path="/content/lake/bronze/vehicles", partition_col="timestamp")


    def ingestion_lines(self):
      lines_schema = StructType([StructField("id", StringType(), True),
                                 StructField("short_name", StringType(), True),
                                 StructField("long_name", StringType(), True),
                                 StructField("municipalities", ArrayType(StringType()), True),
                                 StructField("localities", ArrayType(StringType()), True),
                                 StructField("routes", ArrayType(StringType()), True),
                                 StructField("patterns", ArrayType(StringType()), True),
                                 StructField("facilities", ArrayType(StringType()), True)])
      df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)
      self.load(df=df, format="parquet", path="/content/lake/bronze/lines")


    def ingestion_municipalities(self):
        municipalities_schema = StructType([StructField("district_id", StringType(), True),
                                            StructField("district_name", StringType(), True),
                                            StructField("id", StringType(), True),
                                            StructField("name", StringType(), True),
                                            StructField("prefix", StringType(), True),
                                            StructField("region_id", StringType(), True),
                                            StructField("region_name", StringType(), True)])
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/municipalities", schema=municipalities_schema)
        self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities")

    def cleansing_vehicles(self):
      df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")

      # transformations challenge2
      #df = df.withColumn("date",date_format('timestamp',"HHmmss"))
      df = df.withColumnRenamed("lat", "latitude")\
                  .withColumnRenamed("lon", "longitude")
      df = df.drop_duplicates()
      df = df.dropna(subset=['CURRENT_STATUS'])
      #self.write_partitioned_parquet(df=df, path="/content/lake/silver/vehicles", partition_col="timestamp")

    def write_partitioned_parquet(self, df: DataFrame, path: str, partition_col: str):

        df = df.withColumn(partition_col, date_format(partition_col, "HHmmss"))

        self.spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
        (df
         .coalesce(1) #make sure that there is only one parquet created
         .write
         .mode("overwrite")
         .partitionBy(partition_col)
         .format("parquet")
         .save(path))

    def enrich(self):
        pass

In [23]:
if __name__ == '__main__':

    # init spark
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.master('local').appName('ETL Program').getOrCreate()

    print("Starting ETL program")
    etl = ETLTask(spark)

    # run tasks
    print("Running Task - Ingestion Vehicles")
    etl.ingestion_vehicles()

    print("Running Task - Ingestion lines")
    etl.ingestion_lines()

    print("Running Task - Ingestion municipalities")
    etl.ingestion_municipalities()

    print("Running Task - Cleansing Vehicles")
    etl.cleansing_vehicles()

    #etl.enrich()

    print("ETL program completed")

Starting ETL program
Running Task - Ingestion Vehicles
Running Task - Ingestion lines
Running Task - Ingestion municipalities
Running Task - Cleansing Vehicles
ETL program completed


In [14]:
spark.read.parquet("/content/lake/bronze/vehicles").show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+--------------------+---------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|             trip_id|timestamp|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+--------------------+---------+
|    330|20241122-64010050...| IN_TRANSIT_TO|44|12502|38.655266|   4600|-8.991152|  4600_0_2|  4600_0|            SCHEDULED|121930234560|      7.5| 090060|4600_0_2|2200|200...|   203708|
|    331|20241122-64010148...| IN_TRANSIT_TO|44|12741|38.530884|   4730|-8.885605|  4730_0_2|  4730_0|            SCHEDULED|113350234560|2.2222223| 160213|4730_0_2|2200|203...|   203708|
|    112|           1_1216-11| IN_TRANSIT_TO| 41|1281| 38.77073| 

In [15]:
spark.read.parquet("/content/lake/silver/vehicles").show()

+-------+--------------------+--------------+--------+---------+-------+----------+----------+--------+---------------------+------------------+---------+-------+--------------------+---------+
|bearing|            block_id|current_status|      id| latitude|line_id| longitude|pattern_id|route_id|schedule_relationship|          shift_id|    speed|stop_id|             trip_id|timestamp|
+-------+--------------------+--------------+--------+---------+-------+----------+----------+--------+---------------------+------------------+---------+-------+--------------------+---------+
|     12|       ESC_DU_EU2016| IN_TRANSIT_TO| 43|2119| 38.64266|   3716| -9.200345|  3716_0_2|  3716_0|            SCHEDULED|            EU2196|11.944445| 020322|3716_0_2_2000_202...|   203714|
|      0|       ESC_DU_EU1010|    STOPPED_AT| 43|2380|38.609585|   3521| -9.125027|  3521_1_2|  3521_1|            SCHEDULED|            EU1141|0.2777778| 140201|3521_1_2_2130_215...|   220234|
|    165|20241122-64010300...|

In [None]:
spark.read.parquet("/content/lake/bronze/lines").show()

+----+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  id|short_name|           long_name|      municipalities|          localities|              routes|            patterns|facilities|
+----+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|2740|      2740|Ericeira (Termina...|        [1109, 1106]|[Mafra, Antas, Al...|[2740_0, 2740_1, ...|[2740_0_1, 2740_0...|        []|
|2741|      2741|Ericeira (Termina...|  [1109, 1107, 1106]|[Mafra, Antas, Al...|    [2741_0, 2741_1]|[2741_0_1, 2741_0...|        []|
|2742|      2742|Lisboa (C. Grande...|  [1106, 1107, 1109]|[Campo Grande, Lo...|            [2742_0]|[2742_0_1, 2742_0_2]|        []|
|2743|      2743|Lisboa (C. Grande...|  [1106, 1107, 1109]|[Campo Grande, Lo...|            [2743_0]|[2743_0_1, 2743_0_2]|        []|
|2744|      2744|Lisboa (C. Grande...|  [1106, 1107, 1109]|[Ca

In [None]:
spark.read.parquet("/content/lake/bronze/municipalities").show()

+-----------+-------------+----+--------------------+------+---------+----------------+
|district_id|district_name|  id|                name|prefix|region_id|     region_name|
+-----------+-------------+----+--------------------+------+---------+----------------+
|         07|        Évora|0712|        Vendas Novas|    19|    PT187|Alentejo Central|
|         11|       Lisboa|1101|            Alenquer|    20|    PT16B|           Oeste|
|         11|       Lisboa|1102|   Arruda dos Vinhos|    20|    PT16B|           Oeste|
|         11|       Lisboa|1105|             Cascais|    05|    PT170|             AML|
|         11|       Lisboa|1106|              Lisboa|    06|    PT170|             AML|
|         11|       Lisboa|1107|              Loures|    07|    PT170|             AML|
|         11|       Lisboa|1109|               Mafra|    08|    PT170|             AML|
|         11|       Lisboa|1110|              Oeiras|    12|    PT170|             AML|
|         11|       Lisboa|1111|

In [5]:
!ls /content/lake/bronze/vehicles

spark.read.format("parquet").load("/content/lake/bronze/vehicles").count()

'timestamp=203553'  'timestamp=203617'	'timestamp=203634'  'timestamp=203651'	'timestamp=203708'
'timestamp=203554'  'timestamp=203618'	'timestamp=203635'  'timestamp=203652'	'timestamp=203709'
'timestamp=203555'  'timestamp=203619'	'timestamp=203636'  'timestamp=203653'	'timestamp=203710'
'timestamp=203600'  'timestamp=203620'	'timestamp=203637'  'timestamp=203654'	'timestamp=203711'
'timestamp=203601'  'timestamp=203621'	'timestamp=203638'  'timestamp=203655'	'timestamp=203712'
'timestamp=203603'  'timestamp=203622'	'timestamp=203639'  'timestamp=203656'	'timestamp=203713'
'timestamp=203604'  'timestamp=203623'	'timestamp=203640'  'timestamp=203657'	'timestamp=203714'
'timestamp=203605'  'timestamp=203624'	'timestamp=203641'  'timestamp=203658'	'timestamp=203715'
'timestamp=203607'  'timestamp=203625'	'timestamp=203642'  'timestamp=203659'	'timestamp=203716'
'timestamp=203609'  'timestamp=203626'	'timestamp=203643'  'timestamp=203700'	'timestamp=203717'
'timestamp=203610'  'timestamp

612

In [6]:
!ls /content/lake/bronze/vehicles/timestamp=203650

part-00000-5a8099e4-69be-4027-aadd-4d15cbce08f4.c000.snappy.parquet


# CHALLENGE 2
##  Implement CLEANSING process
- Set up path in the "lake"
  - !mkdir -p /content/lake/silver

- Read data from BRONZE layer as PARQUET:
    - vehicles - path: /content/lake/bronze/vehicles
    - lines - path: /content/lake/bronze/lines
    - municipalities - path: /content/lake/bronze/municipalities

- Transformations
  - vehicles
    - rename "lat" and "lon" to "latitude" and "longitude" respectively
    - remove possible duplicates
    - remove rows when the column CURRENT_STATUS is null
    - remove any corrupted record
  - lines
    - remove duplicates
    - remove rows when the column LONG_NAME is null
    - remove any corrupted record
  - municipalities
    - remove duplicates
    - remove rows when the columns NAME or DISTRICT_NAME are null
    - remove any corrupted record

- Write data as PARQUET into the SILVER layer (/content/lake/silver)
  - Partition "vehicles" by "date"(created in the ingestion)
  - Paths:
    - vehicles - path: /content/lake/silver/vehicles
    - lines - path: /content/lake/silver/lines
    - municipalities - path: /content/lake/silver/municipalities

In [7]:
!mkdir -p /content/lake/silver

In [8]:
vehicles = spark.read.parquet("/content/lake/bronze/vehicles")
lines = spark.read.parquet("/content/lake/bronze/lines")
municipalities = spark.read.parquet("/content/lake/bronze/municipalities")

In [9]:
vehicles.show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+--------------------+---------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|             trip_id|timestamp|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+--------------------+---------+
|    330|20241122-64010050...| IN_TRANSIT_TO|44|12502|38.655266|   4600|-8.991152|  4600_0_2|  4600_0|            SCHEDULED|121930234560|      7.5| 090060|4600_0_2|2200|200...|   203708|
|    331|20241122-64010148...| IN_TRANSIT_TO|44|12741|38.530884|   4730|-8.885605|  4730_0_2|  4730_0|            SCHEDULED|113350234560|2.2222223| 160213|4730_0_2|2200|203...|   203708|
|    112|           1_1216-11| IN_TRANSIT_TO| 41|1281| 38.77073| 

In [11]:
vehicles = vehicles.withColumnRenamed("lat", "latitude")\
                  .withColumnRenamed("lon", "longitude")
vehicles.show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+--------------------+---------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|             trip_id|timestamp|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+--------------------+---------+
|    330|20241122-64010050...| IN_TRANSIT_TO|44|12502|38.655266|   4600|-8.991152|  4600_0_2|  4600_0|            SCHEDULED|121930234560|      7.5| 090060|4600_0_2|2200|200...|   203708|
|    331|20241122-64010148...| IN_TRANSIT_TO|44|12741|38.530884|   4730|-8.885605|  4730_0_2|  4730_0|            SCHEDULED|113350234560|2.2222223| 160213|4730_0_2|2200|203...|   203708|
|    112|           1_1216-11| IN_TRANSIT_TO| 41|1281| 38.77073| 