# Ingest pit_stops.json file

In [0]:
# Expose an "Environment" dropdown on the notebook (default "Dev") and read
# the currently selected value into `env` for use in later cells.
dbutils.widgets.dropdown(
    name="Environment",
    defaultValue="Dev",
    choices=["Prod", "Dev", "Test"],
    label="Environment",
)
env = dbutils.widgets.get("Environment")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the multi-line JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql import functions as f

In [0]:
# Explicit schema for pit_stops.json: one StructField per JSON attribute,
# given as (name, Spark type, nullable).
# NOTE(review): `duration` and `milliseconds` are declared non-nullable while
# every other field is nullable - confirm this difference is intentional.
pit_stops_schema = StructType(
    [
        StructField("raceId", IntegerType(), True),
        StructField("driverId", IntegerType(), True),
        StructField("stop", IntegerType(), True),
        StructField("lap", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("duration", StringType(), False),
        StructField("milliseconds", IntegerType(), False),
    ]
)

In [0]:
# Load pit_stops.json from the raw folder, applying the explicit schema
# instead of inferring one; the "multiline" option is enabled for this file.
df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiline", True)
    .json(f"{raw_folder_path}/pit_stops.json")
)

In [0]:
# Preview the first five rows of the freshly read DataFrame.
df.show(n=5)

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|    time|duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|   841|     153|   1|  1|17:05:23|  26.898|       26898|
|   841|      30|   1|  1|17:05:52|  25.021|       25021|
|   841|      17|   1| 11|17:20:48|  23.426|       23426|
|   841|       4|   1| 12|17:22:34|  23.251|       23251|
|   841|      13|   1| 13|17:24:10|  23.842|       23842|
+------+--------+----+---+--------+--------+------------+
only showing top 5 rows



### Step 2 - Rename columns and add new columns

In [0]:
# Switch the id columns to snake_case, then tag every row with the selected
# environment and the timestamp at which it was ingested.
transformed_df = (
    df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("env", f.lit(env))
    .withColumn("ingestion_date", f.current_timestamp())
)

In [0]:
# Preview the first five rows after the renames and added columns.
transformed_df.show(n=5)

+-------+---------+----+---+--------+--------+------------+----+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds| env|      ingestion_date|
+-------+---------+----+---+--------+--------+------------+----+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|Test|2023-06-03 22:30:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|Test|2023-06-03 22:30:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|Test|2023-06-03 22:30:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|Test|2023-06-03 22:30:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|Test|2023-06-03 22:30:...|
+-------+---------+----+---+--------+--------+------------+----+--------------------+
only showing top 5 rows



### Step 3 - Write the output to the f1_processed.pit_stops table in parquet format

In [0]:
# Save the transformed data as the table f1_processed.pit_stops in parquet
# format; "overwrite" mode replaces whatever the table already contains.
(
    transformed_df.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("f1_processed.pit_stops")
)