# Ingest pit_stops.json file

### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql import functions as f

In [0]:
# Explicit schema for pit_stops.json, built one field at a time as
# (column name, Spark type, nullable).
# NOTE(review): only "duration" and "milliseconds" are declared non-nullable;
# confirm this difference from the other columns is intentional.
pit_stops_schema = (
    StructType()
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("stop", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("time", StringType(), True)
    .add("duration", StringType(), False)
    .add("milliseconds", IntegerType(), False)
)

In [0]:
# Load the pit stops JSON with the explicit schema (no schema inference),
# with the reader's "multiline" option switched on.
df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiline", True)
    .json("/FileStore/tables/pit_stops.json")
)

In [0]:
# Preview the first five rows of the raw DataFrame.
df.show(n=5)

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|    time|duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|   841|     153|   1|  1|17:05:23|  26.898|       26898|
|   841|      30|   1|  1|17:05:52|  25.021|       25021|
|   841|      17|   1| 11|17:20:48|  23.426|       23426|
|   841|       4|   1| 12|17:22:34|  23.251|       23251|
|   841|      13|   1| 13|17:24:10|  23.842|       23842|
+------+--------+----+---+--------+--------+------------+
only showing top 5 rows



### Step 2 - Rename columns and add the ingestion_date column

In [0]:
# Rename the camelCase id columns to snake_case, then add an
# "ingestion_date" column populated from current_timestamp().
transformed_df = (
    df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", f.current_timestamp())
)

In [0]:
# Preview the first five rows after the renames and the added column.
transformed_df.show(n=5)

+-------+---------+----+---+--------+--------+------------+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|      ingestion_date|
+-------+---------+----+---+--------+--------+------------+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2023-06-02 16:24:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2023-06-02 16:24:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2023-06-02 16:24:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2023-06-02 16:24:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2023-06-02 16:24:...|
+-------+---------+----+---+--------+--------+------------+--------------------+
only showing top 5 rows



### Step 3 - Write the output to the processed container in Parquet format

In [0]:
# Write the transformed DataFrame as Parquet using "overwrite" mode
# at the target path.
(
    transformed_df
    .write
    .mode("overwrite")
    .parquet("/FileStore/tables/pit_stops.parquet")
)