### Qualifying Load and Transform

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the qualifying JSON input.
# Only the qualifyId key is declared non-nullable; the remaining id/number/position
# columns are nullable integers, and the q1-q3 session times are kept as nullable strings.
qualifying_schema = StructType(
    fields=(
        [StructField("qualifyId", IntegerType(), False)]
        + [
            StructField(int_column, IntegerType(), True)
            for int_column in ("raceId", "driverId", "constructorId", "number", "position")
        ]
        + [
            StructField(time_column, StringType(), True)
            for time_column in ("q1", "q2", "q3")
        ]
    )
)

In [0]:
# Load the raw qualifying JSON with the explicit schema.
# The multiLine option is enabled so the reader accepts JSON records that span several lines.
# NOTE(review): this reads from the 'formula1dlvb' mount, while the processed output
# in this notebook is written to 'formula1dl' — confirm the differing mount names are intentional.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)
    .json("/mnt/formula1dlvb/raw/qualifying")
)

In [0]:
from pyspark.sql.functions import current_timestamp


In [0]:
# Rename the camelCase id columns to snake_case and stamp each row with the ingestion time.
final_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [0]:
# Persist the transformed data as Parquet; "overwrite" mode replaces any existing output at this path.
(
    final_df.write
    .mode("overwrite")
    .parquet("/mnt/formula1dl/processed/qualifying")
)

In [0]:
# Read the processed Parquet output back and render it to check the written result.
display(
    spark.read.parquet('/mnt/formula1dl/processed/qualifying')
)

qualify_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,ingestion_date
1,18,1,1,22,1,1:26.572,1:25.187,1:26.714,2023-10-16T16:35:04.633+0000
2,18,9,2,4,2,1:26.103,1:25.315,1:26.869,2023-10-16T16:35:04.633+0000
3,18,5,1,23,3,1:25.664,1:25.452,1:27.079,2023-10-16T16:35:04.633+0000
4,18,13,6,2,4,1:25.994,1:25.691,1:27.178,2023-10-16T16:35:04.633+0000
5,18,2,2,3,5,1:25.960,1:25.518,1:27.236,2023-10-16T16:35:04.633+0000
6,18,15,7,11,6,1:26.427,1:26.101,1:28.527,2023-10-16T16:35:04.633+0000
7,18,3,3,7,7,1:26.295,1:26.059,1:28.687,2023-10-16T16:35:04.633+0000
8,18,14,9,9,8,1:26.381,1:26.063,1:29.041,2023-10-16T16:35:04.633+0000
9,18,10,7,12,9,1:26.919,1:26.164,1:29.593,2023-10-16T16:35:04.633+0000
10,18,20,5,15,10,1:26.702,1:25.842,\N,2023-10-16T16:35:04.633+0000
