### Ingest and Transform Drivers File

In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType, StringType,DateType

In [0]:
# Schema for the nested "name" object of each driver record:
# two nullable string fields, forename and surname.
name_schema = StructType([
    StructField("forename", StringType(), nullable=True),
    StructField("surname", StringType(), nullable=True),
])

In [0]:
# Explicit schema for the drivers JSON file, so column types are fixed up
# front instead of being inferred while reading.
# NOTE: driverId is declared non-nullable here, yet the printSchema() output
# further down in this notebook reports it as nullable = true, so the flag is
# not reflected in the loaded DataFrame.
drivers_schema = StructType([
    StructField("driverId", IntegerType(), nullable=False),
    StructField("driverRef", StringType(), nullable=True),
    StructField("number", IntegerType(), nullable=True),
    StructField("code", StringType(), nullable=True),
    # Nested struct holding forename/surname (see name_schema).
    StructField("name", name_schema, nullable=True),
    StructField("dob", DateType(), nullable=True),
    StructField("nationality", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
])

In [0]:
%fs
ls dbfs:/mnt/formula1dlvb/processed/

path,name,size,modificationTime
dbfs:/mnt/formula1dlvb/processed/circuits/,circuits/,0,1697386345000
dbfs:/mnt/formula1dlvb/processed/races/,races/,0,1697388459000


In [0]:
# Load the raw drivers JSON from the mounted storage, applying the
# hand-written drivers_schema rather than letting Spark infer one.
drivers_df = spark.read.json(
    "dbfs:/mnt/formula1dlvb/raw/drivers.json",
    schema=drivers_schema,
)

In [0]:
# Print the schema of the loaded DataFrame to check the column names and
# types that resulted from reading with drivers_schema.
drivers_df.printSchema()

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [0]:
# Render the freshly loaded drivers DataFrame for a visual sanity check.
# `display` is not imported anywhere in this notebook; presumably it is the
# Databricks notebook built-in (TODO confirm when running elsewhere).
display(drivers_df)

driverId,driverRef,number,code,name,dob,nationality,url
1,hamilton,44.0,HAM,"List(Lewis, Hamilton)",1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
2,heidfeld,,HEI,"List(Nick, Heidfeld)",1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
3,rosberg,6.0,ROS,"List(Nico, Rosberg)",1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
4,alonso,14.0,ALO,"List(Fernando, Alonso)",1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
5,kovalainen,,KOV,"List(Heikki, Kovalainen)",1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen
6,nakajima,,NAK,"List(Kazuki, Nakajima)",1985-01-11,Japanese,http://en.wikipedia.org/wiki/Kazuki_Nakajima
7,bourdais,,BOU,"List(Sébastien, Bourdais)",1979-02-28,French,http://en.wikipedia.org/wiki/S%C3%A9bastien_Bourdais
8,raikkonen,7.0,RAI,"List(Kimi, Räikkönen)",1979-10-17,Finnish,http://en.wikipedia.org/wiki/Kimi_R%C3%A4ikk%C3%B6nen
9,kubica,88.0,KUB,"List(Robert, Kubica)",1984-12-07,Polish,http://en.wikipedia.org/wiki/Robert_Kubica
10,glock,,GLO,"List(Timo, Glock)",1982-03-18,German,http://en.wikipedia.org/wiki/Timo_Glock


In [0]:
from pyspark.sql.functions import col,concat,current_timestamp,lit

In [0]:
# Standardize the drivers DataFrame:
#   - rename the camelCase keys driverId / driverRef to snake_case,
#   - stamp every row with the ingestion time,
#   - flatten the nested name struct into a single "forename surname" string.
# concat() yields null when any of its inputs is null, so a driver missing
# either name part ends up with a null name.
drivers_col_rename_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    # A single concat() over the three pieces is sufficient; wrapping the
    # surname in its own one-argument concat() would be a no-op.
    .withColumn("name", concat(col("name.forename"), lit(" "), col("name.surname")))
)

In [0]:
# Render the transformed DataFrame to verify the renamed columns, the
# flattened name string and the added ingestion_date column.
# `display` is not imported anywhere in this notebook; presumably it is the
# Databricks notebook built-in (TODO confirm when running elsewhere).
display(drivers_col_rename_df)

driver_id,driver_ref,number,code,name,dob,nationality,url,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,2023-10-16T12:50:04.152+0000
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld,2023-10-16T12:50:04.152+0000
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg,2023-10-16T12:50:04.152+0000
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,2023-10-16T12:50:04.152+0000
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen,2023-10-16T12:50:04.152+0000
6,nakajima,,NAK,Kazuki Nakajima,1985-01-11,Japanese,http://en.wikipedia.org/wiki/Kazuki_Nakajima,2023-10-16T12:50:04.152+0000
7,bourdais,,BOU,Sébastien Bourdais,1979-02-28,French,http://en.wikipedia.org/wiki/S%C3%A9bastien_Bourdais,2023-10-16T12:50:04.152+0000
8,raikkonen,7.0,RAI,Kimi Räikkönen,1979-10-17,Finnish,http://en.wikipedia.org/wiki/Kimi_R%C3%A4ikk%C3%B6nen,2023-10-16T12:50:04.152+0000
9,kubica,88.0,KUB,Robert Kubica,1984-12-07,Polish,http://en.wikipedia.org/wiki/Robert_Kubica,2023-10-16T12:50:04.152+0000
10,glock,,GLO,Timo Glock,1982-03-18,German,http://en.wikipedia.org/wiki/Timo_Glock,2023-10-16T12:50:04.152+0000


In [0]:
# Remove the url column; every other column is carried over unchanged.
drivers_final_df = drivers_col_rename_df.drop("url")

In [0]:
# Persist the final drivers DataFrame as Parquet in the processed area,
# replacing whatever a previous run wrote to the same location.
drivers_final_df.write.parquet(
    "dbfs:/mnt/formula1dlvb/processed/drivers",
    mode="overwrite",
)