# Ingest drivers.json file

In [0]:
# Notebook parameter: which environment this run targets.
# Arguments are (name, default value, allowed choices, label).
dbutils.widgets.dropdown(
    "Environment",
    "Dev",
    ["Prod", "Dev", "Test"],
    "Environment",
)

# Read the currently selected value so later cells can tag rows with it.
env = dbutils.widgets.get("Environment")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read drivers.json from the raw folder using an explicit schema

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DateType

In [0]:
# Schema of the nested "name" object: two nullable string fields.
# (StructField is nullable by default.)
name_schema = StructType(
    [
        StructField("forename", StringType()),
        StructField("surname", StringType()),
    ]
)

In [0]:
# Explicit schema for drivers.json. Every field is nullable (the StructField
# default) except driverId; "name" is a nested struct described by name_schema.
drivers_schema = StructType(
    [
        StructField("driverId", IntegerType(), nullable=False),
        StructField("driverRef", StringType()),
        StructField("number", IntegerType()),
        StructField("code", StringType()),
        StructField("name", name_schema),
        StructField("dob", DateType()),
        StructField("nationality", StringType()),
        StructField("url", StringType()),
    ]
)

In [0]:
# Read the raw drivers file, applying drivers_schema instead of letting
# Spark infer the column types.
df = spark.read.json(
    f"{raw_folder_path}/drivers.json",
    schema=drivers_schema,
)

In [0]:
# Preview the first five raw rows as plain text
# (display(df) is the rich-rendering alternative).
df.show(n=5)

+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|driverId| driverRef|number|code|                name|       dob|nationality|                 url|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   {Lewis, Hamilton}|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld|  null| HEI|    {Nick, Heidfeld}|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|     {Nico, Rosberg}|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|  {Fernando, Alonso}|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen|  null| KOV|{Heikki, Kovalainen}|1981-10-19|    Finnish|http://en.wikiped...|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
only showing top 5 rows



### Step 2 - Rename columns, drop unwanted columns and add new columns

In [0]:
from pyspark.sql import functions as f

In [0]:
# Reshape the raw frame:
#   * rename driverId / driverRef to snake_case
#   * stamp each row with the ingestion time
#   * drop url
#   * join forename and surname with a space into the temporary column
#     "name(transformed)", then drop the original nested "name" struct
#   * tag each row with the selected environment
transformed_df = (
    df.withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", f.current_timestamp())
    .drop("url")
    .withColumn(
        "name(transformed)",
        f.concat(
            df["name"].getField("forename"),
            f.lit(" "),
            df["name"].getField("surname"),
        ),
    )
    .drop("name")
    .withColumn("env", f.lit(env))
)

In [0]:
# Fix the output column order, then give the flattened name column its
# final name, "name".
final_df = (
    transformed_df
    .select(
        "driver_id",
        "driver_ref",
        "number",
        "code",
        "name(transformed)",
        "dob",
        "nationality",
        "env",
        "ingestion_date",
    )
    .withColumnRenamed("name(transformed)", "name")
)

In [0]:
# Preview the first five transformed rows as plain text
# (display(final_df) is the rich-rendering alternative).
final_df.show(n=5)

+---------+----------+------+----+-----------------+----------+-----------+---+--------------------+
|driver_id|driver_ref|number|code|             name|       dob|nationality|env|      ingestion_date|
+---------+----------+------+----+-----------------+----------+-----------+---+--------------------+
|        1|  hamilton|    44| HAM|   Lewis Hamilton|1985-01-07|    British|Dev|2023-06-02 21:00:...|
|        2|  heidfeld|  null| HEI|    Nick Heidfeld|1977-05-10|     German|Dev|2023-06-02 21:00:...|
|        3|   rosberg|     6| ROS|     Nico Rosberg|1985-06-27|     German|Dev|2023-06-02 21:00:...|
|        4|    alonso|    14| ALO|  Fernando Alonso|1981-07-29|    Spanish|Dev|2023-06-02 21:00:...|
|        5|kovalainen|  null| KOV|Heikki Kovalainen|1981-10-19|    Finnish|Dev|2023-06-02 21:00:...|
+---------+----------+------+----+-----------------+----------+-----------+---+--------------------+
only showing top 5 rows



### Step 3 - Load: write the transformed data to the processed folder as Parquet

In [0]:
# Persist the result as Parquet under the processed folder, replacing any
# output left there by a previous run.
final_df.write.parquet(
    f"{processed_folder_path}/drivers.parquet",
    mode="overwrite",
)