# Ingest Qualifying folder

In [0]:
# Create an "Environment" dropdown widget on the notebook and read the
# currently selected value into `env` (used later to tag every ingested row).
dbutils.widgets.dropdown(
    "Environment",            # widget name
    "Dev",                    # default selection
    ["Prod", "Dev", "Test"],  # allowed choices
    "Environment",            # label shown in the widget bar
)
env = dbutils.widgets.get("Environment")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the multi-line JSON files using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql import functions as f

In [0]:
# Explicit schema for the qualifying JSON records.
# The four ID columns and `number` are declared non-nullable; `position` and
# the three session columns (q1-q3, kept as strings) are nullable.
# NOTE(review): Spark may relax nullability to True when reading from files -
# confirm if the non-null declarations are being relied on.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), False)
    .add("constructorId", IntegerType(), False)
    .add("number", IntegerType(), False)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [0]:
# Load everything under the raw "qualifying" folder using the explicit schema.
# The "multiline" option is switched on for the JSON reader.
# `raw_folder_path` is not defined in this notebook - presumably it comes from
# the "../includes/configuration" notebook run above.
df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiline", True)
    .json(f"{raw_folder_path}/qualifying/")
)

In [0]:
# Preview the first five raw rows to sanity-check the schema mapping.
df.show(n=5)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|      q1|      q2|      q3|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|        1|    18|       1|            1|    22|       1|1:26.572|1:25.187|1:26.714|
|        2|    18|       9|            2|     4|       2|1:26.103|1:25.315|1:26.869|
|        3|    18|       5|            1|    23|       3|1:25.664|1:25.452|1:27.079|
|        4|    18|      13|            6|     2|       4|1:25.994|1:25.691|1:27.178|
|        5|    18|       2|            2|     3|       5|1:25.960|1:25.518|1:27.236|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
only showing top 5 rows



In [0]:
# Row count of the raw qualifying data, shown as the cell result.
df.count()

Out[8]: 8694

### Step 2 - Rename columns and add new columns

In [0]:
# Rename the camelCase ID columns to snake_case, then tag each row with the
# selected environment and the timestamp at which it was ingested.
transformed_df = (
    df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("env", f.lit(env))
    .withColumn("ingestion_date", f.current_timestamp())
)

In [0]:
# Preview the first five transformed rows (renamed columns plus env / ingestion_date).
transformed_df.show(n=5)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+---+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|env|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+---+--------------------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|Dev|2023-06-03 22:30:...|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|Dev|2023-06-03 22:30:...|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|Dev|2023-06-03 22:30:...|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|Dev|2023-06-03 22:30:...|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|Dev|2023-06-03 22:30:...|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+---+--------------------+

### Step 3 - Write the output in Parquet format to the `f1_processed.qualifying` table

In [0]:
# Save the transformed rows as the table `f1_processed.qualifying` in Parquet
# format; "overwrite" mode replaces whatever the table held before.
# (An earlier variant of this cell wrote Parquet files straight to
# f"{processed_folder_path}/qualifying.parquet" instead of a table.)
(
    transformed_df.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("f1_processed.qualifying")
)