# Ingest Qualifying folder

In [0]:
# Notebook parameter: the dated sub-folder of the raw container to ingest
# (it is interpolated into the read path further down).
dbutils.widgets.text(
    "p_file_date",   # widget name
    "2021-03-21",    # default value
)
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Notebook parameter: the environment label, later stamped on every row
# as the `env` column.
dbutils.widgets.dropdown(
    "Environment",             # widget name
    "Dev",                     # default selection
    ["Prod", "Dev", "Test"],   # allowed choices
    "Environment",             # label shown in the widget bar
)
env = dbutils.widgets.get("Environment")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the JSON files using the Spark DataFrameReader API

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql import functions as f

In [0]:
# Explicit schema for the qualifying JSON files.
# The four id columns and `number` are declared non-nullable; `position` and the
# q1-q3 columns are nullable, and q1-q3 are kept as strings (values such as "1:14.823").
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), nullable=False)
    .add("raceId", IntegerType(), nullable=False)
    .add("driverId", IntegerType(), nullable=False)
    .add("constructorId", IntegerType(), nullable=False)
    .add("number", IntegerType(), nullable=False)
    .add("position", IntegerType(), nullable=True)
    .add("q1", StringType(), nullable=True)
    .add("q2", StringType(), nullable=True)
    .add("q3", StringType(), nullable=True)
)

In [0]:
# Load every qualifying JSON file found under the requested file date.
# `schema=` / `multiLine=` are the keyword equivalents of
# `.schema(...)` / `.option("multiline", True)` on the reader.
df = spark.read.json(
    f"{raw_folder_path}/{v_file_date}/qualifying/",
    schema=qualifying_schema,
    multiLine=True,
)

In [0]:
# Preview the first five raw rows to sanity-check the parsed columns.
df.show(n=5)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|      q1|      q2|      q3|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|     8755|  1053|       1|          131|    44|       1|1:14.823|1:14.817|1:14.411|
|     8756|  1053|     815|            9|    11|       2|1:15.395|1:14.716|1:14.446|
|     8757|  1053|     830|            9|    33|       3|1:15.109|1:14.884|1:14.498|
|     8758|  1053|     844|            6|    16|       4|1:15.413|1:14.808|1:14.740|
|     8759|  1053|     842|          213|    10|       5|1:15.548|1:14.927|1:14.790|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
only showing top 5 rows



In [0]:
# Number of rows read for this file date.
df.count()

Out[65]: 20

### Step 2 - Rename columns and add new columns

In [0]:
# Switch the id columns from camelCase to snake_case, then stamp each row with
# the selected environment and the time of ingestion.
transformed_df = (
    df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("env", f.lit(env))
    .withColumn("ingestion_date", f.current_timestamp())
)

In [0]:
# Preview the first five rows after the renames and the two added columns.
transformed_df.show(n=5)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+---+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|env|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+---+--------------------+
|      8755|   1053|        1|           131|    44|       1|1:14.823|1:14.817|1:14.411|Dev|2023-06-17 17:11:...|
|      8756|   1053|      815|             9|    11|       2|1:15.395|1:14.716|1:14.446|Dev|2023-06-17 17:11:...|
|      8757|   1053|      830|             9|    33|       3|1:15.109|1:14.884|1:14.498|Dev|2023-06-17 17:11:...|
|      8758|   1053|      844|             6|    16|       4|1:15.413|1:14.808|1:14.740|Dev|2023-06-17 17:11:...|
|      8759|   1053|      842|           213|    10|       5|1:15.548|1:14.927|1:14.790|Dev|2023-06-17 17:11:...|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+

### Step 3 - Merge the output into the f1_processed.qualifying table via merge_delta_data

In [0]:
# NOTE(review): superseded write paths, kept commented out for reference only.
# The active write is the merge_delta_data(...) call in the following cell.
# dbutils.fs.rm("dbfs:/FileStore/tables/processed/qualifying", True)

# transformed_df \
# .write \
# .mode("overwrite") \
# .parquet(f"{processed_folder_path}/qualifying.parquet")
# transformed_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.qualifying")

In [0]:
# Match target (tgt) and incoming (src) rows on the qualifying id together with
# the race id; the helper receives 'race_id' as its final argument.
merge_condition = (
    "tgt.qualify_id = src.qualify_id "
    "AND tgt.race_id = src.race_id"
)
merge_delta_data(
    transformed_df,
    'f1_processed',
    'qualifying',
    processed_folder_path,
    merge_condition,
    'race_id',
)

In [0]:
%sql
-- Row count per race in the processed qualifying table, highest race_id first.
SELECT race_id, COUNT(1)
FROM f1_processed.qualifying
GROUP BY race_id
ORDER BY race_id DESC

race_id,count(1)
1053,20
1052,20
1047,20
1046,20
1045,20
1044,20
1043,20
1042,20
1041,20
1040,20
