
## Ingest lap_times folder

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare the "p_data_source" text widget (default: empty string) and read its current value.
# The value is later stamped onto every row as the data_source column.
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare the "p_file_date" text widget (default: "2021-03-21") and read its current value.
# The value selects the dated sub-folder to read and is stamped onto every row as file_date.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

##### Step 1 - Read the lap_times CSV files using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the lap_times CSV files; only raceId is declared non-nullable.
# NOTE(review): Spark file sources may relax non-nullable fields to nullable on read - confirm
# if null enforcement on raceId is actually expected.
lap_times_schema = StructType([
    StructField("raceId", IntegerType(), nullable=False),
    StructField("driverId", IntegerType()),
    StructField("lap", IntegerType()),
    StructField("position", IntegerType()),
    StructField("time", StringType()),
    StructField("milliseconds", IntegerType()),
])

In [0]:
# Read everything under the lap_times folder for the selected file date, applying the explicit schema.
# A glob such as .../lap_times/lap_times*.csv could be passed instead of the folder path.
lap_times_schema_df = (
    spark.read
    .schema(lap_times_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/lap_times")
)

In [0]:
# Sanity check: number of rows read (count() is a Spark action, so this triggers the read).
lap_times_schema_df.count()

1124

In [0]:
# Preview the raw rows to confirm the schema was applied as expected.
display(lap_times_schema_df)

raceId,driverId,lap,position,time,milliseconds
1053,830,1,1,1:38.603,98603
1053,830,2,1,2:29.163,149163
1053,830,3,1,2:23.247,143247
1053,830,4,1,2:20.332,140332
1053,830,5,1,2:25.691,145691
1053,830,6,1,2:20.804,140804
1053,830,7,1,1:36.303,96303
1053,830,8,1,1:32.925,92925
1053,830,9,1,1:30.953,90953
1053,830,10,1,1:30.130,90130


##### Step 2 - Rename columns and add new columns
1. Rename raceId and driverId to race_id and driver_id
2. Add ingestion_date with current timestamp
3. Add data_source and file_date from the notebook widgets


In [0]:
from pyspark.sql.functions import lit

In [0]:
# Snake-case the id columns and stamp every row with the widget-supplied data source and file date.
lap_times_with_columns_df = (
    lap_times_schema_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# add_ingestion_date is not defined in this notebook - presumably it comes from
# ../includes/common_functions and appends an ingestion_date column; confirm there.
lap_times_with_columns_df = add_ingestion_date(lap_times_with_columns_df)

In [0]:
# Preview the transformed rows, including the added columns.
display(lap_times_with_columns_df)

race_id,driver_id,lap,position,time,milliseconds,data_source,file_date,ingestion_date
1053,830,1,1,1:38.603,98603,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,2,1,2:29.163,149163,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,3,1,2:23.247,143247,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,4,1,2:20.332,140332,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,5,1,2:25.691,145691,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,6,1,2:20.804,140804,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,7,1,1:36.303,96303,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,8,1,1:32.925,92925,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,9,1,1:30.953,90953,testing,2021-04-18,2024-02-04T08:35:47.951Z
1053,830,10,1,1:30.130,90130,testing,2021-04-18,2024-02-04T08:35:47.951Z


##### Step 3 - Write the output to the processed container as a Delta table


In [0]:
# Merge the prepared lap times into f1_processed.lap_times.
#
# A lap time row is identified by (race_id, driver_id, lap); see
# https://ergast.com/docs/f1db_user_guide.txt. The merge condition should contain only the
# primary-key columns plus the partition column. race_id is both, so it appears exactly once
# (the previous condition repeated the race_id predicate, which was redundant).
#
# NOTE(review): merge_delta_data is not defined in this notebook - presumably it comes from
# ../includes/common_functions and takes (dataframe, database, table, partition column,
# target folder path, merge condition), with "tgt"/"src" as the target/source aliases; confirm there.
merge_condition = "tgt.race_id = src.race_id and tgt.driver_id = src.driver_id and tgt.lap = src.lap"
merge_delta_data(lap_times_with_columns_df, "f1_processed", "lap_times", "race_id", processed_folder_path, merge_condition)

In [0]:
%sql
-- Inspect the target table to verify the load.
SELECT *
FROM f1_processed.lap_times;

race_id,driver_id,lap,position,time,milliseconds,data_source,file_date,ingestion_date
847,20,1,1,2:18.174,138174,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,2,1,2:06.919,126919,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,3,1,2:05.303,125303,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,4,1,2:05.715,125715,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,5,1,1:36.175,96175,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,6,1,1:34.827,94827,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,7,1,1:35.452,95452,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,8,1,1:41.724,101724,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,9,1,2:13.172,133172,testing,2021-03-21,2024-02-04T08:34:26.819Z
847,20,10,1,2:08.183,128183,testing,2021-03-21,2024-02-04T08:34:26.819Z


In [0]:
# End the notebook run and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")