In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare the p_data_source text widget (empty default) and read its current value.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare the p_file_date text widget (default "2021-03-21") and read its current value.
# The value is used below to build the input path and to tag each row.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

##### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [0]:
# Explicit schema for pit_stops.json so Spark does not have to infer types.
# Only raceId is declared non-nullable; every other field may be null.
pit_stops_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("driverId", IntegerType(), True),
        StructField("stop", StringType(), True),
        StructField("lap", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)

In [0]:
# Read the pit stops file for the requested file date with the explicit schema.
# multiLine is enabled so that JSON records spanning several lines are parsed.
# raw_folder_path is presumably defined by the configuration include -- confirm.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiLine", True)
    .json(f"{raw_folder_path}/{v_file_date}/pit_stops.json")
)

In [0]:
# Preview the raw DataFrame as read from the JSON file (inspection only).
display(pit_stops_df)

raceId,driverId,stop,lap,time,duration,milliseconds
1053,839,1,1,15:05:16,30.866,30866
1053,20,1,3,15:10:09,32.024,32024
1053,854,1,5,15:15:11,51.007,51007
1053,853,1,12,15:27:20,31.168,31168
1053,842,1,14,15:30:10,31.068,31068
1053,20,2,20,15:39:11,31.184,31184
1053,854,2,21,15:41:24,32.479,32479
1053,20,3,22,15:42:52,39.502,39502
1053,853,2,23,15:45:20,31.5,31500
1053,852,1,25,15:46:39,30.696,30696


##### Step 2 - Rename columns and add new columns
1. Rename driverId and raceId to driver_id and race_id
2. Add data_source and file_date from the notebook parameters
3. Add ingestion_date with current timestamp


In [0]:
from pyspark.sql.functions import lit

In [0]:
# Rename the camelCase id columns to snake_case and tag every row with the
# data source and file date taken from the notebook widgets.
pit_stops_with_columns_df = (
    pit_stops_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)
                                   

In [0]:
# add_ingestion_date is not defined in this notebook (presumably it comes from the
# common_functions include); the displayed output shows it adds an ingestion_date column.
pit_stops_with_columns_df = add_ingestion_date(pit_stops_with_columns_df)

In [0]:
# Preview the transformed DataFrame before it is written (inspection only).
display(pit_stops_with_columns_df)

race_id,driver_id,stop,lap,time,duration,milliseconds,data_source,file_date,ingestion_date
1053,839,1,1,15:05:16,30.866,30866,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,20,1,3,15:10:09,32.024,32024,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,854,1,5,15:15:11,51.007,51007,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,853,1,12,15:27:20,31.168,31168,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,842,1,14,15:30:10,31.068,31068,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,20,2,20,15:39:11,31.184,31184,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,854,2,21,15:41:24,32.479,32479,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,20,3,22,15:42:52,39.502,39502,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,853,2,23,15:45:20,31.5,31500,testing,2021-04-18,2024-02-04T08:29:14.135Z
1053,852,1,25,15:46:39,30.696,30696,testing,2021-04-18,2024-02-04T08:29:14.135Z


##### Step 3 - Merge the output into the processed container as a Delta table


In [0]:
# Merge this file's pit stops into f1_processed.pit_stops via the merge_delta_data helper.
# A target row matches a source row when race_id, driver_id and stop all agree; each
# predicate is listed exactly once (the condition previously repeated the race_id test).
# NOTE(review): merge_delta_data is not defined in this notebook; "race_id" is presumably
# the partition column and processed_folder_path the table's base location -- confirm
# against the helper's definition.
merge_condition = "tgt.race_id = src.race_id and tgt.driver_id = src.driver_id and tgt.stop = src.stop"
merge_delta_data(pit_stops_with_columns_df, "f1_processed", "pit_stops", "race_id", processed_folder_path, merge_condition)

In [0]:
%sql
-- Inspect the rows currently stored in the processed pit_stops table.
SELECT *
FROM f1_processed.pit_stops;

race_id,driver_id,stop,lap,time,duration,milliseconds,data_source,file_date,ingestion_date
936,833,1,6,14:15:43,26.392,26392,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,822,1,13,14:25:47,22.974,22974,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,826,1,13,14:25:51,22.072,22072,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,13,1,14,14:27:28,27.936,27936,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,830,1,14,14:27:30,22.536,22536,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,18,1,14,14:27:33,22.231,22231,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,154,1,14,14:27:34,22.805,22805,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,831,1,14,14:27:41,22.729,22729,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,807,1,15,14:28:49,22.573,22573,testing,2021-03-21,2024-02-04T08:28:21.711Z
936,832,1,15,14:28:58,25.096,25096,testing,2021-03-21,2024-02-04T08:28:21.711Z


In [0]:
%sql
-- Row count per race, highest race_id first, to check what has been loaded.
select race_id, count(1) from f1_processed.pit_stops
group by race_id
order by race_id desc;

race_id,count(1)
1053,56
1052,40
1047,23
1046,39
1045,57
1044,38
1043,30
1042,25
1041,33
1040,24


In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")