# Data Ingestion Races

In [0]:
# Offer the target environment as a notebook dropdown (default "Dev") and
# capture whichever option is currently selected.
dbutils.widgets.dropdown(
    name="Environment",
    defaultValue="Dev",
    choices=["Prod", "Dev", "Test"],
    label="Environment",
)
env = dbutils.widgets.get("Environment")

In [0]:
# Text widget holding the file date (default "2021-03-21"); its current value
# is kept in v_file_date for the rest of the notebook.
dbutils.widgets.text(name="p_file_date", defaultValue="2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Import CSV data

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType
from pyspark.sql.functions import current_timestamp, lit, concat, to_timestamp, substring

In [0]:
# Explicit schema for races.csv, one entry per CSV column in file order.
# Arguments are (column name, Spark type, nullable); only raceId is declared
# non-nullable.
race_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Read races.csv from the raw folder for the selected file date, treating the
# first line as a header and applying race_schema instead of inferring types.
df = (
    spark.read
    .option("header", True)
    .schema(race_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/races.csv")
)

In [0]:
# Plain-text preview of the first five raw rows
# (display(df) is the rich-table alternative).
df.show(n=5)

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
only showing top 5 rows



In [0]:
# Print the column names, types and nullability of the raw DataFrame.
# Note: raceId is reported as nullable = true in the output even though
# race_schema declares it non-nullable.
df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- circuitId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- url: string (nullable = true)



### Step 2 - Transform data

In [0]:
# Rename the camelCase columns to snake_case, tag every row with the selected
# environment and the ingestion time, and discard the url column.
transformed_df = (
    df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumn("env", lit(env))
    .withColumn("ingestion_date", current_timestamp())
    .drop("url")
)

In [0]:
# Plain-text preview of the first five transformed rows
# (display(transformed_df) is the rich-table alternative).
transformed_df.show(n=5)

+-------+---------+-----+----------+--------------------+----------+--------+---+--------------------+
|race_id|race_year|round|circuit_id|                name|      date|    time|env|      ingestion_date|
+-------+---------+-----+----------+--------------------+----------+--------+---+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29|06:00:00|Dev|2023-06-17 16:34:...|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05|09:00:00|Dev|2023-06-17 16:34:...|
|      3|     2009|    3|        17|  Chinese Grand Prix|2009-04-19|07:00:00|Dev|2023-06-17 16:34:...|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2009-04-26|12:00:00|Dev|2023-06-17 16:34:...|
|      5|     2009|    5|         4|  Spanish Grand Prix|2009-05-10|12:00:00|Dev|2023-06-17 16:34:...|
+-------+---------+-----+----------+--------------------+----------+--------+---+--------------------+
only showing top 5 rows



In [0]:
# Pick the output columns and build race_timestamp by joining the race date and
# start time into a single "<date> <time>" string and parsing that once.
# The time text is concatenated directly: parsing it on its own first would
# anchor it to the current date only to slice the time-of-day back out again,
# which makes the result depend on the run date, session timezone and
# timestamp-to-string layout.
# Columns are referenced by name rather than attribute access so that names
# such as "round", "name" or "date" can never collide with DataFrame attributes.
# Rows whose time is missing or unparseable end up with a null race_timestamp
# (assuming Spark's default, non-ANSI parsing — TODO confirm cluster setting).
# Note: the "env" column present on transformed_df is not selected here, so it
# is absent from final_df.
final_df = (
    transformed_df
    .select(
        "race_id",
        "race_year",
        "round",
        "circuit_id",
        "name",
        to_timestamp(
            concat(transformed_df["date"], lit(" "), transformed_df["time"])
        ).alias("race_timestamp"),
        "ingestion_date",
    )
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Plain-text preview of the first five output rows
# (display(final_df) is the rich-table alternative).
final_df.show(n=5)

+-------+---------+-----+----------+--------------------+-------------------+--------------------+----------+
|race_id|race_year|round|circuit_id|                name|     race_timestamp|      ingestion_date| file_date|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+----------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29 06:00:00|2023-06-17 16:34:...|2021-03-21|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05 09:00:00|2023-06-17 16:34:...|2021-03-21|
|      3|     2009|    3|        17|  Chinese Grand Prix|2009-04-19 07:00:00|2023-06-17 16:34:...|2021-03-21|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2009-04-26 12:00:00|2023-06-17 16:34:...|2021-03-21|
|      5|     2009|    5|         4|  Spanish Grand Prix|2009-05-10 12:00:00|2023-06-17 16:34:...|2021-03-21|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+----------+
only showing top 5 rows

### Step 3 - Load data into Data Lake

In [0]:
# Remove any existing files under the processed races folder (recursively),
# then rewrite f1_processed.races as a Delta table partitioned by race_year.
# Earlier variants of this step wrote Parquet (to a path, or as a table); the
# Delta table write below is the one in use.
# NOTE(review): the removal path is hard-coded and presumably matches the
# table's storage location — confirm against the database definition.
dbutils.fs.rm("dbfs:/FileStore/tables/processed/races", True)
(
    final_df.write
    .format("delta")
    .partitionBy("race_year")
    .mode("overwrite")
    .saveAsTable("f1_processed.races")
)

In [0]:
%sql
-- Preview the rows now stored in the processed races table.
SELECT *
FROM f1_processed.races;

race_id,race_year,round,circuit_id,name,race_timestamp,ingestion_date,file_date
1053,2021,2,21,Emilia Romagna Grand Prix,2021-04-18T13:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1052,2021,1,3,Bahrain Grand Prix,2021-03-28T15:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1051,2021,21,1,Australian Grand Prix,2021-11-21T06:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1054,2021,3,20,TBC,,2023-06-17T16:34:38.862+0000,2021-03-21
1055,2021,4,4,Spanish Grand Prix,2021-05-09T13:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1056,2021,5,6,Monaco Grand Prix,2021-05-23T13:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1057,2021,6,73,Azerbaijan Grand Prix,2021-06-06T12:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1058,2021,7,7,Canadian Grand Prix,2021-06-13T18:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1059,2021,8,34,French Grand Prix,2021-06-27T13:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
1060,2021,9,70,Austrian Grand Prix,2021-07-04T13:00:00.000+0000,2023-06-17T16:34:38.862+0000,2021-03-21
