# Data Ingestion Races

In [0]:
# Notebook parameter: lets the caller choose the target environment.
dbutils.widgets.dropdown(
    "Environment",            # widget name (used again by widgets.get below)
    "Dev",                    # default selection
    ["Prod", "Dev", "Test"],  # allowed choices
    "Environment",            # label
)
# Read the current selection; it is stamped onto the rows in Step 2.
env = dbutils.widgets.get("Environment")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Import CSV data

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType
from pyspark.sql.functions import current_timestamp, lit, concat, to_timestamp, substring

In [0]:
# Explicit schema for races.csv so Spark does not have to infer column types.
# NOTE(review): raceId is declared non-nullable here, yet the printSchema()
# output further down reports it as nullable = true, so this flag does not
# appear to be enforced on read — confirm before relying on it.
race_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("year", IntegerType(), True),
        StructField("round", IntegerType(), True),
        StructField("circuitId", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("date", DateType(), True),
        # Kept as a string: it is combined with `date` into a timestamp later.
        StructField("time", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

In [0]:
# Load the raw races file with the explicit schema; the first line is a header.
# `raw_folder_path` is not defined in this notebook — presumably it comes from
# the configuration include run above.
df = (
    spark.read
    .schema(race_schema)
    .option("header", True)
    .csv(f"{raw_folder_path}/races.csv")
)

In [0]:
# Plain-text preview of the first five raw rows
# (the display(df) alternative is intentionally left disabled).
df.show(n=5)

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
only showing top 5 rows



In [0]:
# Print the column names, types and nullability of the raw DataFrame so the
# applied schema can be checked against what was declared in race_schema.
df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- circuitId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- url: string (nullable = true)



### Step 2 - Transform data

In [0]:
# Normalise column names to snake_case, tag each row with the selected
# environment and the ingestion time, and discard the unused url column.
transformed_df = (
    df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumn("env", lit(env))
    .withColumn("ingestion_date", current_timestamp())
    .drop("url")
)

In [0]:
# Plain-text preview of the first five renamed/tagged rows
# (the display(transformed_df) alternative is intentionally left disabled).
transformed_df.show(n=5)

+-------+---------+-----+----------+--------------------+----------+--------+---+--------------------+
|race_id|race_year|round|circuit_id|                name|      date|    time|env|      ingestion_date|
+-------+---------+-----+----------+--------------------+----------+--------+---+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29|06:00:00|Dev|2023-06-02 21:02:...|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05|09:00:00|Dev|2023-06-02 21:02:...|
|      3|     2009|    3|        17|  Chinese Grand Prix|2009-04-19|07:00:00|Dev|2023-06-02 21:02:...|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2009-04-26|12:00:00|Dev|2023-06-02 21:02:...|
|      5|     2009|    5|         4|  Spanish Grand Prix|2009-05-10|12:00:00|Dev|2023-06-02 21:02:...|
+-------+---------+-----+----------+--------------------+----------+--------+---+--------------------+
only showing top 5 rows



In [0]:
# Build the final projection: keep the identifying columns, merge the separate
# `date` and `time` columns into a single `race_timestamp`, and keep the
# ingestion timestamp.
#
# The timestamp is built by concatenating "<date> <time>" and parsing it once.
# The earlier form parsed `time` on its own (which attaches the current date in
# the session time zone), rendered that back to text and sliced the time part
# out by character position; that made the result depend on the run date /
# time zone and on the exact width of the rendered timestamp.
# Rows whose `time` cannot be parsed still end up with a null race_timestamp.
#
# NOTE: the `env` column added in the previous step is not selected here, so it
# is not part of the data written in Step 3.
final_df = transformed_df.select(
    transformed_df["race_id"],
    transformed_df["race_year"],
    transformed_df["round"],
    transformed_df["circuit_id"],
    transformed_df["name"],
    to_timestamp(
        concat(transformed_df["date"], lit(" "), transformed_df["time"])
    ).alias("race_timestamp"),
    transformed_df["ingestion_date"],
)

In [0]:
# Plain-text preview of the first five rows of the final projection
# (the display(final_df) alternative is intentionally left disabled).
final_df.show(n=5)

+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|race_id|race_year|round|circuit_id|                name|     race_timestamp|      ingestion_date|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29 06:00:00|2023-06-02 21:02:...|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05 09:00:00|2023-06-02 21:02:...|
|      3|     2009|    3|        17|  Chinese Grand Prix|2009-04-19 07:00:00|2023-06-02 21:02:...|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2009-04-26 12:00:00|2023-06-02 21:02:...|
|      5|     2009|    5|         4|  Spanish Grand Prix|2009-05-10 12:00:00|2023-06-02 21:02:...|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+
only showing top 5 rows



### Step 3 - Load data into Data Lake

In [0]:
# Persist the result as Parquet, one sub-directory per race_year.
# "overwrite" replaces whatever a previous run left at this path, so the cell
# can be re-run. `processed_folder_path` is not defined in this notebook —
# presumably it comes from the configuration include run at the top.
(
    final_df.write
    .mode("overwrite")
    .partitionBy("race_year")
    .parquet(f"{processed_folder_path}/races.parquet")
)

In [0]:
%fs
ls "/FileStore/tables"

path,name,size,modificationTime
dbfs:/FileStore/tables/ContainsNull.csv,ContainsNull.csv,61,1685642371000
dbfs:/FileStore/tables/appl_stock.csv,appl_stock.csv,143130,1685638993000
dbfs:/FileStore/tables/circuits/,circuits/,0,0
dbfs:/FileStore/tables/circuits.csv,circuits.csv,10044,1685665880000
dbfs:/FileStore/tables/circuits.parquet/,circuits.parquet/,0,0
dbfs:/FileStore/tables/circuits1.parquet/,circuits1.parquet/,0,0
dbfs:/FileStore/tables/circuits12/,circuits12/,0,0
dbfs:/FileStore/tables/constructors.json,constructors.json,30415,1685706757000
dbfs:/FileStore/tables/constructors.parquet/,constructors.parquet/,0,0
dbfs:/FileStore/tables/drivers.json,drivers.json,180812,1685707842000


In [0]:
# Read the freshly written dataset back to verify the load step.
new_df = spark.read.format("parquet").load(f"{processed_folder_path}/races.parquet")

In [0]:
# Plain-text preview of the first five rows read back from Parquet
# (the display(new_df) alternative is intentionally left disabled).
new_df.show(n=5)

+-------+-----+----------+--------------------+-------------------+--------------------+---------+
|race_id|round|circuit_id|                name|     race_timestamp|      ingestion_date|race_year|
+-------+-----+----------+--------------------+-------------------+--------------------+---------+
|   1053|    2|        21|Emilia Romagna Gr...|2021-04-18 13:00:00|2023-06-02 21:02:...|     2021|
|   1052|    1|         3|  Bahrain Grand Prix|2021-03-28 15:00:00|2023-06-02 21:02:...|     2021|
|   1051|   21|         1|Australian Grand ...|2021-11-21 06:00:00|2023-06-02 21:02:...|     2021|
|   1054|    3|        20|                 TBC|               null|2023-06-02 21:02:...|     2021|
|   1055|    4|         4|  Spanish Grand Prix|2021-05-09 13:00:00|2023-06-02 21:02:...|     2021|
+-------+-----+----------+--------------------+-------------------+--------------------+---------+
only showing top 5 rows

