# Prepare Presentation layer

### Step 1 - Import all required tables

In [0]:
%run "../includes/configuration"

In [0]:
circuits_df = spark.read.parquet(f"{processed_folder_path}/circuits.parquet")

In [0]:
drivers_df = spark.read.parquet(f"{processed_folder_path}/drivers.parquet")

In [0]:
constructors_df = spark.read.parquet(f"{processed_folder_path}/constructors.parquet")

In [0]:
races_df = spark.read.parquet(f"{processed_folder_path}/races.parquet")

In [0]:
results_df = spark.read.parquet(f"{processed_folder_path}/results.parquet")

In [0]:
results_df.show(5)

+---------+---------+--------------+------+----+--------+-------------+--------------+------+----+----------+------------+-----------+----+----------------+-----------------+---+--------------------+-------+
|result_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|      time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|env|      ingestion_date|race_id|
+---------+---------+--------------+------+----+--------+-------------+--------------+------+----+----------+------------+-----------+----+----------------+-----------------+---+--------------------+-------+
|    19232|      657|           113|    14|  19|       1|            1|             1|   8.0| 200|3:49:17.27|    13757270|       null|null|              \N|               \N|Dev|2023-06-02 21:03:...|    800|
|    19233|      525|           114|     9|   3|       2|            2|             2|   6.0| 200|  +1:09.95|    13827220|       null|null|              \N|            

### Step 2 - Select required columns after joining tables

In [0]:
from pyspark.sql import functions as f

In [0]:
final_df = results_df.join(races_df, results_df.race_id == races_df.race_id, "left") \
                .join(drivers_df, results_df.driver_id == drivers_df.driver_id, "left") \
                .join(constructors_df, results_df.constructor_id == constructors_df.constructor_id, "left") \
                .join(circuits_df, races_df.circuit_id == circuits_df.circuit_id, "left") \
                .withColumn("created_date", f.current_timestamp()) \
                .select(races_df.race_year, races_df.name.alias("race_name"), f.to_date(races_df.race_timestamp).alias("race_date"), circuits_df.location.alias("circuit_location") \
                        , drivers_df.name.alias("driver_name"), drivers_df.number.alias("driver_number"), drivers_df.nationality.alias("driver_nationality"), constructors_df.name.alias("team") \
                        ,results_df.grid, results_df.fastest_lap, results_df.time.alias("race_time"), results_df.points)


In [0]:
# display(final_df)
final_df.show(5)

+---------+----------------+---------+----------------+-------------+-------------+------------------+------------+----+-----------+----------+------+
|race_year|       race_name|race_date|circuit_location|  driver_name|driver_number|driver_nationality|        team|grid|fastest_lap| race_time|points|
+---------+----------------+---------+----------------+-------------+-------------+------------------+------------+----+-----------+----------+------+
|     1954|Indianapolis 500|     null|    Indianapolis|Bill Vukovich|         null|          American|Kurtis Kraft|  19|       null|3:49:17.27|   8.0|
|     1954|Indianapolis 500|     null|    Indianapolis|  Jimmy Bryan|         null|          American|       Kuzma|   3|       null|  +1:09.95|   6.0|
|     1954|Indianapolis 500|     null|    Indianapolis| Jack McGrath|         null|          American|Kurtis Kraft|   1|       null|  +1:19.73|   5.0|
|     1954|Indianapolis 500|     null|    Indianapolis| Troy Ruttman|         null|          A

### Step 3 - Write data to the presentation layer

In [0]:
# Prepare the data for the visualization used on the site - https://www.bbc.com/sport/formula1/2020/abu-dhabi-grand-prix/results
display(final_df.filter("race_year = 2020 and race_name = 'Abu Dhabi Grand Prix'").orderBy(final_df.points.desc()))

race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,grid,fastest_lap,race_time,points
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Max Verstappen,33,Dutch,Red Bull,1,14,1:36:28.645,25.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Valtteri Bottas,77,Finnish,Mercedes,2,40,+15.976,18.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Lewis Hamilton,44,British,Mercedes,3,37,+18.415,15.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Alexander Albon,23,Thai,Red Bull,5,42,+19.987,12.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Lando Norris,4,British,McLaren,4,53,+1:00.729,10.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Carlos Sainz,55,Spanish,McLaren,6,48,+1:05.662,8.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Daniel Ricciardo,3,Australian,Renault,11,55,+1:13.748,7.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Pierre Gasly,10,French,AlphaTauri,9,53,+1:29.718,4.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Esteban Ocon,31,French,Renault,10,47,+1:41.069,2.0
2020,Abu Dhabi Grand Prix,2020-12-13,Abu Dhabi,Lance Stroll,18,Canadian,Racing Point,8,41,+1:42.738,1.0


In [0]:
final_df.write.mode("overwrite").parquet(f"{presentation_folder_path}/race_results.parquet")