In [0]:
%run "./configuration"

In [0]:
# Read the Parquet dataset located at "<presentation_folder_path>/race_results".
# `presentation_folder_path` is not defined in this notebook; presumably it is
# provided by the "./configuration" notebook executed via %run -- TODO confirm.
race_results_df = spark.read.format("parquet").load(f"{presentation_folder_path}/race_results")

In [0]:
# Keep only the rows whose race_year is 2020 (filter() is an alias of where()).
demo_df = race_results_df.filter("race_year = 2020")

In [0]:
from pyspark.sql.functions import countDistinct, count, sum

In [0]:
# Count the distinct race_name values in demo_df as a single global aggregate.
demo_df.agg(countDistinct("race_name")).show()

+-------------------------+
|count(DISTINCT race_name)|
+-------------------------+
|                       17|
+-------------------------+



In [0]:
# Sum the points of the rows whose driver_name is 'Lewis Hamilton'.
# NOTE: `sum` is pyspark.sql.functions.sum (imported in this notebook), not the
# Python builtin.
(
    demo_df
    .filter("driver_name = 'Lewis Hamilton'")
    .select(sum("points"))
    .show()
)

+-----------+
|sum(points)|
+-----------+
|      347.0|
+-----------+



In [0]:
# Total points and number of distinct races for the rows whose driver_name is
# 'Lewis Hamilton'.
#
# The output columns are named with alias() where the aggregates are built,
# instead of renaming Spark's auto-generated names ("sum(points)",
# "count(DISTINCT race_name)") afterwards: withColumnRenamed() is a silent
# no-op when the old name does not match, so any difference in the generated
# name would leave the columns unrenamed without raising an error.
#
# NOTE: `sum` is pyspark.sql.functions.sum (imported in this notebook), not the
# Python builtin.
demo_df.where("driver_name = 'Lewis Hamilton'") \
    .select(
        sum("points").alias("total_point"),
        countDistinct("race_name").alias("number_of_races"),
    ) \
    .show()

+-----------+---------------+
|total_point|number_of_races|
+-----------+---------------+
|      347.0|             16|
+-----------+---------------+



### Group By aggregations

In [0]:
# Per-driver aggregation over demo_df: summed points and the number of
# distinct races. show() prints the result table.
(
    demo_df
    .groupBy("driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
    .show()
)

+------------------+------------+---------------+
|       driver_name|total_points|number_of_races|
+------------------+------------+---------------+
|       Jack Aitken|         0.0|              1|
|      Daniil Kvyat|        32.0|             17|
|   Kevin Magnussen|         1.0|             17|
|      Sergio Pérez|       125.0|             15|
|      Carlos Sainz|       105.0|             17|
|    Kimi Räikkönen|         4.0|             17|
|   Romain Grosjean|         2.0|             15|
|   Charles Leclerc|        98.0|             17|
|   Alexander Albon|       105.0|             17|
|      Lance Stroll|        75.0|             16|
|      Pierre Gasly|        75.0|             17|
|    Lewis Hamilton|       347.0|             16|
|   Nico Hülkenberg|        10.0|              3|
|  Daniel Ricciardo|       119.0|             17|
|   Valtteri Bottas|       223.0|             17|
|Antonio Giovinazzi|         4.0|             17|
|      Lando Norris|        97.0|             17|


### Window functions

In [0]:
# Rebind demo_df to the rows whose race_year is 2019 or 2020; this replaces the
# 2020-only frame that the earlier cells assigned to the same name.
demo_df = race_results_df.filter("race_year in (2019, 2020)")

In [0]:
# One row per (race_year, driver_name) with the summed points and the number
# of distinct races; this frame is the input to the ranking cells below.
demo_grouped_df = (
    demo_df
    .groupBy("race_year", "driver_name")
    .agg(
        sum("points").alias("total_points"),
        countDistinct("race_name").alias("number_of_races"),
    )
)

In [0]:
# Show the grouped totals ordered by race_year ascending, then total_points
# descending within each year.
display(
    demo_grouped_df.orderBy(
        demo_grouped_df["race_year"].asc(),
        demo_grouped_df["total_points"].desc(),
    )
)

race_year,driver_name,total_points,number_of_races
2019,Lewis Hamilton,413.0,21
2019,Valtteri Bottas,326.0,21
2019,Max Verstappen,278.0,21
2019,Charles Leclerc,264.0,21
2019,Sebastian Vettel,240.0,21
2019,Carlos Sainz,96.0,21
2019,Pierre Gasly,95.0,21
2019,Alexander Albon,92.0,21
2019,Daniel Ricciardo,54.0,21
2019,Sergio Pérez,52.0,21


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc

In [0]:
# Window covering each race_year separately, ordered by total_points from
# highest to lowest.
driver_rank_spec = (
    Window
    .partitionBy("race_year")
    .orderBy(desc("total_points"))
)

# Append a "rank" column computed over that window and print up to 50 rows.
demo_grouped_df.select("*", rank().over(driver_rank_spec).alias("rank")).show(50)

+---------+------------------+------------+---------------+----+
|race_year|       driver_name|total_points|number_of_races|rank|
+---------+------------------+------------+---------------+----+
|     2019|    Lewis Hamilton|       413.0|             21|   1|
|     2019|   Valtteri Bottas|       326.0|             21|   2|
|     2019|    Max Verstappen|       278.0|             21|   3|
|     2019|   Charles Leclerc|       264.0|             21|   4|
|     2019|  Sebastian Vettel|       240.0|             21|   5|
|     2019|      Carlos Sainz|        96.0|             21|   6|
|     2019|      Pierre Gasly|        95.0|             21|   7|
|     2019|   Alexander Albon|        92.0|             21|   8|
|     2019|  Daniel Ricciardo|        54.0|             21|   9|
|     2019|      Sergio Pérez|        52.0|             21|  10|
|     2019|      Lando Norris|        49.0|             21|  11|
|     2019|    Kimi Räikkönen|        43.0|             21|  12|
|     2019|   Nico Hülken