In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("LazyEvaluationDemo").getOrCreate()

In [3]:
spark

In [4]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

races_schema = StructType([
     StructField("raceId", IntegerType(), True),
    StructField("year", IntegerType(), True),
    StructField("round", IntegerType(), True),
    StructField("circuitId", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("date", StringType(), True),
    StructField("time", StringType(), True),
    StructField("url", StringType(), True)

])

races_df = spark.read.csv("F1_Complete_Dataset/races.csv", header = True, schema = races_schema)
        

In [5]:
races_df.show(10, truncate = False)

+------+----+-----+---------+---------------------+----------+--------+-------------------------------------------------------+
|raceId|year|round|circuitId|name                 |date      |time    |url                                                    |
+------+----+-----+---------+---------------------+----------+--------+-------------------------------------------------------+
|1     |2009|1    |1        |Australian Grand Prix|2009-03-29|06:00:00|http://en.wikipedia.org/wiki/2009_Australian_Grand_Prix|
|2     |2009|2    |2        |Malaysian Grand Prix |2009-04-05|09:00:00|http://en.wikipedia.org/wiki/2009_Malaysian_Grand_Prix |
|3     |2009|3    |17       |Chinese Grand Prix   |2009-04-19|07:00:00|http://en.wikipedia.org/wiki/2009_Chinese_Grand_Prix   |
|4     |2009|4    |3        |Bahrain Grand Prix   |2009-04-26|12:00:00|http://en.wikipedia.org/wiki/2009_Bahrain_Grand_Prix   |
|5     |2009|5    |4        |Spanish Grand Prix   |2009-05-10|12:00:00|http://en.wikipedia.org/wiki/2009

In [6]:
filtered_df = races_df.filter(races_df.year == 2015)
selected_df = filtered_df.select("raceId","year","circuitId","name")
uppercase_df = selected_df.withColumnRenamed("name","CIRCUIT_NAME")

print("Transformation applied but nothing executed yet.")

Transformation applied but nothing executed yet.


In [7]:
uppercase_df.show(n = 5)

+------+----+---------+-------------------+
|raceId|year|circuitId|       CIRCUIT_NAME|
+------+----+---------+-------------------+
|   931|2015|        6|  Monaco Grand Prix|
|   932|2015|        7|Canadian Grand Prix|
|   929|2015|        3| Bahrain Grand Prix|
|   930|2015|        4| Spanish Grand Prix|
|   928|2015|       17| Chinese Grand Prix|
+------+----+---------+-------------------+
only showing top 5 rows



In [8]:
count_df = uppercase_df.filter(uppercase_df.CIRCUIT_NAME == "Canadian Grand prix")
print("No Execution yet!")

No Execution yet!


In [9]:
print("Number of British drivers:", count_df.count())

Number of British drivers: 0


In [13]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

races = (
    spark.read.option("header",True).option("inferSchema", True)
        .csv("F1_Complete_Dataset/races.csv",nullValue="\\n")
        .select("raceId","year","name")
)

results = (
    spark.read.option("header", True).option("inferSchema", True)
        .csv("F1_Complete_Dataset/results.csv",nullValue="\\N")
         .select("raceId","driverId","constructorId","grid","position","points","laps","milliseconds","statusId")
)

drivers = (
    spark.read.option("header", True).option("inferSchema",True)
        .csv("F1_Complete_Dataset/drivers.csv",nullValue="\\N")
         .select("driverId", concat_ws(" ", "forename", "surname").alias("driver_name"))
)

fact = (
    results.join(races, "raceId", "left")
           .join(drivers,"driverId","left")
)

fact.show()
    

+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+------------------+
|driverId|raceId|constructorId|grid|position|points|laps|milliseconds|statusId|year|                name|       driver_name|
+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+------------------+
|       1|    18|            1|   1|       1|  10.0|  58|     5690616|       1|2008|Australian Grand ...|    Lewis Hamilton|
|       2|    18|            2|   5|       2|   8.0|  58|     5696094|       1|2008|Australian Grand ...|     Nick Heidfeld|
|       3|    18|            3|   7|       3|   6.0|  58|     5698779|       1|2008|Australian Grand ...|      Nico Rosberg|
|       4|    18|            4|  11|       4|   5.0|  58|     5707797|       1|2008|Australian Grand ...|   Fernando Alonso|
|       5|    18|            1|   3|       5|   4.0|  58|     5708630|       1|2008|Australian Grand ...| Heikki Kovalainen|


In [14]:
fact.count()

26759

In [16]:
fact.agg(count('*').alias('TotalNumRec')).show()

+-----------+
|TotalNumRec|
+-----------+
|      26759|
+-----------+



In [17]:
fact.agg(count('milliseconds').alias('TotalNumRaces')).show()

+-------------+
|TotalNumRaces|
+-------------+
|         7680|
+-------------+



In [18]:
fact.agg(max('points').alias('Max_Points')).show()

+----------+
|Max_Points|
+----------+
|      50.0|
+----------+



In [19]:
fact.agg(count('*').alias('Total_Num_Rec'),sum('points').alias('Total_Points'),avg('points').alias('Avg_Points')).show()

+-------------+------------+------------------+
|Total_Num_Rec|Total_Points|        Avg_Points|
+-------------+------------+------------------+
|        26759|    53187.05|1.9876321985126502|
+-------------+------------+------------------+



In [20]:
fact.groupBy('driver_name').agg(sum('points')).show()

+--------------------+-----------+
|         driver_name|sum(points)|
+--------------------+-----------+
|  Piercarlo Ghinzani|        2.0|
|Alessandro Pesent...|        0.0|
|      Richie Ginther|      107.0|
|       Bill Vukovich|       19.0|
|         Leslie Marr|        0.0|
|         John Barber|        0.0|
|         Hideki Noda|        0.0|
|        Alex Ribeiro|        0.0|
|     François Cevert|       89.0|
|     Lance Reventlow|        0.0|
|           Tony Rolt|        0.0|
|         Travis Webb|        0.0|
|      Bill Schindler|        0.0|
|      Hector Rebaque|       13.0|
|      Lella Lombardi|        0.5|
|           Ivor Bueb|        0.0|
|    Giovanni Lavaggi|        0.0|
|      Andy Sutcliffe|        0.0|
|       Innes Ireland|       47.0|
|        Heini Walter|        0.0|
+--------------------+-----------+
only showing top 20 rows



In [21]:
fact.groupBy('driver_name').agg(sum('points').alias('Total_Points_by_Driver')).orderBy(desc('Total_Points_by_Driver')).show()

+------------------+----------------------+
|       driver_name|Total_Points_by_Driver|
+------------------+----------------------+
|    Lewis Hamilton|                4820.5|
|  Sebastian Vettel|                3098.0|
|    Max Verstappen|                2912.5|
|   Fernando Alonso|                2329.0|
|    Kimi Räikkönen|                1873.0|
|   Valtteri Bottas|                1788.0|
|      Nico Rosberg|                1594.5|
|      Sergio Pérez|                1585.0|
|Michael Schumacher|                1566.0|
|   Charles Leclerc|                1363.0|
|  Daniel Ricciardo|                1320.0|
|     Jenson Button|                1235.0|
|      Carlos Sainz|                1203.5|
|      Felipe Massa|                1167.0|
|       Mark Webber|                1047.5|
|      Lando Norris|                 950.0|
|       Alain Prost|                 798.5|
|    George Russell|                 664.0|
|Rubens Barrichello|                 658.0|
|      Ayrton Senna|            

In [22]:
fact.groupBy("year").agg(
    count("*").alias("total_rows"),
    count("position").alias("total_non_null_rows")
).orderBy(fact.year.desc()).show()

+----+----------+-------------------+
|year|total_rows|total_non_null_rows|
+----+----------+-------------------+
|2024|       479|                431|
|2023|       440|                386|
|2022|       440|                376|
|2021|       440|                388|
|2020|       340|                287|
|2019|       420|                370|
|2018|       420|                340|
|2017|       400|                314|
|2016|       462|                383|
|2015|       378|                303|
|2014|       407|                327|
|2013|       418|                369|
|2012|       480|                402|
|2011|       456|                374|
|2010|       456|                347|
|2009|       340|                279|
|2008|       368|                287|
|2007|       374|                281|
|2006|       396|                274|
|2005|       376|                279|
+----+----------+-------------------+
only showing top 20 rows



In [23]:
fact.groupBy("year").agg(
    sum("points").alias("total_points"),
    avg("points").alias("avg_points"),
    min("points").alias("min_points"),
    max("points").alias("max_points"),
    stddev("points").alias("std_dev_points")
).orderBy(fact.year.desc()).show(10)
    

+----+------------+------------------+----------+----------+-----------------+
|year|total_points|        avg_points|min_points|max_points|   std_dev_points|
+----+------------+------------------+----------+----------+-----------------+
|2024|      2443.0|5.1002087682672235|       0.0|      26.0|7.231843515023035|
|2023|      2242.0| 5.095454545454546|       0.0|      26.0|7.258643564664155|
|2022|      2242.0| 5.095454545454546|       0.0|      26.0|7.263662942161959|
|2021|      2189.5| 4.976136363636364|       0.0|      26.0|7.154414574825085|
|2020|      1734.0|               5.1|       0.0|      26.0|7.257302293426333|
|2019|      2140.0| 5.095238095238095|       0.0|      26.0|7.259622940725277|
|2018|      2121.0|              5.05|       0.0|      25.0|7.181245462744008|
|2017|      2020.0|              5.05|       0.0|      25.0| 7.18167397636212|
|2016|      2121.0| 4.590909090909091|       0.0|      25.0|6.998866009508219|
|2015|      1919.0| 5.076719576719577|       0.0|   

In [24]:
fact.groupBy("year").agg(countDistinct("driverId").alias("distinct_drivers")).orderBy(fact.year.desc()).show(50)

+----+----------------+
|year|distinct_drivers|
+----+----------------+
|2024|              24|
|2023|              22|
|2022|              22|
|2021|              21|
|2020|              23|
|2019|              20|
|2018|              20|
|2017|              25|
|2016|              24|
|2015|              22|
|2014|              24|
|2013|              23|
|2012|              25|
|2011|              28|
|2010|              27|
|2009|              25|
|2008|              22|
|2007|              26|
|2006|              27|
|2005|              27|
|2004|              25|
|2003|              24|
|2002|              23|
|2001|              26|
|2000|              23|
|1999|              24|
|1998|              23|
|1997|              28|
|1996|              24|
|1995|              35|
|1994|              46|
|1993|              35|
|1992|              37|
|1991|              41|
|1990|              40|
|1989|              47|
|1988|              36|
|1987|              32|
|1986|          

In [27]:
fact.groupBy("year").agg(countDistinct("driverId").alias("distinct_drivers")).where(col('distinct_drivers')>50).show()

+----+----------------+
|year|distinct_drivers|
+----+----------------+
|1959|              88|
|1975|              52|
|1977|              61|
|1974|              62|
|1955|              84|
|1961|              62|
|1952|             105|
|1956|              85|
|1951|              84|
|1950|              81|
|1957|              76|
|1963|              62|
|1965|              54|
|1960|              91|
|1953|             108|
|1958|              87|
|1954|              97|
|1976|              54|
|1962|              61|
+----+----------------+



In [29]:
fact.groupBy("driver_name").agg(sum("points").alias("sum_points")).where(col('sum_points')>2000).orderBy(desc('sum_points')).show()

+----------------+----------+
|     driver_name|sum_points|
+----------------+----------+
|  Lewis Hamilton|    4820.5|
|Sebastian Vettel|    3098.0|
|  Max Verstappen|    2912.5|
| Fernando Alonso|    2329.0|
+----------------+----------+



In [30]:
fact.where(col('year')>2020).groupBy("year").agg(
     sum("points").alias("total_points"),
    avg("points").alias("avg_points"),
    min("points").alias("min_points"),
    max("points").alias("max_points"),
    stddev("points").alias("std_dev_points")
).where(col('total_points')>2000).orderBy(fact.year.desc()).show(10)
    

+----+------------+------------------+----------+----------+-----------------+
|year|total_points|        avg_points|min_points|max_points|   std_dev_points|
+----+------------+------------------+----------+----------+-----------------+
|2024|      2443.0|5.1002087682672235|       0.0|      26.0|7.231843515023035|
|2023|      2242.0| 5.095454545454546|       0.0|      26.0|7.258643564664155|
|2022|      2242.0| 5.095454545454546|       0.0|      26.0|7.263662942161959|
|2021|      2189.5| 4.976136363636364|       0.0|      26.0|7.154414574825085|
+----+------------+------------------+----------+----------+-----------------+



In [31]:
fact.groupBy("year").agg(
    sum("points").alias("total_points"),
    avg("points").alias("avg_points"),
    min("points").alias("min_points"),
    max("points").alias("max_points"),
    stddev("points").alias("std_dev_points")
).where(col('total_points')>2000).orderBy(fact.year.desc()).where(col('year')>2020).show(5)
    

+----+------------+------------------+----------+----------+-----------------+
|year|total_points|        avg_points|min_points|max_points|   std_dev_points|
+----+------------+------------------+----------+----------+-----------------+
|2024|      2443.0|5.1002087682672235|       0.0|      26.0|7.231843515023035|
|2023|      2242.0| 5.095454545454546|       0.0|      26.0|7.258643564664155|
|2022|      2242.0| 5.095454545454546|       0.0|      26.0|7.263662942161959|
|2021|      2189.5| 4.976136363636364|       0.0|      26.0|7.154414574825085|
+----+------------+------------------+----------+----------+-----------------+



In [32]:
fact.orderBy(fact.year.desc()).groupBy("year").agg(
    sum("points").alias("total_points"),
    avg("points").alias("avg_points"),
    min("points").alias("min_points"),
    max("points").alias("max_points"),
    stddev("points").alias("std_dev_points")
).where(sum("points")>2000).where(col('year')>2020).show(5)

+----+------------+------------------+----------+----------+-----------------+
|year|total_points|        avg_points|min_points|max_points|   std_dev_points|
+----+------------+------------------+----------+----------+-----------------+
|2024|      2443.0|5.1002087682672235|       0.0|      26.0|7.231843515023035|
|2023|      2242.0| 5.095454545454546|       0.0|      26.0|7.258643564664155|
|2022|      2242.0| 5.095454545454546|       0.0|      26.0|7.263662942161959|
|2021|      2189.5| 4.976136363636364|       0.0|      26.0|7.154414574825085|
+----+------------+------------------+----------+----------+-----------------+



In [33]:
driver_year_points = (
    fact.groupBy("year","driver_name")
        .agg(sum("points").alias("points"))
        .orderBy("year", desc("points"))
)
driver_year_points.show(20,truncate = False)

+----+------------------+------+
|year|driver_name       |points|
+----+------------------+------+
|1950|Nino Farina       |30.0  |
|1950|Luigi Fagioli     |28.0  |
|1950|Juan Fangio       |27.0  |
|1950|Louis Rosier      |13.0  |
|1950|Alberto Ascari    |11.0  |
|1950|Johnnie Parsons   |9.0   |
|1950|Bill Holland      |6.0   |
|1950|Prince Bira       |5.0   |
|1950|Peter Whitehead   |4.0   |
|1950|Louis Chiron      |4.0   |
|1950|Reg Parnell       |4.0   |
|1950|Mauri Rose        |4.0   |
|1950|Philippe Étancelin|3.0   |
|1950|Dorino Serafini   |3.0   |
|1950|Robert Manzon     |3.0   |
|1950|Raymond Sommer    |3.0   |
|1950|Cecil Green       |3.0   |
|1950|Yves Cabantous    |3.0   |
|1950|Felice Bonetto    |2.0   |
|1950|Tony Bettenhausen |1.0   |
+----+------------------+------+
only showing top 20 rows



In [34]:
driver_year_points.filter(col("driver_name") == 'Lewis Hamilton').orderBy(desc('year')).show()

+----+--------------+------+
|year|   driver_name|points|
+----+--------------+------+
|2024|Lewis Hamilton| 207.0|
|2023|Lewis Hamilton| 217.0|
|2022|Lewis Hamilton| 233.0|
|2021|Lewis Hamilton| 385.5|
|2020|Lewis Hamilton| 347.0|
|2019|Lewis Hamilton| 413.0|
|2018|Lewis Hamilton| 408.0|
|2017|Lewis Hamilton| 363.0|
|2016|Lewis Hamilton| 380.0|
|2015|Lewis Hamilton| 381.0|
|2014|Lewis Hamilton| 384.0|
|2013|Lewis Hamilton| 189.0|
|2012|Lewis Hamilton| 190.0|
|2011|Lewis Hamilton| 227.0|
|2010|Lewis Hamilton| 240.0|
|2009|Lewis Hamilton|  49.0|
|2008|Lewis Hamilton|  98.0|
|2007|Lewis Hamilton| 109.0|
+----+--------------+------+



In [35]:
driver_year_points.filter(col("driver_name") == 'Max Verstappen').orderBy(desc('year')).show()

+----+--------------+------+
|year|   driver_name|points|
+----+--------------+------+
|2024|Max Verstappen| 399.0|
|2023|Max Verstappen| 530.0|
|2022|Max Verstappen| 433.0|
|2021|Max Verstappen| 388.5|
|2020|Max Verstappen| 214.0|
|2019|Max Verstappen| 278.0|
|2018|Max Verstappen| 249.0|
|2017|Max Verstappen| 168.0|
|2016|Max Verstappen| 204.0|
|2015|Max Verstappen|  49.0|
+----+--------------+------+



In [36]:
driver_year_points.filter(col("driver_name") == 'Michael Schumacher').orderBy(desc('year')).show()

+----+------------------+------+
|year|       driver_name|points|
+----+------------------+------+
|2012|Michael Schumacher|  49.0|
|2011|Michael Schumacher|  76.0|
|2010|Michael Schumacher|  72.0|
|2006|Michael Schumacher| 121.0|
|2005|Michael Schumacher|  62.0|
|2004|Michael Schumacher| 148.0|
|2003|Michael Schumacher|  93.0|
|2002|Michael Schumacher| 144.0|
|2001|Michael Schumacher| 123.0|
|2000|Michael Schumacher| 108.0|
|1999|Michael Schumacher|  44.0|
|1998|Michael Schumacher|  86.0|
|1997|Michael Schumacher|  78.0|
|1996|Michael Schumacher|  59.0|
|1995|Michael Schumacher| 102.0|
|1994|Michael Schumacher|  92.0|
|1993|Michael Schumacher|  52.0|
|1992|Michael Schumacher|  53.0|
|1991|Michael Schumacher|   4.0|
+----+------------------+------+



In [37]:
driver_year_points.filter(col("points") > 200).orderBy(desc('year'),desc('points')).show()

+----+---------------+------+
|year|    driver_name|points|
+----+---------------+------+
|2024| Max Verstappen| 399.0|
|2024|   Lando Norris| 344.0|
|2024|Charles Leclerc| 327.0|
|2024|  Oscar Piastri| 265.0|
|2024|   Carlos Sainz| 262.0|
|2024| George Russell| 226.0|
|2024| Lewis Hamilton| 207.0|
|2023| Max Verstappen| 530.0|
|2023|   Sergio Pérez| 260.0|
|2023| Lewis Hamilton| 217.0|
|2022| Max Verstappen| 433.0|
|2022|Charles Leclerc| 291.0|
|2022|   Sergio Pérez| 291.0|
|2022| George Russell| 262.0|
|2022| Lewis Hamilton| 233.0|
|2022|   Carlos Sainz| 228.0|
|2021| Max Verstappen| 388.5|
|2021| Lewis Hamilton| 385.5|
|2021|Valtteri Bottas| 219.0|
|2020| Lewis Hamilton| 347.0|
+----+---------------+------+
only showing top 20 rows



In [38]:
podiums = (
    fact.groupBy("year", "driver_name")
        .agg(sum(when(col("position").isin(1,2,3), 1).otherwise(0)).alias("podiums"))
        .orderBy(desc("year"), desc("podiums"))

)
podiums.show(n =50,truncate = False)

+----+----------------+-------+
|year|driver_name     |podiums|
+----+----------------+-------+
|2024|Max Verstappen  |14     |
|2024|Lando Norris    |13     |
|2024|Charles Leclerc |13     |
|2024|Carlos Sainz    |9      |
|2024|Oscar Piastri   |8      |
|2024|Lewis Hamilton  |5      |
|2024|Sergio Pérez    |4      |
|2024|George Russell  |4      |
|2024|Pierre Gasly    |1      |
|2024|Esteban Ocon    |1      |
|2024|Franco Colapinto|0      |
|2024|Oliver Bearman  |0      |
|2024|Valtteri Bottas |0      |
|2024|Alexander Albon |0      |
|2024|Jack Doohan     |0      |
|2024|Lance Stroll    |0      |
|2024|Logan Sargeant  |0      |
|2024|Fernando Alonso |0      |
|2024|Yuki Tsunoda    |0      |
|2024|Guanyu Zhou     |0      |
|2024|Kevin Magnussen |0      |
|2024|Liam Lawson     |0      |
|2024|Nico Hülkenberg |0      |
|2024|Daniel Ricciardo|0      |
|2023|Max Verstappen  |21     |
|2023|Sergio Pérez    |9      |
|2023|Fernando Alonso |8      |
|2023|Lando Norris    |7      |
|2023|Le

In [39]:
podiums.groupBy('driver_name').agg(sum('podiums').alias('Total_Wins')).orderBy(desc('Total_Wins')).show()

+------------------+----------+
|       driver_name|Total_Wins|
+------------------+----------+
|    Lewis Hamilton|       202|
|Michael Schumacher|       155|
|  Sebastian Vettel|       122|
|    Max Verstappen|       112|
|       Alain Prost|       106|
|   Fernando Alonso|       106|
|    Kimi Räikkönen|       103|
|      Ayrton Senna|        80|
|Rubens Barrichello|        68|
|   Valtteri Bottas|        67|
|   David Coulthard|        62|
|     Nelson Piquet|        60|
|     Nigel Mansell|        59|
|      Nico Rosberg|        57|
|        Niki Lauda|        54|
|     Mika Häkkinen|        51|
|     Jenson Button|        50|
|    Gerhard Berger|        48|
|  Carlos Reutemann|        45|
|   Charles Leclerc|        43|
+------------------+----------+
only showing top 20 rows



In [40]:
avg_position = (
    fact.groupBy("year","driver_name")
        .agg(avg("position").alias("avg_position"))
        .orderBy(desc("year"),asc("avg_position"))
)
avg_position.show(5,truncate = False)

+----+---------------+------------------+
|year|driver_name    |avg_position      |
+----+---------------+------------------+
|2024|Max Verstappen |2.9565217391304346|
|2024|Charles Leclerc|3.9130434782608696|
|2024|Lando Norris   |4.291666666666667 |
|2024|Carlos Sainz   |4.714285714285714 |
|2024|Oscar Piastri  |5.125             |
+----+---------------+------------------+
only showing top 5 rows



In [41]:
avg_position = (
    fact.groupBy("year","driver_name")
    .agg(avg(coalesce("position", lit(25))).alias("avg_position"))
    .orderBy(desc("year"), asc("avg_position"))
)
avg_position.show(5,truncate = False)

+----+---------------+-----------------+
|year|driver_name    |avg_position     |
+----+---------------+-----------------+
|2024|Max Verstappen |3.875            |
|2024|Lando Norris   |4.291666666666667|
|2024|Charles Leclerc|4.791666666666667|
|2024|Oscar Piastri  |5.125            |
|2024|Carlos Sainz   |6.478260869565218|
+----+---------------+-----------------+
only showing top 5 rows



In [42]:
fact.filter((col('year')==2024) & (col('driver_name') == 'Max Verstappen')).show()

+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+--------------+
|driverId|raceId|constructorId|grid|position|points|laps|milliseconds|statusId|year|                name|   driver_name|
+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+--------------+
|     830|  1121|            9|   1|       1|  26.0|  57|     5504742|       1|2024|  Bahrain Grand Prix|Max Verstappen|
|     830|  1122|            9|   1|       1|  25.0|  50|     4843273|       1|2024|Saudi Arabian Gra...|Max Verstappen|
|     830|  1123|            9|   1|    NULL|   0.0|   3|        NULL|      23|2024|Australian Grand ...|Max Verstappen|
|     830|  1124|            9|   1|       1|  26.0|  53|     6863566|       1|2024| Japanese Grand Prix|Max Verstappen|
|     830|  1125|            9|   1|       1|  25.0|  56|     6052554|       1|2024|  Chinese Grand Prix|Max Verstappen|
|     830|  1126|            9| 

In [43]:
fact.createOrReplaceTempView("fact")
spark.sql("""
    SELECT year,driver_name, SUM(points) AS points
    FROM fact
    GROUP BY year, driver_name
    HAVING SUM(points) > 200
    ORDER BY year desc, points DESC
    """).show(5)

+----+---------------+------+
|year|    driver_name|points|
+----+---------------+------+
|2024| Max Verstappen| 399.0|
|2024|   Lando Norris| 344.0|
|2024|Charles Leclerc| 327.0|
|2024|  Oscar Piastri| 265.0|
|2024|   Carlos Sainz| 262.0|
+----+---------------+------+
only showing top 5 rows

