In [1]:
from pyspark.sql import SparkSession
# Create the notebook's Spark session, or reuse one that is already running,
# under the application name 'DAG_Demo'.
spark = (
    SparkSession.builder
    .appName('DAG_Demo')
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [3]:
from pyspark.sql.functions import *  
from pyspark.sql.types import StructType, StructField, StringType,IntegerType

# Race lookup: the join key plus the season year and the race name.
# The header row supplies column names, types are inferred, and the literal
# token \N in the files is read as null.
races = (
    spark.read
         .csv("F1_Complete_Dataset/races.csv", header=True, inferSchema=True, nullValue="\\N")
         .select("raceId", "year", "name")
)

# Per-driver, per-race result rows, trimmed to the columns used downstream.
result_columns = [
    "raceId", "driverId", "constructorId", "grid", "position",
    "points", "laps", "milliseconds", "statusId",
]
results = (
    spark.read
         .csv("F1_Complete_Dataset/results.csv", header=True, inferSchema=True, nullValue="\\N")
         .select(*result_columns)
)

# Driver lookup: forename and surname are joined with a single space into driver_name.
driver_full_name = concat_ws(" ", "forename", "surname").alias("driver_name")
drivers = (
    spark.read
         .csv("F1_Complete_Dataset/drivers.csv", header=True, inferSchema=True, nullValue="\\N")
         .select("driverId", driver_full_name)
)

# Left joins keep every result row even when a race or driver lookup has no match.
fact = (
    results.join(races, on="raceId", how="left")
           .join(drivers, on="driverId", how="left")
)
fact.show()

+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+------------------+
|driverId|raceId|constructorId|grid|position|points|laps|milliseconds|statusId|year|                name|       driver_name|
+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+------------------+
|       1|    18|            1|   1|       1|  10.0|  58|     5690616|       1|2008|Australian Grand ...|    Lewis Hamilton|
|       2|    18|            2|   5|       2|   8.0|  58|     5696094|       1|2008|Australian Grand ...|     Nick Heidfeld|
|       3|    18|            3|   7|       3|   6.0|  58|     5698779|       1|2008|Australian Grand ...|      Nico Rosberg|
|       4|    18|            4|  11|       4|   5.0|  58|     5707797|       1|2008|Australian Grand ...|   Fernando Alonso|
|       5|    18|            1|   3|       5|   4.0|  58|     5708630|       1|2008|Australian Grand ...| Heikki Kovalainen|


In [4]:
# Per-season points summary: total, mean, min, max and standard deviation of
# `points`, for seasons after MIN_YEAR_EXCLUSIVE whose total stays below
# MAX_TOTAL_POINTS_EXCLUSIVE, newest season first.
#
# NOTE: sum / avg / min / max / stddev below are the Spark column functions
# brought in by `from pyspark.sql.functions import *`, which shadows the
# Python builtins of the same names in this notebook.
MIN_YEAR_EXCLUSIVE = 2020          # keep seasons strictly after this year
MAX_TOTAL_POINTS_EXCLUSIVE = 2250  # keep seasons whose total is strictly below this

yearly_points = (
    fact.where(col("year") > MIN_YEAR_EXCLUSIVE)
        .groupBy("year")
        .agg(
            sum("points").alias("total_points"),
            avg("points").alias("avg_points"),
            min("points").alias("min_points"),
            max("points").alias("max_points"),
            stddev("points").alias("std_dev_points"),
        )
        .where(col("total_points") < MAX_TOTAL_POINTS_EXCLUSIVE)
        # Sort on the aggregated frame's own `year` column instead of `fact.year`,
        # so the ordering does not depend on a column handle taken from the
        # pre-aggregation DataFrame.
        .orderBy(col("year").desc())
)
yearly_points.show()

+----+------------+-----------------+----------+----------+-----------------+
|year|total_points|       avg_points|min_points|max_points|   std_dev_points|
+----+------------+-----------------+----------+----------+-----------------+
|2023|      2242.0|5.095454545454546|       0.0|      26.0|7.258643564664155|
|2022|      2242.0|5.095454545454546|       0.0|      26.0|7.263662942161959|
|2021|      2189.5|4.976136363636364|       0.0|      26.0|7.154414574825085|
+----+------------+-----------------+----------+----------+-----------------+



In [5]:
# Switch off adaptive query execution and whole-stage code generation for this session.
# NOTE(review): presumably done so the plan/DAG of the re-run below can be compared
# with the earlier run that used the defaults; confirm.
disabled_settings = {
    "spark.sql.adaptive.enabled": "false",
    "spark.sql.codegen.wholeStage": "false",
}
for conf_key, conf_value in disabled_settings.items():
    spark.conf.set(conf_key, conf_value)

In [6]:
from pyspark.sql.functions import *  # Import all functions as we will need count,min,max,avg,sum,round,col
from pyspark.sql.types import StructType, StructField, StringType,IntegerType

# Re-read the three CSV inputs and rebuild `fact` with the same steps as the earlier cell.
# The header row supplies column names, types are inferred, and the literal
# token \N in the files is read as null.
races = (
    spark.read
         .csv("F1_Complete_Dataset/races.csv", header=True, inferSchema=True, nullValue="\\N")
         .select("raceId", "year", "name")  # only the columns needed downstream
)

# Per-driver, per-race result rows, trimmed to the columns used downstream.
result_columns = [
    "raceId", "driverId", "constructorId", "grid", "position",
    "points", "laps", "milliseconds", "statusId",
]
results = (
    spark.read
         .csv("F1_Complete_Dataset/results.csv", header=True, inferSchema=True, nullValue="\\N")
         .select(*result_columns)
)

# Driver lookup: forename and surname are joined with a single space into driver_name.
driver_full_name = concat_ws(" ", "forename", "surname").alias("driver_name")
drivers = (
    spark.read
         .csv("F1_Complete_Dataset/drivers.csv", header=True, inferSchema=True, nullValue="\\N")
         .select("driverId", driver_full_name)
)

# Enrich each result row with the race's year/name and the driver's full name.
# Left joins keep result rows that have no matching race or driver.
fact = (
    results.join(races, on="raceId", how="left")
           .join(drivers, on="driverId", how="left")
)
fact.show()

+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+------------------+
|driverId|raceId|constructorId|grid|position|points|laps|milliseconds|statusId|year|                name|       driver_name|
+--------+------+-------------+----+--------+------+----+------------+--------+----+--------------------+------------------+
|       1|    18|            1|   1|       1|  10.0|  58|     5690616|       1|2008|Australian Grand ...|    Lewis Hamilton|
|       2|    18|            2|   5|       2|   8.0|  58|     5696094|       1|2008|Australian Grand ...|     Nick Heidfeld|
|       3|    18|            3|   7|       3|   6.0|  58|     5698779|       1|2008|Australian Grand ...|      Nico Rosberg|
|       4|    18|            4|  11|       4|   5.0|  58|     5707797|       1|2008|Australian Grand ...|   Fernando Alonso|
|       5|    18|            1|   3|       5|   4.0|  58|     5708630|       1|2008|Australian Grand ...| Heikki Kovalainen|


In [7]:
# Per-season points summary: total, mean, min, max and standard deviation of
# `points`, for seasons after MIN_YEAR_EXCLUSIVE whose total stays below
# MAX_TOTAL_POINTS_EXCLUSIVE, newest season first.
#
# NOTE: sum / avg / min / max / stddev below are the Spark column functions
# brought in by `from pyspark.sql.functions import *`, which shadows the
# Python builtins of the same names in this notebook.
MIN_YEAR_EXCLUSIVE = 2020          # keep seasons strictly after this year
MAX_TOTAL_POINTS_EXCLUSIVE = 2250  # keep seasons whose total is strictly below this

yearly_points = (
    fact.where(col("year") > MIN_YEAR_EXCLUSIVE)
        .groupBy("year")
        .agg(
            sum("points").alias("total_points"),
            avg("points").alias("avg_points"),
            min("points").alias("min_points"),
            max("points").alias("max_points"),
            stddev("points").alias("std_dev_points"),
        )
        .where(col("total_points") < MAX_TOTAL_POINTS_EXCLUSIVE)
        # Sort on the aggregated frame's own `year` column instead of `fact.year`,
        # so the ordering does not depend on a column handle taken from the
        # pre-aggregation DataFrame.
        .orderBy(col("year").desc())
)
yearly_points.show()

+----+------------+-----------------+----------+----------+-----------------+
|year|total_points|       avg_points|min_points|max_points|   std_dev_points|
+----+------------+-----------------+----------+----------+-----------------+
|2023|      2242.0|5.095454545454546|       0.0|      26.0|7.258643564664155|
|2022|      2242.0|5.095454545454546|       0.0|      26.0|7.263662942161959|
|2021|      2189.5|4.976136363636364|       0.0|      26.0|7.154414574825085|
+----+------------+-----------------+----------+----------+-----------------+

