In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Read the three gold tables used in this notebook; the tuple order below
# matches the order of the names being assigned.
race_driver_df, driver_season_df, constructor_season_df = map(
    spark.table,
    (
        "workspace.f1.f1_gold_race_driver_features",
        "workspace.f1.f1_gold_driver_season_stats",
        "workspace.f1.f1_gold_constructor_season_stats",
    ),
)

# Preview the first five rows of each table.
for preview_df in (race_driver_df, driver_season_df, constructor_season_df):
    display(preview_df.limit(5))

In [0]:
# Keep only rows where both the finishing position and the grid value are present.
# Optional for a first analysis: additionally restrict to rows where
# statusDescription == "Finished".
perf_df = race_driver_df.where(
    F.col("race_finish_position").isNotNull() & F.col("grid").isNotNull()
)

display(perf_df.limit(10))

In [0]:
# Numeric feature columns used by the summary statistics and correlation cells.
# The groups only organise the names for the reader; the flattened list keeps
# the columns in exactly this order.
numeric_cols = [
    column_name
    for column_group in (
        ("grid", "race_finish_position", "race_points"),        # grid / race result
        ("best_lap_ms", "avg_lap_ms", "lap_count_recorded"),    # laps
        ("pit_stop_count", "avg_pit_stop_ms", "total_pit_stop_ms"),  # pit stops
        ("quali_best_position",),                               # qualifying
        ("sprint_finish_position", "sprint_points"),            # sprint
        ("champ_position_after_race", "prev_champ_position"),   # championship position
    )
    for column_name in column_group
]

# Summary statistics for every column in numeric_cols. Rendered with display(),
# like the other cells in this notebook, because .show() prints a plain-text
# table that is hard to read across this many columns and truncates long
# cell values.
display(perf_df.select(numeric_cols).describe())

## Problem 1: What factors most strongly influence a driver's race performance in Formula 1?

In [0]:
import pandas as pd

# Work on a seeded 20% sample (drawn without replacement) of the numeric
# columns so the conversion to pandas stays feasible even if the table is huge.
perf_numeric_sdf = perf_df.select(numeric_cols)
perf_sample_sdf = perf_numeric_sdf.sample(
    withReplacement=False,
    fraction=0.2,
    seed=42,
)
perf_sample_pd = perf_sample_sdf.toPandas()

# Pairwise correlations between the sampled numeric columns; the bare
# expression on the last line renders the matrix as the cell output.
corr_matrix = perf_sample_pd.corr(numeric_only=True)
corr_matrix

In [0]:
import matplotlib.pyplot as plt

# Heatmap of the correlation matrix computed in the previous cell.
fig, ax = plt.subplots(figsize=(10, 8))

# Pin the colour scale to the full correlation range [-1, 1] and use a diverging
# colormap so that zero sits at the neutral midpoint and positive/negative
# correlations are visually distinct. Without vmin/vmax the scale is stretched
# to whatever min/max the data happens to have, which makes colours
# incomparable between runs and hides the sign of the correlation.
heatmap = ax.imshow(
    corr_matrix,
    interpolation="nearest",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)

# One tick per feature on both axes, labelled with the column names.
tick_positions = list(range(len(corr_matrix.columns)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr_matrix.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr_matrix.columns)

fig.colorbar(heatmap, ax=ax, label="Correlation coefficient")
ax.set_title("Correlation Matrix - Race Performance Features")
fig.tight_layout()
plt.show()

Starting grid position shows the strongest correlation with finish position, followed by qualifying position and average lap time.