In [None]:
from pathlib import Path

import pandas as pd

# Directory holding the Formula One CSV files; edit this single line to
# point the notebook at a different copy of the data.
DATA_DIR = Path(r"C:\Users\iamky\10_Learning_Python\formula_one_study\data\formula-one-data")

driver_filepath = DATA_DIR / "drivers.csv"
races_filepath = DATA_DIR / "races.csv"
pitstop_filepath = DATA_DIR / "pit_stops.csv"

# Pit stops longer than this many milliseconds are dropped below.
# 60_000 ms = 1 minute; use 300_000 (5 minutes) for a looser cut-off.
MAX_PITSTOP_MS = 60_000

df_driver = pd.read_csv(driver_filepath)
df_races = pd.read_csv(races_filepath)
df_pitstop = pd.read_csv(pitstop_filepath)

# Reduce df_driver to the join key and the driver code
df_driver = df_driver[["driverId", "code"]]

# Keep only the British Grand Prix rows of df_races and the columns used later
df_races = df_races.loc[
    df_races["name"] == "British Grand Prix",
    ["raceId", "year", "circuitId", "name"],
]

# Join tables: attach the driver code to every pit stop, then keep only the
# pit stops whose raceId is one of the selected races. validate= makes pandas
# raise if driverId / raceId is not unique on the left-hand side.
merged = pd.merge(df_driver, df_pitstop, on="driverId", how="inner", validate="one_to_many")
merged = pd.merge(df_races, merged, on="raceId", how="inner", validate="one_to_many")

# Drop pit stops longer than MAX_PITSTOP_MS, then add the duration converted
# to seconds (3 decimals of precision). .assign() builds a new frame, so the
# new column is never written onto a filtered slice (no SettingWithCopyWarning).
merged = (
    merged
    .loc[merged["milliseconds"] <= MAX_PITSTOP_MS]
    .assign(seconds=lambda stops: (stops["milliseconds"] / 1000).round(3))
)

print(merged.drop(columns=["raceId", "circuitId", "driverId", "time"]))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# One figure with three stacked panels, each drawn from merged['seconds'].
fig, (ax_kde, ax_hist, ax_cdf) = plt.subplots(3, 1, figsize=(12, 8))

# Panel 1: filled kernel density estimate
sns.kdeplot(data=merged, x='seconds', fill=True, ax=ax_kde)
ax_kde.set_title('Pitstop in Seconds KDE Plot')

# Panel 2: density-normalised histogram with a KDE curve overlaid
sns.histplot(data=merged, x='seconds', kde=True, stat='density', alpha=0.7, ax=ax_hist)
ax_hist.set_title('Pitstop in Seconds Histogram and KDE')

# Panel 3: cumulative histogram drawn as an unfilled red step line
sns.histplot(
    data=merged,
    x='seconds',
    cumulative=True,
    stat='density',
    element='step',
    fill=False,
    color='red',
    linewidth=2,
    ax=ax_cdf,
)
ax_cdf.set_title('Pitstop in Seconds Cumulative Density')
ax_cdf.set_ylabel('Cumulative Probability')

fig.tight_layout()
plt.show()


In [None]:
# Calculate KDE probability given a range.
import pandas as pd
from scipy import stats
from scipy.integrate import quad

# Bounds of the interval whose probability is evaluated; same unit as
# merged['seconds'], the column the density is fitted to.
lower, upper = 29, 35

# Gaussian kernel density estimate fitted to the 'seconds' column.
seconds_column = merged['seconds']
kde = stats.gaussian_kde(seconds_column)

def probability_range(kde_obj, lower, upper):
    """Return the integral of the density ``kde_obj`` from ``lower`` to ``upper``.

    Parameters
    ----------
    kde_obj : callable
        Density to integrate. If the object provides ``integrate_box_1d``
        (as ``scipy.stats.gaussian_kde`` does) that method is used; any other
        callable is integrated numerically with ``scipy.integrate.quad``.
    lower, upper : float
        Integration limits.

    Returns
    -------
    float
        The probability mass between ``lower`` and ``upper``.
    """
    box_integral = getattr(kde_obj, "integrate_box_1d", None)
    if box_integral is not None:
        # Closed-form integral of the Gaussian mixture: no quadrature error,
        # and quad never has to turn the 1-element arrays that a KDE returns
        # for scalar inputs into floats.
        return float(box_integral(lower, upper))

    # Generic callable: fall back to adaptive numerical quadrature.
    prob, _ = quad(kde_obj, lower, upper)
    return prob

# Integrate the fitted density over [lower, upper] and print the result.
prob = probability_range(kde_obj=kde, lower=lower, upper=upper)
print(f"P({lower:.2f} <= X <= {upper:.2f}) = {prob:.4f}")