In [None]:
import os

import pandas as pd

# Directory holding the Formula One CSV exports. Change this single value to
# point the notebook at a different copy of the data set.
DATA_DIR = r"C:\Users\iamky\10_Learning_Python\formula_one_study\data\formula-one-data"

# Full paths to the three tables used below (also reused by later cells).
driver_filepath = os.path.join(DATA_DIR, "drivers.csv")
races_filepath = os.path.join(DATA_DIR, "races.csv")
pitstop_filepath = os.path.join(DATA_DIR, "pit_stops.csv")

df_driver = pd.read_csv(driver_filepath)
df_races = pd.read_csv(races_filepath)
df_pitstop = pd.read_csv(pitstop_filepath)

# Keep only the driver key and short code; no driver is filtered out.
df_driver = df_driver[["driverId", "code"]]

# Keep only British Grand Prix rows from the races table, then trim it to the
# columns needed for the join and the report.
df_races = df_races[df_races["name"] == "British Grand Prix"]
df_races = df_races[["raceId", "year", "circuitId", "name"]]

# Join tables: attach the driver code to every pit stop, then attach the race
# details. Both joins are inner joins, so pit stops from races other than the
# British Grand Prix are discarded by the second merge. `validate` makes
# pandas raise if driverId / raceId is not unique on the left-hand side.
merged = pd.merge(df_driver, df_pitstop, on='driverId', how='inner', validate='one_to_many')
merged = pd.merge(df_races, merged, on='raceId', how='inner', validate='one_to_many')

# Drop pit stops longer than 5 minutes (300000 milliseconds); stops of exactly
# 300000 ms are kept.
merged = merged[merged['milliseconds'] <= 300000]

# Add converted seconds column, 3 decimals of precision
merged['seconds'] = (merged['milliseconds'] / 1000).round(3)

# Show the joined table without the key columns and the `time` column.
print(merged.drop(columns=["raceId", "circuitId", "driverId", "time"]))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Column of `merged` whose distribution is plotted.
varname = 'seconds'

# Violin plot of the chosen column, with its mean reported in the title.
sns.violinplot(y=merged[varname])
mean_score = merged[varname].mean()
plt.title(f'Violin Plot {varname} (Mean = {mean_score:.2f})')

# Visible y-axis window and tick spacing.
y_min = 15
y_max = 38
y_steps = 1
# The axis extends one unit past y_max, while the ticks below stop at y_max
# (range() excludes its upper bound of y_max+1).
plt.ylim(y_min, y_max+1)
plt.yticks(range(y_min, y_max+1, y_steps))

# Call show(): a bare `plt.show` only references the function, so the figure
# is never explicitly rendered and the cell echoes the function's repr.
plt.show()

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
# Summary of the `seconds` column of `merged`: sample mean, standard error of
# the mean, and a two-sided confidence interval based on the t-distribution.

confidence = 0.95               # 95% confidence level
n = len(merged['seconds'])      # number of rows entering the estimate

mean_val = merged["seconds"].mean()
sem_val = stats.sem(merged['seconds'])

# Half-width of the interval: SEM scaled by the upper t quantile with
# n-1 degrees of freedom.
h = sem_val * stats.t.ppf((1+confidence) / 2, n-1)

ci_lower, ci_upper = mean_val - h, mean_val + h

print(
    f"Mean: {mean_val:.2f}",
    f"SEM: {sem_val:.2f}",
    f"95% CI: ({ci_lower:.2f}, {ci_upper:.2f})",
    sep="\n",
)


In [None]:
# Pearson correlation between the `year` and `seconds` columns, computed on
# the individual rows of `merged` (no aggregation first).
correlation_raw = merged['year'].corr(merged['seconds'], method='pearson')
print("Correlation (raw data): ", correlation_raw)

# Mean of `seconds` for each distinct `year`.
avg_times = merged['seconds'].groupby(merged['year']).mean()
print(avg_times)



In [None]:
# independent t-test
import pandas as pd
from scipy import stats

# The two Grand Prix whose `seconds` values are compared (group 1 vs group 2).
gp1, gp2 = "Malaysian Grand Prix", "Japanese Grand Prix"

# Reload the raw tables and trim drivers / races to the columns used below.
# Races are not filtered by name here, so every Grand Prix is kept.
df_driver = pd.read_csv(driver_filepath)[["driverId", "code"]]
df_races = pd.read_csv(races_filepath)[["raceId", "year", "circuitId", "name"]]
df_pitstop = pd.read_csv(pitstop_filepath)

# Build the analysis table in one chain:
#   1. inner-join driver codes onto pit stops, then race details onto that;
#   2. keep rows whose `milliseconds` is at most 300000 (5 minutes);
#   3. add `seconds` = milliseconds / 1000, rounded to 3 decimals.
# NOTE: this rebinds `merged`, replacing the British-Grand-Prix-only frame
# built in the first cell with one covering all races.
merged = (
    df_races
    .merge(
        df_driver.merge(df_pitstop, on='driverId', how='inner', validate='one_to_many'),
        on='raceId', how='inner', validate='one_to_many',
    )
    .loc[lambda frame: frame['milliseconds'] <= 300000]
    .assign(seconds=lambda frame: (frame['milliseconds'] / 1000).round(3))
)

# `seconds` values for each of the two Grand Prix.
group1 = merged.loc[merged['name'].eq(gp1), 'seconds']
group2 = merged.loc[merged['name'].eq(gp2), 'seconds']

# Independent two-sample t-test with pooled variance (equal_var=True).
# NOTE(review): the all-pairs cell further down uses equal_var=False (Welch);
# confirm that the pooled-variance test is intended here.
t_stat, p_two_tail_val = stats.ttest_ind(group1, group2, equal_var=True)

# One-tailed p-value for H1: mean(group1) < mean(group2).
# A negative t-statistic lies in the direction of H1, so half of the
# two-tailed p-value applies; otherwise the complement of that half is used.
p_one_tailed = p_two_tail_val / 2 if t_stat < 0 else 1 - (p_two_tail_val / 2)

print(f"Comparing {gp1} vs {gp2}")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value (two-tailed): {p_two_tail_val:.4f}")
print(f"P-value (one-tailed): {p_one_tailed:.4f}")

In [None]:
# Large ttest ind for all GP pairs
import pandas as pd
from scipy import stats
import itertools
import numpy as np
from statsmodels.stats.multitest import multipletests

# Welch t-test on `seconds` for every pair of Grand Prix names in `merged`,
# followed by a Benjamini-Hochberg FDR adjustment and a symmetric matrix of
# the adjusted p-values.

# Group by Grand Prix name
grouped = merged.groupby("name")["seconds"]
groups = list(grouped.groups.keys())
print(groups)

# Extract each group's samples once instead of on every pair in the loop.
samples_by_gp = {g: grouped.get_group(g) for g in groups}

pairs = []
pvals = []

# Raw two-sided p-value for each unordered pair (Welch: equal_var=False)
for g1, g2 in itertools.combinations(groups, 2):
    _, p_val = stats.ttest_ind(samples_by_gp[g1], samples_by_gp[g2], equal_var=False)
    pairs.append((g1, g2))
    pvals.append(p_val)

# Adjusted p-value with FDR method (element [1] holds the corrected p-values)
adjusted = multipletests(pvals, method='fdr_bh')[1]

# Initialize the adjusted p-value matrix, one row/column per Grand Prix
adj_matrix = pd.DataFrame(np.nan, index=groups, columns=groups)

# Fill both triangles so the matrix is symmetric
for (g1, g2), adj_p in zip(pairs, adjusted):
    adj_matrix.loc[g1, g2] = adj_p
    adj_matrix.loc[g2, g1] = adj_p

# Set diagonal to 1 (self-comparison). Assign through .loc rather than
# mutating `adj_matrix.values` in place: that only works while `.values`
# is a writable view, and it is read-only under pandas Copy-on-Write.
for g in groups:
    adj_matrix.loc[g, g] = 1.0

print(adj_matrix)