In [14]:
import pandas as pd

# Read the two inputs and normalise their headers in a single step each.
# `rename(columns=str.strip)` trims surrounding whitespace from every column
# name, so later lookups such as train["year"] work even if the CSV header
# was padded.

# Monte Carlo simulation output (change the path if the file lives elsewhere).
sim = pd.read_csv(
    "../results/simulation_montecarlo_bahrain_2023_dry.csv"
).rename(columns=str.strip)

# Dataset the model was trained on.
train = pd.read_csv(
    "../data/processed/f1_over_under_multiclass_dataset.csv"
).rename(columns=str.strip)

# Show both schemas side by side for a quick visual comparison.
print("Simulation columns:", list(sim.columns))
print("Training columns:", list(train.columns))

Simulation columns: ['driverId', 'team_name', 'grid', 'p_under_model', 'p_neutral_model', 'p_over_model', 'mean_rank', 'median_rank', 'std_rank', 'prob_top3', 'prob_top5', 'prob_bottom5', 'prob_label2_MC']
Training columns: ['year', 'round', 'raceId', 'driverId', 'constructorId', 'team_name', 'grid', 'driver_standings_position_pre_race', 'driver_standings_points_pre_race', 'constructor_standings_position_pre_race', 'constructor_standings_points_pre_race', 'airtemp_mean', 'tracktemp_mean', 'humidity_mean', 'pressure_mean', 'windspeed_mean', 'rainfall_any', 'label']


In [15]:
# === A) Verify that the simulated race does NOT appear in the training dataset ===

# The simulated race is identified by hand (season + round within the season).
sim_year = 2023
sim_round = 1

# Training rows that belong to exactly that race: both conditions must hold.
same_season = train["year"].eq(sim_year)
same_round = train["round"].eq(sim_round)
exists = train.loc[same_season & same_round]

if not exists.empty:
    # At least one training row comes from the simulated race; show a sample.
    print("WARNING ⚠️  → The simulated race IS inside the training dataset!")
    print(exists.head())
else:
    print(f"OK ✓  → Race {sim_year} round {sim_round} is NOT inside the training dataset.")

     year  round  raceId  driverId  constructorId        team_name  grid  \
880  2023      1    1098       830              9  Red Bull            1   
881  2023      1    1098       815              9  Red Bull            2   
882  2023      1    1098         4            117  Aston Martin        5   
883  2023      1    1098       832              6  Ferrari             4   
884  2023      1    1098         1            131  Mercedes            7   

      driver_standings_position_pre_race   driver_standings_points_pre_race  \
880                                                                           
881                                                                           
882                                                                           
883                                                                           
884                                                                           

      constructor_standings_position_pre_race  \
880                

In [16]:
# === B) Check for forbidden (post-race) features in the simulation ===

# Column names listed here are treated as post-race information and must not
# appear among the simulation's columns.
forbidden_features = [
    "finish_position",
    "finish_text",
    "points",
    "driver_standings_position_post_race",
    "driver_standings_points_post_race",
    "delta_position",
]

# Boolean-mask the simulation's column index; the result keeps the
# simulation's own column order.
sim_intersection = sim.columns[sim.columns.isin(forbidden_features)].tolist()

if sim_intersection:
    print("WARNING ⚠️  → Simulation contains forbidden features:", sim_intersection)
else:
    print("OK ✓  → Simulation contains NO post-race leakage features.")

OK ✓  → Simulation contains NO post-race leakage features.


In [17]:
# === C) Consistency checks ===
# Compares the simulation against the 2023 slice of the training data on
# three axes: driver ids, driver -> team assignment, and grid positions.
# Each check only prints a verdict; nothing is raised.

# Filter the training dataset for season 2023
train_2023 = train[train["year"] == 2023]

# 1. Check driverId overlap
# Drivers that the simulation uses but the 2023 training rows never mention.
sim_drivers = set(sim["driverId"])
train_drivers = set(train_2023["driverId"])

missing_drivers = sim_drivers - train_drivers

if len(missing_drivers) == 0:
    print("OK ✓  → All drivers in simulation exist in the 2023 dataset.")
else:
    print("WARNING ⚠️ Missing drivers:", missing_drivers)

# 2. Check team_name consistency
# train_2023 can hold the same driver once per race, so merging on driverId
# against the raw slice would repeat every simulation row once per race and
# report each mismatch many times. Collapse to unique (driverId, team_name)
# pairs first. A driver with more than one 2023 team still produces one row
# per team, so a genuine conflict remains visible.
team_lookup = train_2023[["driverId", "team_name"]].drop_duplicates()
merged = sim.merge(team_lookup, on="driverId", how="left", suffixes=("_sim", "_train"))

# A driver with no 2023 training row gets NaN for team_name_train after the
# left merge; NaN != anything is True, so such drivers are reported here too.
inconsistent = merged[merged["team_name_sim"] != merged["team_name_train"]]

if inconsistent.empty:
    print("OK ✓  → Team names match training dataset.")
else:
    print("WARNING ⚠️ Team mismatch detected:")
    display(inconsistent[["driverId", "team_name_sim", "team_name_train"]])

# 3. Check grid values
# Flags grid positions used by the simulation that never occur in the 2023
# training rows (set membership only; not a per-driver comparison).
train_grids = set(train_2023["grid"])
sim_grids = set(sim["grid"])

invalid_grids = sim_grids - train_grids

if len(invalid_grids) == 0:
    print("OK ✓  → All grid values in simulation appear in training dataset.")
else:
    print("WARNING ⚠️ Unexpected grid numbers in simulation:", invalid_grids)

OK ✓  → All drivers in simulation exist in the 2023 dataset.
OK ✓  → Team names match training dataset.
OK ✓  → All grid values in simulation appear in training dataset.
