In [65]:
import time
import requests
import pandas as pd

# Use Jolpica F1 API (Ergast-compatible endpoint)
BASE = "https://api.jolpi.ca/ergast/f1"
SEASONS = [2022, 2023, 2024, 2025]

In [66]:
def get_json(url: str, params=None, sleep_s: float = 0.35, max_retries: int = 8) -> dict:
    """Fetch JSON with retry/backoff for rate limiting and errors"""
    backoff = 1.0
    for attempt in range(1, max_retries + 1):
        r = requests.get(
            url,
            params=params,
            timeout=30,
            headers={"Accept": "application/json", "User-Agent": "F1-Podium-Predictor/1.0"}
        )

        # Handle rate limiting
        if r.status_code == 429:
            retry_after = r.headers.get("Retry-After")
            wait_s = float(retry_after) if retry_after and retry_after.isdigit() else backoff
            print(f"429 rate limit: waiting {wait_s:.1f}s (attempt {attempt}/{max_retries}) -> {r.url}")
            time.sleep(wait_s)
            backoff = min(backoff * 2, 30)
            continue

        r.raise_for_status()

        # Validate JSON response
        ctype = (r.headers.get("Content-Type") or "").lower()
        if "json" not in ctype:
            preview = r.text[:200].replace("\n", " ")
            raise ValueError(f"Non-JSON response from {r.url} | Content-Type={ctype} | Preview={preview}")

        time.sleep(sleep_s)
        return r.json()

    raise RuntimeError(f"Failed after {max_retries} retries (last status {r.status_code}) for {url}")

In [3]:
def get_rounds_in_season(season: int) -> list[int]:
    """Returns list of round numbers in a season"""
    url = f"{BASE}/{season}.json"
    data = get_json(url)
    races = data["MRData"]["RaceTable"].get("Races", [])
    return [int(r["round"]) for r in races]

In [4]:
def get_race_results(season: int, round_no: int) -> list[dict]:
    """Returns one row per driver for a race"""
    url = f"{BASE}/{season}/{round_no}/results.json"
    data = get_json(url)
    races = data["MRData"]["RaceTable"].get("Races", [])
    if not races:
        return []
    race = races[0]
    circuit = race["Circuit"]
    results = race.get("Results", [])

    rows = []
    for res in results:
        driver = res["Driver"]
        constructor = res["Constructor"]

        # Handle missing or non-numeric finish positions
        try:
            finish_pos = int(res.get("position"))
        except (TypeError, ValueError):
            finish_pos = None

        rows.append({
            "season": int(season),
            "round": int(round_no),
            "raceName": race.get("raceName"),
            "date": race.get("date"),
            "circuitId": circuit.get("circuitId"),
            "circuitName": circuit.get("circuitName"),
            "driverId": driver.get("driverId"),
            "driverCode": driver.get("code"),
            "driverGivenName": driver.get("givenName"),
            "driverFamilyName": driver.get("familyName"),
            "constructorId": constructor.get("constructorId"),
            "constructorName": constructor.get("name"),
            "grid": int(res["grid"]) if res.get("grid") not in (None, "") else None,
            "finish_position": finish_pos,
            "points": float(res["points"]) if res.get("points") not in (None, "") else None,
            "laps": int(res["laps"]) if res.get("laps") not in (None, "") else None,
            "status": res.get("status"),
        })
    return rows

In [5]:
def get_qualifying(season: int, round_no: int) -> dict:
    """Returns dict mapping driverId to qualifying position"""
    url = f"{BASE}/{season}/{round_no}/qualifying.json"
    data = get_json(url)
    races = data["MRData"]["RaceTable"].get("Races", [])
    if not races:
        return {}
    q = races[0].get("QualifyingResults", [])
    out = {}
    for row in q:
        try:
            out[row["Driver"]["driverId"]] = int(row["position"])
        except Exception:
            pass
    return out

In [6]:
all_rows = []

for season in SEASONS:
    rounds = get_rounds_in_season(season)
    print(f"Season {season} rounds found: {len(rounds)}")

    for rnd in rounds:
        rows = get_race_results(season, rnd)
        if not rows:
            continue

        # Get qualifying results (may be missing for some races)
        try:
            q_map = get_qualifying(season, rnd)
        except Exception as e:
            print(f"Qualifying unavailable for {season} round {rnd}: {e}")
            q_map = {}

        # Add qualifying position and podium label
        for r in rows:
            r["qual_position"] = q_map.get(r["driverId"])
            r["podium"] = 1 if (r["finish_position"] is not None and r["finish_position"] <= 3) else 0

        all_rows.extend(rows)

if not all_rows:
    raise RuntimeError("No rows collected. Check internet access and that the API is reachable.")

Season 2022 rounds found: 22
Season 2023 rounds found: 22
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2023/21/qualifying.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2023/22/results.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2023/22/qualifying.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024.json
429 rate limit: waiting 2.0s (attempt 2/8) -> https://api.jolpi.ca/ergast/f1/2024.json
Season 2024 rounds found: 24
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/1/qualifying.json
429 rate limit: waiting 2.0s (attempt 2/8) -> https://api.jolpi.ca/ergast/f1/2024/1/qualifying.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/2/qualifying.json
429 rate limit: waiting 2.0s (attempt 2/8) -> https://api.jolpi.ca/ergast/f1/2024/2/qualifying.json
429 rate limit: waiting 1.0s (attempt 1

In [7]:
# Build dataframe and handle missing data
df = pd.DataFrame(all_rows)

df = df.dropna(subset=["grid", "finish_position"]).copy()
df["qual_position"] = df["qual_position"].fillna(df["grid"])

# Save dataset
out_path = "jolpica_podium_dataset_2022_2025.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path, "rows:", len(df))

Saved: jolpica_podium_dataset_2022_2025.csv rows: 1838


In [8]:
# Load and inspect dataset
import pandas as pd

df = pd.read_csv("jolpica_podium_dataset_2022_2025.csv")

print(df.shape)
print(df.dtypes)
df.head()

(1838, 19)
season                int64
round                 int64
raceName             object
date                 object
circuitId            object
circuitName          object
driverId             object
driverCode           object
driverGivenName      object
driverFamilyName     object
constructorId        object
constructorName      object
grid                  int64
finish_position       int64
points              float64
laps                  int64
status               object
qual_position       float64
podium                int64
dtype: object


Unnamed: 0,season,round,raceName,date,circuitId,circuitName,driverId,driverCode,driverGivenName,driverFamilyName,constructorId,constructorName,grid,finish_position,points,laps,status,qual_position,podium
0,2022,1,Bahrain Grand Prix,2022-03-20,bahrain,Bahrain International Circuit,leclerc,LEC,Charles,Leclerc,ferrari,Ferrari,1,1,26.0,57,Finished,1.0,1
1,2022,1,Bahrain Grand Prix,2022-03-20,bahrain,Bahrain International Circuit,sainz,SAI,Carlos,Sainz,ferrari,Ferrari,3,2,18.0,57,Finished,3.0,1
2,2022,1,Bahrain Grand Prix,2022-03-20,bahrain,Bahrain International Circuit,hamilton,HAM,Lewis,Hamilton,mercedes,Mercedes,5,3,15.0,57,Finished,5.0,1
3,2022,1,Bahrain Grand Prix,2022-03-20,bahrain,Bahrain International Circuit,russell,RUS,George,Russell,mercedes,Mercedes,9,4,12.0,57,Finished,9.0,0
4,2022,1,Bahrain Grand Prix,2022-03-20,bahrain,Bahrain International Circuit,kevin_magnussen,MAG,Kevin,Magnussen,haas,Haas F1 Team,7,5,10.0,57,Finished,7.0,0


In [9]:
# Check for missing values
print(df.isna().sum().sort_values(ascending=False))

season              0
round               0
raceName            0
date                0
circuitId           0
circuitName         0
driverId            0
driverCode          0
driverGivenName     0
driverFamilyName    0
constructorId       0
constructorName     0
grid                0
finish_position     0
points              0
laps                0
status              0
qual_position       0
podium              0
dtype: int64


In [10]:
# Fill missing qualifying positions with grid position
df["qual_position"] = df["qual_position"].fillna(df["grid"])

In [11]:
# Convert numeric columns to proper types
numeric_cols = ["season", "round", "grid", "qual_position", "finish_position", "points", "laps"]

for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

In [12]:
# Filter out invalid positions (DNS, DNF, disqualifications)
df = df[df["grid"].between(1, 20)]
df = df[df["finish_position"].between(1, 20)]

In [13]:
# Check class balance of target variable
print(df["podium"].value_counts(normalize=True))

podium
0    0.848601
1    0.151399
Name: proportion, dtype: float64


In [14]:
# Select features for modeling
model_df = df[[
    "season",
    "round",
    "circuitId",
    "driverId",
    "constructorId",
    "grid",
    "qual_position",
    "podium"
]].copy()

In [15]:
# Save cleaned dataset
model_df.to_csv("clean_podium_dataset.csv", index=False)
print("Clean dataset saved.")

Clean dataset saved.


In [16]:
# Load datasets for feature engineering
import pandas as pd

# Clean dataset (features + target)
df = pd.read_csv("clean_podium_dataset.csv").sort_values(["season","round"])

# Full dataset (has points, podium, etc.)
df_full = pd.read_csv("jolpica_podium_dataset_2022_2025.csv").sort_values(["season","round"])

In [17]:
# Feature: Driver's average points in last 3 races (no leakage)
df_full["driver_points_last3"] = (
    df_full.groupby("driverId")["points"]
      .shift(1)
      .rolling(3, min_periods=1)
      .mean()
)

In [18]:
# Feature: Constructor's average points in last 3 races (no leakage)
df_full["constructor_points_last3"] = (
    df_full.groupby("constructorId")["points"]
      .shift(1)
      .rolling(3, min_periods=1)
      .mean()
)

In [19]:
# Feature: Driver's average podium finishes in last 3 races (no leakage)
df_full["driver_podiums_last3"] = (
    df_full.groupby("driverId")["podium"]
      .shift(1)
      .rolling(3, min_periods=1)
      .mean()
)

In [20]:
# Extract engineered features for merging
engineered = df_full[[
    "season","round","driverId","constructorId",
    "driver_points_last3","constructor_points_last3","driver_podiums_last3"
]].copy()

In [21]:
# Merge engineered features with clean dataset
df2 = df.merge(engineered, on=["season","round","driverId","constructorId"], how="left")

In [22]:
# Fill missing rolling values (early season/first races)
for col in ["driver_points_last3","constructor_points_last3","driver_podiums_last3"]:
    df2[col] = df2[col].fillna(0)

In [23]:
# Build final feature set and save
feature_df = df2[[
    "season",
    "round",
    "circuitId",
    "driverId",
    "constructorId",
    "grid",
    "qual_position",
    "driver_points_last3",
    "constructor_points_last3",
    "driver_podiums_last3",
    "podium"
]].copy()

feature_df.to_csv("feature_engineered_dataset.csv", index=False)
print("Saved feature_engineered_dataset.csv | rows:", len(feature_df), "| cols:", feature_df.shape[1])
feature_df.head()

Saved feature_engineered_dataset.csv | rows: 1823 | cols: 11


Unnamed: 0,season,round,circuitId,driverId,constructorId,grid,qual_position,driver_points_last3,constructor_points_last3,driver_podiums_last3,podium
0,2022,1,bahrain,leclerc,ferrari,1,1.0,0.0,0.0,0.0,1
1,2022,1,bahrain,sainz,ferrari,3,3.0,0.0,26.0,0.0,1
2,2022,1,bahrain,hamilton,mercedes,5,5.0,0.0,26.0,0.0,1
3,2022,1,bahrain,russell,mercedes,9,9.0,0.0,20.5,0.0,0
4,2022,1,bahrain,kevin_magnussen,haas,7,7.0,0.0,15.0,0.0,0


In [24]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [25]:
# Train/test split: 2022-2024 for training, 2025 for testing
df = pd.read_csv("jolpica_podium_dataset_2022_2025.csv")

train_df = df[df["season"].isin([2022, 2023, 2024])].copy()
test_df  = df[df["season"] == 2025].copy()

features = ["season","round","circuitId","constructorId","driverId","grid","qual_position"]
X_train, y_train = train_df[features], train_df["podium"]
X_test,  y_test  = test_df[features],  test_df["podium"]

In [26]:
# Separate categorical and numeric features
cat_cols = ["circuitId","constructorId","driverId"]
num_cols = ["season","round","grid","qual_position"]

# One-hot encode categoricals, pass through numerics
prep = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols),
])

# Build pipeline with preprocessing and logistic regression
model = Pipeline([
    ("prep", prep),
    ("clf", LogisticRegression(max_iter=3000, class_weight="balanced"))
])

In [28]:
# Train and evaluate on 2025 season
model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print("ROC AUC:", roc_auc_score(y_test, proba))
print("Confusion matrix:\n", confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=3))

ROC AUC: 0.9266653016653017
Confusion matrix:
 [[339  68]
 [  6  66]]
              precision    recall  f1-score   support

           0      0.983     0.833     0.902       407
           1      0.493     0.917     0.641        72

    accuracy                          0.846       479
   macro avg      0.738     0.875     0.771       479
weighted avg      0.909     0.846     0.862       479



In [53]:
# Average finishing position over last 3 races
df_full["driver_finishpos_last3"] = (
    df_full.groupby("driverId")["finish_position"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

In [54]:
# Constructor podium rate over last 3 races
df_full["constructor_podiums_last3"] = (
    df_full.groupby("constructorId")["podium"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)


In [55]:
# Grid position inverse (1/grid) to capture non-linear effect of starting position
df_full["grid_inverse"] = 1 / df_full["grid"]

In [56]:
# Fill NaN values in new features (early season/first races)
new_cols = [
    "driver_finishpos_last3",
    "constructor_podiums_last3",
    "grid_inverse"
]

for col in new_cols:
    df_full[col] = df_full[col].fillna(0)

In [57]:
df["driver_finishpos_last3"] = (
    df.groupby("driverId")["finish_position"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df["driver_finishpos_last3"] = df["driver_finishpos_last3"].fillna(0)


In [58]:
# df_full must include: season, round, driverId, constructorId, points, finish_position, podium
df_full = df_full.sort_values(["season", "round"])

# Rolling averages (group-safe) - exclude current race with shift(1)
df_full["driver_points_last3"] = (
    df_full.groupby("driverId")["points"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["constructor_points_last3"] = (
    df_full.groupby("constructorId")["points"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["driver_podiums_last3"] = (
    df_full.groupby("driverId")["podium"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

# âœ… Missing features you asked about
df_full["driver_finishpos_last3"] = (
    df_full.groupby("driverId")["finish_position"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["constructor_podiums_last3"] = (
    df_full.groupby("constructorId")["podium"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["grid_inverse"] = 1 / df_full["grid"]


In [67]:
# Extract the complete feature set from df_full (which has all engineered features)
df = df_full[[
    "season","round","circuitId","driverId","constructorId",
    "grid","qual_position",
    "driver_points_last3","constructor_points_last3","driver_podiums_last3",
    "driver_finishpos_last3","constructor_podiums_last3","grid_inverse",
    "podium"
]].copy()

# Fill NaNs (early races etc.)
fill_cols = [
    "driver_points_last3","constructor_points_last3","driver_podiums_last3",
    "driver_finishpos_last3","constructor_podiums_last3","grid_inverse"
]
df[fill_cols] = df[fill_cols].fillna(0)

print("df shape:", df.shape)
print("df columns:", df.columns.tolist())


df shape: (1838, 14)
df columns: ['season', 'round', 'circuitId', 'driverId', 'constructorId', 'grid', 'qual_position', 'driver_points_last3', 'constructor_points_last3', 'driver_podiums_last3', 'driver_finishpos_last3', 'constructor_podiums_last3', 'grid_inverse', 'podium']


In [68]:
feature_df = df[[
    "season","round","circuitId","driverId","constructorId",
    "grid","qual_position",
    "driver_points_last3","constructor_points_last3","driver_podiums_last3",
    "driver_finishpos_last3","constructor_podiums_last3","grid_inverse",
    "podium"
]].copy()

print("feature_df columns:", feature_df.columns.tolist())
feature_df.to_csv("feature_engineered_dataset.csv", index=False)
print("Saved feature_engineered_dataset.csv | shape:", feature_df.shape)


feature_df columns: ['season', 'round', 'circuitId', 'driverId', 'constructorId', 'grid', 'qual_position', 'driver_points_last3', 'constructor_points_last3', 'driver_podiums_last3', 'driver_finishpos_last3', 'constructor_podiums_last3', 'grid_inverse', 'podium']
Saved feature_engineered_dataset.csv | shape: (1838, 14)


In [69]:
expected = {
 "driver_points_last3","constructor_points_last3","driver_podiums_last3",
 "driver_finishpos_last3","constructor_podiums_last3","grid_inverse"
}

print("Missing:", expected - set(df.columns))
print("Columns containing 'last3':", [c for c in df.columns if "last3" in c])


Missing: set()
Columns containing 'last3': ['driver_points_last3', 'constructor_points_last3', 'driver_podiums_last3', 'driver_finishpos_last3', 'constructor_podiums_last3']


In [70]:
# Select features for final model
feature_df = df_full[[
    "season",
    "round",
    "circuitId",
    "driverId",
    "constructorId",
    "grid",
    "qual_position",
    "driver_points_last3",
    "constructor_points_last3",
    "driver_podiums_last3",
    "driver_finishpos_last3",
    "constructor_podiums_last3",
    "grid_inverse",
    "podium"
]].copy()

In [71]:
print([c for c in df_full.columns if "finish" in c])

['finish_position', 'driver_finishpos_last3']


In [72]:
print(df.columns)
print(df_full.columns)

Index(['season', 'round', 'circuitId', 'driverId', 'constructorId', 'grid',
       'qual_position', 'driver_points_last3', 'constructor_points_last3',
       'driver_podiums_last3', 'driver_finishpos_last3',
       'constructor_podiums_last3', 'grid_inverse', 'podium'],
      dtype='object')
Index(['season', 'round', 'raceName', 'date', 'circuitId', 'circuitName',
       'driverId', 'driverCode', 'driverGivenName', 'driverFamilyName',
       'constructorId', 'constructorName', 'grid', 'finish_position', 'points',
       'laps', 'status', 'qual_position', 'podium', 'driver_points_last3',
       'constructor_points_last3', 'driver_podiums_last3',
       'driver_finishpos_last3', 'constructor_podiums_last3', 'grid_inverse'],
      dtype='object')
