In [1]:
import time
import requests
import pandas as pd

# Use Jolpica F1 API (Ergast-compatible endpoint)
BASE = "https://api.jolpi.ca/ergast/f1"
SEASONS = [2022, 2023, 2024, 2025]

# F1 Podium Predictor

## Overview
This notebook demonstrates a complete ML pipeline for predicting Formula 1 podium finishes:
- **Data Collection**: Fetch F1 race results from Jolpica API
- **Data Cleaning**: Handle missing values and remove leakage
- **Feature Engineering**: Develop rolling performance metrics
- **Baseline Model**: Logistic Regression on raw features
- **Final Model**: Gradient Boosting on engineered features
- **Evaluation**: Threshold tuning and feature importance analysis

---

## Part 1: Data Collection

In [2]:
def get_json(url: str, params=None, sleep_s: float = 0.35, max_retries: int = 8) -> dict:
    """Fetch JSON with retry/backoff for rate limiting and errors"""
    backoff = 1.0
    for attempt in range(1, max_retries + 1):
        r = requests.get(
            url,
            params=params,
            timeout=30,
            headers={"Accept": "application/json", "User-Agent": "F1-Podium-Predictor/1.0"}
        )

        # Handle rate limiting
        if r.status_code == 429:
            retry_after = r.headers.get("Retry-After")
            wait_s = float(retry_after) if retry_after and retry_after.isdigit() else backoff
            print(f"429 rate limit: waiting {wait_s:.1f}s (attempt {attempt}/{max_retries}) -> {r.url}")
            time.sleep(wait_s)
            backoff = min(backoff * 2, 30)
            continue

        r.raise_for_status()

        # Validate JSON response
        ctype = (r.headers.get("Content-Type") or "").lower()
        if "json" not in ctype:
            preview = r.text[:200].replace("\n", " ")
            raise ValueError(f"Non-JSON response from {r.url} | Content-Type={ctype} | Preview={preview}")

        time.sleep(sleep_s)
        return r.json()

    raise RuntimeError(f"Failed after {max_retries} retries (last status {r.status_code}) for {url}")

In [3]:
def get_rounds_in_season(season: int) -> list[int]:
    """Returns list of round numbers in a season"""
    url = f"{BASE}/{season}.json"
    data = get_json(url)
    races = data["MRData"]["RaceTable"].get("Races", [])
    return [int(r["round"]) for r in races]

In [4]:
def get_race_results(season: int, round_no: int) -> list[dict]:
    """Returns one row per driver for a race"""
    url = f"{BASE}/{season}/{round_no}/results.json"
    data = get_json(url)
    races = data["MRData"]["RaceTable"].get("Races", [])
    if not races:
        return []
    race = races[0]
    circuit = race["Circuit"]
    results = race.get("Results", [])

    rows = []
    for res in results:
        driver = res["Driver"]
        constructor = res["Constructor"]

        # Handle missing or non-numeric finish positions
        try:
            finish_pos = int(res.get("position"))
        except (TypeError, ValueError):
            finish_pos = None

        rows.append({
            "season": int(season),
            "round": int(round_no),
            "raceName": race.get("raceName"),
            "date": race.get("date"),
            "circuitId": circuit.get("circuitId"),
            "circuitName": circuit.get("circuitName"),
            "driverId": driver.get("driverId"),
            "driverCode": driver.get("code"),
            "driverGivenName": driver.get("givenName"),
            "driverFamilyName": driver.get("familyName"),
            "constructorId": constructor.get("constructorId"),
            "constructorName": constructor.get("name"),
            "grid": int(res["grid"]) if res.get("grid") not in (None, "") else None,
            "finish_position": finish_pos,
            "points": float(res["points"]) if res.get("points") not in (None, "") else None,
            "laps": int(res["laps"]) if res.get("laps") not in (None, "") else None,
            "status": res.get("status"),
        })
    return rows

In [5]:
def get_qualifying(season: int, round_no: int) -> dict:
    """Returns dict mapping driverId to qualifying position"""
    url = f"{BASE}/{season}/{round_no}/qualifying.json"
    data = get_json(url)
    races = data["MRData"]["RaceTable"].get("Races", [])
    if not races:
        return {}
    q = races[0].get("QualifyingResults", [])
    out = {}
    for row in q:
        try:
            out[row["Driver"]["driverId"]] = int(row["position"])
        except Exception:
            pass
    return out

In [6]:
all_rows = []

for season in SEASONS:
    rounds = get_rounds_in_season(season)
    print(f"Season {season} rounds found: {len(rounds)}")

    for rnd in rounds:
        rows = get_race_results(season, rnd)
        if not rows:
            continue

        # Get qualifying results (may be missing for some races)
        try:
            q_map = get_qualifying(season, rnd)
        except Exception as e:
            print(f"Qualifying unavailable for {season} round {rnd}: {e}")
            q_map = {}

        # Add qualifying position and podium label
        for r in rows:
            r["qual_position"] = q_map.get(r["driverId"])
            r["podium"] = 1 if (r["finish_position"] is not None and r["finish_position"] <= 3) else 0

        all_rows.extend(rows)

if not all_rows:
    raise RuntimeError("No rows collected. Check internet access and that the API is reachable.")

Season 2022 rounds found: 22
Season 2023 rounds found: 22
Season 2024 rounds found: 24
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/1/qualifying.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/2/results.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/2/qualifying.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/3/results.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/3/qualifying.json
429 rate limit: waiting 2.0s (attempt 2/8) -> https://api.jolpi.ca/ergast/f1/2024/3/qualifying.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/4/qualifying.json
429 rate limit: waiting 2.0s (attempt 2/8) -> https://api.jolpi.ca/ergast/f1/2024/4/qualifying.json
429 rate limit: waiting 1.0s (attempt 1/8) -> https://api.jolpi.ca/ergast/f1/2024/5/qualifying.json
429 rate limit: wai

In [7]:
# Build dataframe and handle missing data
df_raw = pd.DataFrame(all_rows)

df_raw = df_raw.dropna(subset=["grid", "finish_position"]).copy()
df_raw["qual_position"] = df_raw["qual_position"].fillna(df_raw["grid"])

# Save dataset
out_path = "jolpica_podium_dataset_2022_2025.csv"
df_raw.to_csv(out_path, index=False)
print("Saved:", out_path, "rows:", len(df_raw))

Saved: jolpica_podium_dataset_2022_2025.csv rows: 1838


In [8]:
# Check for missing values
print(df_raw.isna().sum().sort_values(ascending=False))

season              0
round               0
raceName            0
date                0
circuitId           0
circuitName         0
driverId            0
driverCode          0
driverGivenName     0
driverFamilyName    0
constructorId       0
constructorName     0
grid                0
finish_position     0
points              0
laps                0
status              0
qual_position       0
podium              0
dtype: int64


## Part 2: Data Cleaning & Preprocessing

Remove rows with missing values, filter invalid positions, and prepare dataset for modeling.

In [9]:
# Convert numeric columns to proper types
numeric_cols = ["season", "round", "grid", "qual_position", "finish_position", "points", "laps"]

for col in numeric_cols:
    if col in df_raw.columns:
        df_raw[col] = pd.to_numeric(df_raw[col], errors="coerce")

In [10]:
# Filter out invalid positions (DNS, DNF, disqualifications)
df_raw = df_raw[df_raw["grid"].between(1, 20)]
df_raw = df_raw[df_raw["finish_position"].between(1, 20)]

In [11]:
# Check class balance of target variable
print(df_raw["podium"].value_counts(normalize=True))

podium
0    0.848601
1    0.151399
Name: proportion, dtype: float64


In [12]:
# Select features for modeling
model_df = df_raw[[
    "season",
    "round",
    "circuitId",
    "driverId",
    "constructorId",
    "grid",
    "qual_position",
    "podium"
]].copy()

In [13]:
# Save cleaned dataset
model_df.to_csv("clean_podium_dataset.csv", index=False)
print("Clean dataset saved.")

Clean dataset saved.


In [14]:
# Load dataset for feature engineering
df_full = pd.read_csv("jolpica_podium_dataset_2022_2025.csv").sort_values(["season","round"])

## Part 3: Feature Engineering

Create rolling performance metrics and momentum features to capture historical context.

In [15]:
# Rolling averages (group-safe) - exclude current race with shift(1)
df_full["driver_points_last3"] = (
    df_full.groupby("driverId")["points"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["constructor_points_last3"] = (
    df_full.groupby("constructorId")["points"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["driver_podiums_last3"] = (
    df_full.groupby("driverId")["podium"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["driver_finishpos_last3"] = (
    df_full.groupby("driverId")["finish_position"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["constructor_podiums_last3"] = (
    df_full.groupby("constructorId")["podium"]
    .apply(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)

df_full["grid_inverse"] = 1 / df_full["grid"]

In [16]:
# Extract the complete feature set from df_full
df_feat = df_full[[
    "season","round","circuitId","driverId","constructorId",
    "grid","qual_position",
    "driver_points_last3","constructor_points_last3","driver_podiums_last3",
    "driver_finishpos_last3","constructor_podiums_last3","grid_inverse",
    "podium"
]].copy()

# Fill NaNs (early races etc.)
fill_cols = [
    "driver_points_last3","constructor_points_last3","driver_podiums_last3",
    "driver_finishpos_last3","constructor_podiums_last3","grid_inverse"
]
df_feat[fill_cols] = df_feat[fill_cols].fillna(0)

print("df_feat shape:", df_feat.shape)
print("df_feat columns:", df_feat.columns.tolist())

# Save feature-engineered dataset
df_feat.to_csv("feature_engineered_dataset.csv", index=False)
print("Saved feature_engineered_dataset.csv")

df_feat shape: (1838, 14)
df_feat columns: ['season', 'round', 'circuitId', 'driverId', 'constructorId', 'grid', 'qual_position', 'driver_points_last3', 'constructor_points_last3', 'driver_podiums_last3', 'driver_finishpos_last3', 'constructor_podiums_last3', 'grid_inverse', 'podium']
Saved feature_engineered_dataset.csv


Baseline Model (Raw Predictors)
Logistic Regression trained using basic race features without rolling performance metrics.
Purpose: Establish benchmark performance.

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import numpy as np

## Part 4: Baseline Model — Logistic Regression (Raw Features)

**Purpose**: Establish benchmark performance using basic race features (grid, qualifying position, team).

**Hypothesis**: Raw features alone provide limited predictive power for podium finishes.

This baseline demonstrates the *controlled experiment*: same data split (2022-2024 train, 2025 test), same evaluation metrics. It justifies the value of engineered features by direct comparison.

In [18]:
# Train/test split: 2022-2024 for training, 2025 for testing
df_clean = pd.read_csv("jolpica_podium_dataset_2022_2025.csv")

train_df = df_clean[df_clean["season"].isin([2022, 2023, 2024])].copy()
test_df  = df_clean[df_clean["season"] == 2025].copy()

In [19]:
# Separate categorical and numeric features
cat_cols = ["circuitId","constructorId","driverId"]
num_cols = ["season","round","grid","qual_position"]

# One-hot encode categoricals, pass through numerics
prep = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols),
])

# Build pipeline with preprocessing and logistic regression
model = Pipeline([
    ("prep", prep),
    ("clf", LogisticRegression(max_iter=3000, class_weight="balanced"))
])

# Train and evaluate baseline model
features = num_cols + cat_cols
X_train, y_train = train_df[features], train_df["podium"]
X_test,  y_test  = test_df[features],  test_df["podium"]

model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]

print("Baseline Model (Logistic Regression) - Evaluating thresholds...")
print("=" * 50)

Baseline Model (Logistic Regression) - Evaluating thresholds...


In [20]:
thresholds = np.arange(0.1, 0.9, 0.05)

results = []

for t in thresholds:
    pred_t = (proba >= t).astype(int)

    precision = precision_score(y_test, pred_t)
    recall = recall_score(y_test, pred_t)
    f1 = f1_score(y_test, pred_t)

    results.append((t, precision, recall, f1))

for r in results:
    print(f"Threshold={r[0]:.2f} | Precision={r[1]:.3f} | Recall={r[2]:.3f} | F1={r[3]:.3f}")

Threshold=0.10 | Precision=0.327 | Recall=0.972 | F1=0.490
Threshold=0.15 | Precision=0.345 | Recall=0.972 | F1=0.509
Threshold=0.20 | Precision=0.375 | Recall=0.958 | F1=0.539
Threshold=0.25 | Precision=0.406 | Recall=0.958 | F1=0.570
Threshold=0.30 | Precision=0.422 | Recall=0.944 | F1=0.584
Threshold=0.35 | Precision=0.429 | Recall=0.931 | F1=0.588
Threshold=0.40 | Precision=0.456 | Recall=0.931 | F1=0.612
Threshold=0.45 | Precision=0.475 | Recall=0.917 | F1=0.626
Threshold=0.50 | Precision=0.493 | Recall=0.917 | F1=0.641
Threshold=0.55 | Precision=0.528 | Recall=0.903 | F1=0.667
Threshold=0.60 | Precision=0.542 | Recall=0.889 | F1=0.674
Threshold=0.65 | Precision=0.570 | Recall=0.847 | F1=0.682
Threshold=0.70 | Precision=0.613 | Recall=0.792 | F1=0.691
Threshold=0.75 | Precision=0.684 | Recall=0.750 | F1=0.715
Threshold=0.80 | Precision=0.716 | Recall=0.667 | F1=0.691
Threshold=0.85 | Precision=0.688 | Recall=0.458 | F1=0.550


In [21]:
best_t = 0.75
pred_best = (proba >= best_t).astype(int)

from sklearn.metrics import confusion_matrix, classification_report

print("Confusion matrix (t=0.75):")
print(confusion_matrix(y_test, pred_best))

print("\nClassification report:")
print(classification_report(y_test, pred_best, digits=3))

Confusion matrix (t=0.75):
[[382  25]
 [ 18  54]]

Classification report:
              precision    recall  f1-score   support

           0      0.955     0.939     0.947       407
           1      0.684     0.750     0.715        72

    accuracy                          0.910       479
   macro avg      0.819     0.844     0.831       479
weighted avg      0.914     0.910     0.912       479



Final Model (Feature-Engineered Predictors)
Gradient Boosting classifier trained using rolling performance and momentum features.
Purpose: Evaluate predictive gains from feature engineering.


In [22]:
# Import additional classifier
from sklearn.ensemble import GradientBoostingClassifier

# Load engineered features
df_feat = pd.read_csv("feature_engineered_dataset.csv")

# Time-aware split
train_df_feat = df_feat[df_feat["season"].isin([2022, 2023, 2024])].copy()
test_df_feat  = df_feat[df_feat["season"] == 2025].copy()

# Extract features
target = "podium"
features_eng = [c for c in df_feat.columns if c != target]

X_train_eng, y_train_eng = train_df_feat[features_eng], train_df_feat[target]
X_test_eng,  y_test_eng  = test_df_feat[features_eng],  test_df_feat[target]

# Handle infinite values
X_train_eng = X_train_eng.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test_eng = X_test_eng.replace([np.inf, -np.inf], np.nan).fillna(0)

# Categorical + numeric columns
cat_cols_eng = ["circuitId", "constructorId", "driverId"]
num_cols_eng = [c for c in features_eng if c not in cat_cols_eng]

# Train GradientBoostingClassifier (use engineered features)
gb_model = Pipeline([
    ("prep", ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_eng),
        ("num", "passthrough", num_cols_eng),
    ])),
    ("clf", GradientBoostingClassifier(random_state=42, n_estimators=100))
])

gb_model.fit(X_train_eng, y_train_eng)

# Get predictions
proba = gb_model.predict_proba(X_test_eng)[:, 1]

# Find best threshold
def best_threshold(y_true, proba):
    thresholds = np.arange(0.10, 0.91, 0.05)
    best = (0.50, -1, None)  # (t, best_f1, (prec, rec))
    for t in thresholds:
        pred = (proba >= t).astype(int)
        p = precision_score(y_true, pred, zero_division=0)
        r = recall_score(y_true, pred, zero_division=0)
        f = f1_score(y_true, pred, zero_division=0)
        if f > best[1]:
            best = (t, f, (p, r))
    return best

best_t, best_f1, (best_p, best_r) = best_threshold(y_test_eng, proba)
pred_best = (proba >= best_t).astype(int)

# Results
auc = roc_auc_score(y_test_eng, proba)
cm = confusion_matrix(y_test_eng, pred_best)

print("\nGradientBoostingClassifier Results")
print("=" * 50)
print(f"ROC AUC: {auc:.3f}")
print(f"Best Threshold (F1): {best_t:.2f}")
print(f"Precision: {best_p:.3f} | Recall: {best_r:.3f} | F1: {best_f1:.3f}")
print("\nConfusion Matrix:")
print(cm)


GradientBoostingClassifier Results
ROC AUC: 0.955
Best Threshold (F1): 0.30
Precision: 0.678 | Recall: 0.847 | F1: 0.753

Confusion Matrix:
[[378  29]
 [ 11  61]]


In [71]:
import joblib

joblib.dump(gb_model, "final_gb_model.joblib")

print("Saved final_gb_model.joblib")

Saved final_gb_model.joblib


## Part 5: Final Model — Gradient Boosting (Engineered Features)

**Purpose**: Evaluate performance improvements using domain-informed rolling performance metrics.

**Hypothesis**: Rolling features (driver/team points last 3 races, podium rate, finish position trends) provide significant predictive power.

**Outcome**: Compare ROC AUC and optimal threshold metrics against baseline to quantify feature engineering value.

In [23]:
# Get feature names after preprocessing
ohe = gb_model.named_steps["prep"].named_transformers_["cat"]
cat_feature_names = ohe.get_feature_names_out(cat_cols_eng)

all_feature_names = np.concatenate([cat_feature_names, num_cols_eng])

# Get importance values
importances = gb_model.named_steps["clf"].feature_importances_

feature_importance_df = pd.DataFrame({
    "feature": all_feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

feature_importance_df.head(15)

Unnamed: 0,feature,importance
74,grid_inverse,0.376674
68,qual_position,0.193228
69,driver_points_last3,0.093352
70,constructor_points_last3,0.061198
72,driver_finishpos_last3,0.042868
67,grid,0.035511
51,driverId_max_verstappen,0.019312
73,constructor_podiums_last3,0.019216
66,round,0.015974
71,driver_podiums_last3,0.014931


## Part 6: Interpretability & Feature Importance

Analyze which features drive the model's predictions. This demonstrates that the engineered rolling metrics are indeed the most impactful predictors.

In [24]:
def map_feature_group(f):
    if f.startswith("driverId_"):
        return "Driver identity"
    if f.startswith("constructorId_"):
        return "Constructor identity"
    if f.startswith("circuitId_"):
        return "Circuit"
    if f == "grid":
        return "Grid position"
    if f == "qual_position":
        return "Qualifying position"
    if f == "driver_points_last3":
        return "Driver recent points"
    if f == "constructor_points_last3":
        return "Constructor recent points"
    if f == "driver_podiums_last3":
        return "Driver recent podiums"
    if f == "driver_finishpos_last3":
        return "Driver finish consistency"
    if f == "constructor_podiums_last3":
        return "Constructor recent podiums"
    if f == "grid_inverse":
        return "Grid advantage (inverse)"
    return "Other"

feature_importance_df["group"] = feature_importance_df["feature"].apply(map_feature_group)

grouped_importance = (
    feature_importance_df
    .groupby("group")["importance"]
    .sum()
    .sort_values(ascending=False)
)

grouped_importance


group
Grid advantage (inverse)      0.376674
Qualifying position           0.193228
Driver recent points          0.093352
Circuit                       0.075027
Constructor recent points     0.061198
Driver identity               0.056146
Driver finish consistency     0.042868
Grid position                 0.035511
Other                         0.023078
Constructor recent podiums    0.019216
Driver recent podiums         0.014931
Constructor identity          0.008771
Name: importance, dtype: float64

In [None]:
# Driver form Signal
def driver_form_signal(row):
    if row["round"] <= 3:
        return "UNKNOWN"
    if row["driver_points_last3"] >= 18:
        return "STRONG"
    elif row["driver_points_last3"] >= 8:
        return "MODERATE"
    else:
        return "WEAK"

In [None]:
# Constructor momentum Signal
def constructor_momentum_signal(row):
    if row["round"] <= 3:
        return "UNKNOWN"
    if row["constructor_podiums_last3"] >= 0.5:
        return "HIGH"
    elif row["constructor_podiums_last3"] >= 0.2:
        return "MEDIUM"
    else:
        return "LOW"

In [None]:
# Grid advantage Signal
def grid_advantage_signal(row):
    if row["grid"] <= 3:
        return "FRONT"
    elif row["grid"] <= 10:
        return "MIDFIELD"
    else:
        return "BACK"

In [None]:
# Driver consistency Signal
def consistency_signal(row):
    if row["round"] <= 3:
        return "UNKNOWN"
    if row["driver_finishpos_last3"] <= 4:
        return "HIGH"
    elif row["driver_finishpos_last3"] <= 9:
        return "MEDIUM"
    else:
        return "LOW"

In [52]:
def build_signals(row):
    return {
        "driver_form": driver_form_signal(row),
        "constructor_momentum": constructor_momentum_signal(row),
        "grid_positioning": grid_advantage_signal(row),
        "consistency": consistency_signal(row)
    }

In [53]:
# Sample for signal extraction
df_feat.iloc[5]

season                           2022
round                               1
circuitId                     bahrain
driverId                       bottas
constructorId                    alfa
grid                                6
qual_position                     6.0
driver_points_last3               0.0
constructor_points_last3          0.0
driver_podiums_last3              0.0
driver_finishpos_last3            0.0
constructor_podiums_last3         0.0
grid_inverse                 0.166667
podium                              0
Name: 5, dtype: object

In [None]:
# Mid season row testing
df_feat[(df_feat["season"] == 2023) & (df_feat["round"] == 10)].iloc[0]

season                                 2023
round                                    10
circuitId                       silverstone
driverId                     max_verstappen
constructorId                      red_bull
grid                                      1
qual_position                           1.0
driver_points_last3               25.666667
constructor_points_last3          16.666667
driver_podiums_last3                    1.0
driver_finishpos_last3                  1.0
constructor_podiums_last3          0.666667
grid_inverse                            1.0
podium                                    1
Name: 620, dtype: object

In [55]:
def build_signal_pack(row, podium_proba: float, threshold: float = 0.30):
    # row can be a pandas Series
    signals = {
        "driver_form": driver_form_signal(row),
        "constructor_momentum": constructor_momentum_signal(row),
        "grid_positioning": grid_advantage_signal(row),
        "consistency": consistency_signal(row),
    }

    facts = {
        "season": int(row["season"]),
        "round": int(row["round"]),
        "circuitId": str(row["circuitId"]),
        "driverId": str(row["driverId"]),
        "constructorId": str(row["constructorId"]),
        "grid": int(row["grid"]),
        "qual_position": int(float(row["qual_position"])),
        "driver_points_last3": float(row["driver_points_last3"]),
        "constructor_points_last3": float(row["constructor_points_last3"]),
        "driver_podiums_last3": float(row["driver_podiums_last3"]),
        "driver_finishpos_last3": float(row["driver_finishpos_last3"]),
        "constructor_podiums_last3": float(row["constructor_podiums_last3"]),
    }

    decision = "PODIUM_LIKELY" if podium_proba >= threshold else "PODIUM_UNLIKELY"

    return {
        "podium_probability": float(podium_proba),
        "decision_threshold": float(threshold),
        "decision": decision,
        "signals": signals,
        "facts": facts,
    }

In [56]:
import json

def make_llm_prompt(signal_pack: dict) -> str:
    return f"""
You are generating a short Formula 1 race prediction explanation for a user.

Rules:
- Use ONLY the provided JSON information.
- Do NOT invent driver traits, weather, tyre strategy, or real-world events.
- If some signals are UNKNOWN, mention that it is early-season or limited recent data.
- Output must be 3-5 sentences, clear and descriptive, not bullet points.

JSON:
{json.dumps(signal_pack, indent=2)}

Write the explanation now.
""".strip()

In [57]:
row = df_feat[(df_feat["season"] == 2023) & (df_feat["round"] == 10)].iloc[0]

# Example: pretend your model predicted 0.87
signal_pack = build_signal_pack(row, podium_proba=0.87, threshold=0.30)
prompt = make_llm_prompt(signal_pack)
print(prompt)

You are generating a short Formula 1 race prediction explanation for a user.

Rules:
- Use ONLY the provided JSON information.
- Do NOT invent driver traits, weather, tyre strategy, or real-world events.
- If some signals are UNKNOWN, mention that it is early-season or limited recent data.
- Output must be 3-5 sentences, clear and descriptive, not bullet points.

JSON:
{
  "podium_probability": 0.87,
  "decision_threshold": 0.3,
  "decision": "PODIUM_LIKELY",
  "signals": {
    "driver_form": "STRONG",
    "constructor_momentum": "HIGH",
    "grid_positioning": "FRONT",
    "consistency": "HIGH"
  },
  "facts": {
    "season": 2023,
    "round": 10,
    "circuitId": "silverstone",
    "driverId": "max_verstappen",
    "constructorId": "red_bull",
    "grid": 1,
    "qual_position": 1,
    "driver_points_last3": 25.666666666666668,
    "constructor_points_last3": 16.666666666666668,
    "driver_podiums_last3": 1.0,
    "driver_finishpos_last3": 1.0,
    "constructor_podiums_last3": 0.66

In [59]:
def fallback_explanation(signal_pack: dict) -> str:
    s = signal_pack["signals"]
    f = signal_pack["facts"]
    p = signal_pack["podium_probability"]

    parts = []
    parts.append(f"For {f['driverId']} at {f['circuitId']}, the model estimates a podium probability of {p:.2f}.")
    parts.append(f"Starting position signals are {s['grid_positioning']} (grid {f['grid']}, qualifying {f['qual_position']}).")
    parts.append(f"Recent form is {s['driver_form']} and consistency is {s['consistency']}, with constructor momentum {s['constructor_momentum']}.")
    parts.append(f"Overall decision: {signal_pack['decision'].replace('_', ' ').lower()} based on these indicators.")
    return " ".join(parts)

In [60]:
print(fallback_explanation(signal_pack))

For max_verstappen at silverstone, the model estimates a podium probability of 0.87. Starting position signals are FRONT (grid 1, qualifying 1). Recent form is STRONG and consistency is HIGH, with constructor momentum HIGH. Overall decision: podium likely based on these indicators.


In [61]:
def generate_prediction_output(row, podium_proba, threshold=0.30):

    signals = {
        "driver_form": driver_form_signal(row),
        "constructor_momentum": constructor_momentum_signal(row),
        "grid_positioning": grid_advantage_signal(row),
        "consistency": consistency_signal(row),
    }

    decision = "PODIUM_LIKELY" if podium_proba >= threshold else "PODIUM_UNLIKELY"

    # Confidence logic (VERY useful for UI)
    if podium_proba >= 0.75 or podium_proba <= 0.15:
        confidence = "HIGH"
    elif podium_proba >= 0.55 or podium_proba <= 0.30:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"

    facts = {
        "grid": int(row["grid"]),
        "qual_position": int(float(row["qual_position"])),
        "driver_points_last3": float(row["driver_points_last3"]),
        "constructor_points_last3": float(row["constructor_points_last3"]),
        "driver_podiums_last3": float(row["driver_podiums_last3"]),
        "driver_finishpos_last3": float(row["driver_finishpos_last3"]),
        "constructor_podiums_last3": float(row["constructor_podiums_last3"]),
    }

    reasons = []

    if signals["grid_positioning"] == "FRONT":
        reasons.append("strong starting position advantage")

    if signals["driver_form"] == "STRONG":
        reasons.append("strong recent driver performance")

    if signals["constructor_momentum"] == "HIGH":
        reasons.append("strong recent constructor momentum")

    if signals["consistency"] == "HIGH":
        reasons.append("high recent finishing consistency")

    if not reasons:
        reasons.append("no strong performance signals detected")

    summary = (
        f"Podium probability {podium_proba:.2f}. "
        f"Driver form {signals['driver_form']}, "
        f"constructor momentum {signals['constructor_momentum']}, "
        f"grid positioning {signals['grid_positioning']}, "
        f"consistency {signals['consistency']}."
    )

    return {
        "probability": float(podium_proba),
        "decision": decision,
        "confidence_level": confidence,
        "signals": signals,
        "facts": facts,
        "reasons": reasons,
        "summary": summary,
    }

In [62]:
row = df_feat[(df_feat["season"] == 2023) & (df_feat["round"] == 10)].iloc[0]

podium_proba = 0.87   # normally from model.predict_proba()

output = generate_prediction_output(row, podium_proba)

import json
print(json.dumps(output, indent=2))

{
  "probability": 0.87,
  "decision": "PODIUM_LIKELY",
  "confidence_level": "HIGH",
  "signals": {
    "driver_form": "STRONG",
    "constructor_momentum": "HIGH",
    "grid_positioning": "FRONT",
    "consistency": "HIGH"
  },
  "facts": {
    "grid": 1,
    "qual_position": 1,
    "driver_points_last3": 25.666666666666668,
    "constructor_points_last3": 16.666666666666668,
    "driver_podiums_last3": 1.0,
    "driver_finishpos_last3": 1.0,
    "constructor_podiums_last3": 0.6666666666666666
  },
  "reasons": [
    "strong starting position advantage",
    "strong recent driver performance",
    "strong recent constructor momentum",
    "high recent finishing consistency"
  ],
  "summary": "Podium probability 0.87. Driver form STRONG, constructor momentum HIGH, grid positioning FRONT, consistency HIGH."
}


In [None]:
import json

# LLM prompt builder
def build_llm_prompt(pred_output: dict) -> str:
    return f"""
You are an assistant that writes a short Formula 1 podium prediction explanation.

Rules:
- Use ONLY the information in the JSON.
- Do NOT invent weather, tyre strategy, crashes, penalties, team orders, or driver personality traits.
- Keep it 3–5 sentences.
- Mention the decision, probability, confidence level, and 2–4 reasons.
- If signals contain UNKNOWN, say limited recent data is available.

JSON:
{json.dumps(pred_output, indent=2)}

Write the explanation now.
""".strip()

In [65]:
# Plug-in Interface
def add_llm_explanation(pred_output: dict, llm_generate_fn) -> dict:
    """
    llm_generate_fn: function(prompt: str) -> str
    """
    prompt = build_llm_prompt(pred_output)
    text = llm_generate_fn(prompt).strip()
    pred_output = dict(pred_output)  # copy
    pred_output["llm_explanation"] = text
    pred_output["llm_prompt_used"] = prompt  # optional: keep for debugging
    return pred_output

In [66]:
def mock_llm(prompt: str) -> str:
    # Very basic mock: pulls from JSON in the prompt? We'll just return a placeholder.
    return "Mock explanation: This driver shows strong indicators based on the provided signals and recent performance metrics."

output_with_text = add_llm_explanation(output, mock_llm)
print(output_with_text["llm_explanation"])

Mock explanation: This driver shows strong indicators based on the provided signals and recent performance metrics.


In [67]:
# Later for connecting to real LLM API (e.g. OpenAI, Azure, etc.)
def real_llm(prompt: str) -> str:
    # TODO: call your chosen LLM provider here and return the generated text
    # return response_text
    raise NotImplementedError

In [68]:
def validate_llm_text(text: str) -> str:
    banned = ["weather", "rain", "crash", "penalty", "pit", "tyre", "safety car"]
    lower = text.lower()
    if any(b in lower for b in banned):
        # fallback to deterministic summary
        return None
    return text

def add_llm_explanation_safe(pred_output: dict, llm_generate_fn) -> dict:
    prompt = build_llm_prompt(pred_output)
    text = llm_generate_fn(prompt).strip()
    valid = validate_llm_text(text)
    pred_output = dict(pred_output)
    pred_output["llm_explanation"] = valid if valid else pred_output["summary"]
    return pred_output

In [72]:
X = pd.DataFrame([row.drop("podium").to_dict()])