# Formula 1 2026 Prediction Model - Validation Report

Backtest of the model's 2025 season predictions (first 10 rounds) to measure accuracy and calibration.

**Key Metrics:**
- Mean Absolute Error (MAE) for position predictions
- Calibration curves (are 70% predictions right 70% of the time?)
- Top-3 accuracy
- Winner prediction rate

**Note:** The 2025 season is complete. Pre-season testing for 2026 starts in February 2026.

In [None]:
import sys
from pathlib import Path

import fastf1 as ff1
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add project root to path
sys.path.insert(0, str(Path.cwd()))

from src.models.bayesian import BayesianDriverRanking
from src.models.priors_factory import PriorsFactory
from src.predictors.qualifying import QualifyingPredictor
from src.predictors.race import RacePredictor
from src.utils.performance_tracker import PerformanceTracker

## 1. Data Collection

Get 2025 season results for comparison.

In [None]:
# Load the 2025 schedule.
# FastF1 refuses to enable a cache directory that does not exist, so create
# it first and hand over the already-expanded path.
cache_dir = Path('~/.fastf1_cache').expanduser()
cache_dir.mkdir(parents=True, exist_ok=True)
ff1.Cache.enable_cache(str(cache_dir))
schedule_2025 = ff1.get_event_schedule(2025)

# Number of rounds (from the start of the season) used for validation.
N_VALIDATION_RACES = 10

# Keep named, non-testing events only. This filter does not check whether an
# event has actually been run; it relies on the 2025 season being complete.
is_race_weekend = (
    (schedule_2025['EventFormat'] != 'testing')
    & (schedule_2025['EventName'].notna())
)
races = schedule_2025[is_race_weekend].head(N_VALIDATION_RACES)

print(f"Validating against {len(races)} races from 2025 season")
# Last expression of the cell: rendered by the notebook as a preview table.
races[['RoundNumber', 'EventName', 'Country']].head()

## 2. Model Initialization

Set up prediction system with 2025 baseline data.

In [None]:
# Shared building blocks: the priors factory, the Bayesian ranker seeded with
# its priors, and a performance tracker used by both predictors.
factory = PriorsFactory()
priors = factory.create_priors()
ranker = BayesianDriverRanking(priors)
tracker = PerformanceTracker()

# Qualifying predictor: wired to the ranker and the tracker.
# (It is not referenced again in the cells visible in this notebook.)
quali_predictor = QualifyingPredictor(driver_ranker=ranker, performance_tracker=tracker)

# Race predictor: configured for 2025 from the factory's driver data.
race_predictor = RacePredictor(
    year=2025,
    driver_chars=factory.drivers,
    driver_chars_path=factory.driver_file,
    performance_tracker=tracker,
)

print("Model initialized with 2025 baseline characteristics")

## 3. Run Predictions vs Actuals

For each race:
1. Predict the race from the actual qualifying grid
2. Compare with actual results
3. Update Bayesian beliefs

In [None]:
# Columns of one per-driver result row. Passing them to the DataFrame
# constructor keeps `df_results` well-formed (all columns present) even when
# every race fails to load and `results` stays empty.
RESULT_COLUMNS = [
    'race', 'round', 'driver', 'predicted_pos', 'actual_pos',
    'error', 'confidence', 'podium_prob', 'actual_podium',
]

results = []

for _, event in races.iterrows():
    race_name = event['EventName']
    round_num = event['RoundNumber']

    # Best-effort per race: any failure (data download, prediction, update)
    # is reported and the loop moves on to the next event.
    try:
        print(f"\n{'='*60}")
        print(f"Round {round_num}: {race_name}")
        print('='*60)

        # Actual race classification (drivers without a position are dropped)
        session = ff1.get_session(2025, race_name, 'R')
        session.load(laps=False, telemetry=False)

        actual_results = session.results[['Abbreviation', 'Position']].dropna()
        actual_dict = dict(zip(actual_results['Abbreviation'], actual_results['Position'], strict=False))

        # Actual qualifying result, used as the starting grid for the prediction
        quali_session = ff1.get_session(2025, race_name, 'Q')
        quali_session.load(laps=False, telemetry=False)
        quali_results = quali_session.results[['Abbreviation', 'Position', 'TeamName']].dropna()

        grid = [
            {'driver': row['Abbreviation'], 'team': row['TeamName'], 'position': row['Position']}
            for _, row in quali_results.iterrows()
        ]

        # Predict race
        prediction = race_predictor.predict(
            year=2025,
            race_name=race_name,
            qualifying_grid=grid,
            verbose=False
        )
        finish_order = prediction['finish_order']

        # Build this race's rows separately so that (a) a failure half-way
        # through leaves no partial race in `results`, and (b) the per-race
        # MAE does not have to re-scan every previously collected row.
        race_rows = []
        for pred in finish_order:
            driver = pred['driver']
            if driver not in actual_dict:
                continue
            actual_pos = actual_dict[driver]
            race_rows.append({
                'race': race_name,
                'round': round_num,
                'driver': driver,
                'predicted_pos': pred['position'],
                'actual_pos': actual_pos,
                'error': abs(pred['position'] - actual_pos),
                'confidence': pred['confidence'],
                'podium_prob': pred['podium_probability'],
                'actual_podium': 1 if actual_pos <= 3 else 0
            })
        results.extend(race_rows)

        # Show summary. The actual winner is looked up by position rather than
        # by relying on the row order of the results table.
        winner_pred = finish_order[0]['driver']
        winner_actual = actual_results.sort_values('Position').iloc[0]['Abbreviation']

        print(f"Winner: {winner_pred} (predicted) vs {winner_actual} (actual)")
        if race_rows:
            # Named `race_mae` so the season-wide `mae` is not clobbered here.
            race_mae = np.mean([row['error'] for row in race_rows])
            print(f"MAE: {race_mae:.2f} positions")
        else:
            print("MAE: n/a (no predicted driver matched the actual results)")

        # Update Bayesian beliefs with the observed finishing positions.
        # NOTE(review): observations are keyed by DriverNumber while the rest
        # of this cell identifies drivers by Abbreviation - confirm this is
        # the key type `ranker.update` expects.
        observations = {row['DriverNumber']: row['Position']
                       for _, row in session.results.dropna(subset=['Position']).iterrows()}
        ranker.update(observations, race_name, confidence=1.0)

    except Exception as e:
        print(f"Error processing {race_name}: {e}")

df_results = pd.DataFrame(results, columns=RESULT_COLUMNS)
print(f"\nCollected {len(df_results)} predictions across {df_results['race'].nunique()} races")

## 4. Overall Accuracy Metrics

In [None]:
print("=" * 60)
print("VALIDATION METRICS - 2025 SEASON")
print("=" * 60)

# Number of races that contributed at least one prediction row.
n_races = df_results['race'].nunique()

mae = df_results['error'].mean()
print(f"\nMean Absolute Error: {mae:.2f} positions")

median_error = df_results['error'].median()
print(f"Median Error: {median_error:.2f} positions")

# A race counts as a correct winner call when the driver predicted P1 also
# finished P1. Computed with a boolean mask so a race whose predicted winner
# has no row (e.g. no classified result) counts as a miss instead of raising
# IndexError.
winner_hits = (df_results['predicted_pos'] == 1) & (df_results['actual_pos'] == 1)
winner_correct = int(winner_hits.groupby(df_results['race']).any().sum())
# Guard the division so an empty backtest reports NaN rather than crashing.
winner_accuracy = winner_correct / n_races * 100 if n_races else float('nan')
print(f"\nWinner Prediction Accuracy: {winner_accuracy:.1f}%")

# Share of predicted top-3 finishers who actually finished in the top 3.
top3_pred = df_results[df_results['predicted_pos'] <= 3]
top3_accuracy = (top3_pred['actual_pos'] <= 3).mean() * 100
print(f"Top-3 Accuracy: {top3_accuracy:.1f}%")

perfect = (df_results['error'] == 0).mean() * 100
print(f"\nPerfect Predictions: {perfect:.1f}%")

within_2 = (df_results['error'] <= 2).mean() * 100
print(f"Within 2 Positions: {within_2:.1f}%")

## 5. Error Distribution

In [None]:
# Two side-by-side panels: overall error histogram (left) and error spread
# per predicted-position group (right).
panel_titles = ('Distribution of Prediction Errors', 'Error by Position Group')
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

# Left panel: histogram of absolute errors with the overall MAE marked.
error_hist = go.Histogram(x=df_results['error'], nbinsx=20, name='Error Distribution')
fig.add_trace(error_hist, row=1, col=1)
fig.add_vline(
    x=mae,
    line_dash="dash",
    line_color="red",
    annotation_text=f"MAE = {mae:.2f}",
    row=1,
    col=1,
)

# Right panel: one box per predicted-position group.
group_labels = ['Top 5', 'P6-10', 'P11-20']
df_results['position_bin'] = pd.cut(
    df_results['predicted_pos'],
    bins=[0, 5, 10, 20],
    labels=group_labels,
)

for label in group_labels:
    in_group = df_results['position_bin'] == label
    group_box = go.Box(y=df_results.loc[in_group, 'error'], name=label)
    fig.add_trace(group_box, row=1, col=2)

# Axis titles for both panels.
fig.update_xaxes(title_text="Error (positions)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_yaxes(title_text="Error (positions)", row=1, col=2)

fig.update_layout(height=400, showlegend=False)
fig.show()

## 6. Calibration Analysis

Are our confidence levels meaningful?

In [None]:
# Bin by confidence level.
# The bin edges treat `confidence` as a 0-100 percentage.
# NOTE(review): confirm the predictor reports confidence on that scale.
# include_lowest=True keeps a confidence of exactly 0 in the first bin instead
# of silently turning it into NaN.
df_results['confidence_bin'] = pd.cut(
    df_results['confidence'],
    bins=[0, 50, 65, 80, 100],
    labels=['Low (0-50%)', 'Medium (50-65%)', 'High (65-80%)', 'Very High (80-100%)'],
    include_lowest=True
)

# Accuracy in each bin: mean error, number of predictions, and the share of
# predictions within 2 positions. The 0/100 helper column makes that share a
# plain mean, so the aggregation no longer has to reach back into the outer
# frame. observed=False is spelled out to keep empty bins in the table and to
# avoid depending on pandas' changing default for categorical groupers.
within_2_pct = (df_results['error'] <= 2).astype(float) * 100
calibration = (
    df_results.assign(_within_2_pct=within_2_pct)
    .groupby('confidence_bin', observed=False)
    .agg(
        MAE=('error', 'mean'),
        Count=('error', 'count'),
        within_2=('_within_2_pct', 'mean'),
    )
    .rename(columns={'within_2': 'Within 2 Pos (%)'})
    .round(2)
)

print("\nCalibration Table:")
print(calibration)

# Visualization
fig = px.bar(
    calibration.reset_index(),
    x='confidence_bin',
    y='Within 2 Pos (%)',
    title='Model Calibration: Confidence vs Accuracy',
    labels={'confidence_bin': 'Confidence Level', 'Within 2 Pos (%)': 'Accuracy Within 2 Positions (%)'},
    color='Within 2 Pos (%)',
    color_continuous_scale='RdYlGn'
)
fig.add_hline(y=70, line_dash="dash", line_color="red",
              annotation_text="Target: 70%")
fig.update_layout(height=500)
fig.show()

## 7. Podium Prediction Analysis

In [None]:
# High-probability podium calls. `podium_prob` is compared against 50, i.e.
# it is treated as a 0-100 percentage here.
high_prob_podium = df_results[df_results['podium_prob'] > 50]

if not high_prob_podium.empty:
    podium_accuracy = high_prob_podium['actual_podium'].mean() * 100
    print("When we predicted >50% podium chance:")
    print(f"  - Number of predictions: {len(high_prob_podium)}")
    print(f"  - Actual podium rate: {podium_accuracy:.1f}%")

# Calibration (reliability) curve.
# Predictions are grouped into 10-point probability bins and, per bin, the
# mean predicted probability is compared with the observed podium rate. Only
# binned rates are comparable to the y = x "perfect calibration" line; a
# cumulative ">= threshold" subset mixes all higher probabilities together and
# is expected to sit above the diagonal even for a well-calibrated model.
prob_bin_edges = list(range(0, 101, 10))
prob_bins = pd.cut(
    df_results['podium_prob'],
    bins=prob_bin_edges,
    include_lowest=True,
).rename('prob_bin')

# observed=True drops bins without any prediction, so the line has no gaps.
reliability = df_results.groupby(prob_bins, observed=True).agg(
    mean_predicted=('podium_prob', 'mean'),
    actual_rate=('actual_podium', 'mean'),
    n_predictions=('actual_podium', 'size'),
)
reliability['actual_rate'] *= 100

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=reliability['mean_predicted'],
    y=reliability['actual_rate'],
    mode='lines+markers',
    name='Actual',
    line=dict(width=3)
))
fig.add_trace(go.Scatter(
    x=[0, 100],
    y=[0, 100],
    mode='lines',
    name='Perfect calibration',
    line=dict(dash='dash', color='red')
))
fig.update_layout(
    title='Podium Prediction Calibration',
    xaxis_title='Predicted Podium Probability (%)',
    yaxis_title='Actual Podium Rate (%)',
    height=500
)
fig.show()

## 8. Race-by-Race Performance

In [None]:
# Per-race summary: mean error and number of predictions, in calendar order.
per_race = df_results.groupby(['race', 'round'])
race_summary = per_race.agg(
    MAE=('error', 'mean'),
    Predictions=('driver', 'count'),
).sort_values('round')

print("\nRace-by-Race MAE:")
print(race_summary)

# Line chart of per-race MAE against the round number, with the season-wide
# MAE drawn as a dashed reference line.
fig = px.line(
    race_summary.reset_index(),
    x='round',
    y='MAE',
    title='Prediction Accuracy Over 2025 Season',
    labels={'round': 'Round', 'MAE': 'Mean Absolute Error'},
    markers=True,
)
fig.add_hline(
    y=mae,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall MAE = {mae:.2f}",
)
fig.update_layout(height=500)
fig.show()

## 9. Driver-Specific Performance

In [None]:
# Per-driver mean error and prediction count, rounded to two decimals.
driver_errors = (
    df_results.groupby('driver')['error']
    .agg(['mean', 'count'])
    .round(2)
    .rename(columns={'mean': 'MAE', 'count': 'Predictions'})
)

# Keep drivers with at least 5 predictions, most predictable (lowest MAE) first.
has_enough_data = driver_errors['Predictions'] >= 5
driver_errors = driver_errors.loc[has_enough_data].sort_values('MAE')

print("\nDriver Prediction Accuracy (min 5 predictions):")
print("\nMost Predictable:")
print(driver_errors.head(5))
print("\nLeast Predictable:")
print(driver_errors.tail(5))

# Bar chart of the ten lowest-MAE drivers.
ten_most_predictable = driver_errors.reset_index().head(10)
fig = px.bar(
    ten_most_predictable,
    x='driver',
    y='MAE',
    title='Top 10 Most Predictable Drivers (2025)',
    labels={'driver': 'Driver', 'MAE': 'Mean Absolute Error'},
    color='MAE',
    color_continuous_scale='RdYlGn_r',
)
fig.update_layout(height=500)
fig.show()

## 10. Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Position categories: P1-3, P4-10 and P11-20.
bins = [0, 3, 10, 20]
labels = ['Podium', 'Points', 'No Points']

# Categorize predicted and actual positions with the same bins.
for bin_col, pos_col in (('pred_bin', 'predicted_pos'), ('actual_bin', 'actual_pos')):
    df_results[bin_col] = pd.cut(df_results[pos_col], bins=bins, labels=labels)

# Rows are actual categories, columns are predicted categories.
cm = confusion_matrix(
    y_true=df_results['actual_bin'],
    y_pred=df_results['pred_bin'],
    labels=labels,
)

# Heatmap with the counts written into each cell.
fig = px.imshow(
    cm,
    labels=dict(x="Predicted", y="Actual", color="Count"),
    x=labels,
    y=labels,
    title='Confusion Matrix: Position Categories',
    text_auto=True,
    color_continuous_scale='Blues',
)
fig.update_layout(height=500)
fig.show()

## Summary

**Model Performance on 2025 Season:**

The validation shows:
- MAE and accuracy metrics quantify prediction quality
- Calibration analysis reveals if confidence scores are meaningful
- Position-specific and driver-specific errors identify where the model struggles
- The race-by-race MAE trend (Section 8) indicates whether Bayesian updating improves predictions as the season progresses

**Next Steps for 2026:**
1. Update driver characteristics with 2025 season data
2. Adjust for new team lineups (Cadillac entry, driver changes)
3. Monitor pre-season testing (February 2026) for regulation impact
4. Re-calibrate uncertainty given regulation reset
5. Test predictions during first few races of 2026