In [3]:
import os

import fastf1
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Enable FastF1 caching.
# enable_cache() expects the directory to exist already, so create it first;
# otherwise a fresh checkout fails before any data is fetched.
os.makedirs("f1_cache", exist_ok=True)
fastf1.Cache.enable_cache("f1_cache")

def get_historical_data(gp_name, years_range):
    """Get historical race lap data for one Grand Prix across multiple years.

    For every year the season schedule is searched for an event whose
    ``EventName`` contains ``gp_name`` (case-insensitive). The race session of
    the first matching championship round is loaded and its lap and sector
    times are converted to seconds.

    Args:
        gp_name: Fragment of the event name to look for, e.g. ``'Saudi'``.
        years_range: Iterable of season years to try, in the order given.

    Returns:
        A list with one DataFrame per successfully loaded year. Each frame
        holds ``Driver``, the four original time columns, a ``"<col> (s)"``
        float column for each of them, and ``Year``. Years without a matching
        event, or whose data fails to load, are skipped with a printed
        message, so the list may be empty.
    """
    time_columns = ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]
    all_laps_data = []

    for year in years_range:
        try:
            # Get year's schedule first
            schedule = fastf1.get_event_schedule(year)

            # Match on the event name, ignoring testing entries (round 0),
            # which have no race session.
            name_matches = schedule['EventName'].str.contains(gp_name, case=False, na=False)
            gp = schedule[name_matches & (schedule['RoundNumber'] > 0)]
            if gp.empty:
                print(f"No {gp_name} GP in {year}, skipping...")
                continue

            venue = gp['Location'].iloc[0]  # Venue name, used only for the log line
            # Resolve the session by round number rather than by country:
            # several rounds can share a country, so a country lookup may
            # return a different event than the one matched above.
            round_number = int(gp['RoundNumber'].iloc[0])
            session = fastf1.get_session(year, round_number, "R")
            session.load()

            # Extract lap and sector times, keeping only fully timed laps
            laps = session.laps[["Driver", *time_columns]].copy()
            laps = laps.dropna()

            # Convert times to seconds
            for col in time_columns:
                laps[f"{col} (s)"] = laps[col].dt.total_seconds()

            # Add year column
            laps["Year"] = year
            all_laps_data.append(laps)
            print(f"Successfully loaded {year} {gp_name} GP data from {venue}")
        except Exception as e:
            # Best effort: one failing season must not abort the others.
            print(f"Error loading {year} data: {e}")

    return all_laps_data

# Choose the Grand Prix to model and the seasons to pull history from
target_gp = 'Saudi'  # Any GP name fragment works, e.g. 'Monaco', 'British'
years = range(2024, 2017, -1)  # newest first: 2024 down to 2018

all_laps_data = get_historical_data(target_gp, years)

# Nothing to train on if every season was skipped or failed to load
if not all_laps_data:
    raise ValueError(f"No historical data could be loaded for {target_gp} GP")

# Stack the per-season frames into a single table
combined_laps = pd.concat(all_laps_data, ignore_index=True)

# Per-driver mean of each sector time over all loaded seasons
sector_means = combined_laps.groupby("Driver")[
    ["Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]
].mean()
sector_times_historical = sector_means.reset_index()

# Report which seasons actually contributed data
print("\nData available for years:")
print(combined_laps["Year"].unique())

# Load the 2025 qualifying session of the target GP
target_quali_session = fastf1.get_session(2025, target_gp, "Q")
target_quali_session.load()

# Per-driver qualifying results of that session
qualifying_results = target_quali_session.results

# Use the time from the latest qualifying segment the driver set one in:
# Q3 if present, otherwise Q2, otherwise Q1.
q1_seconds = qualifying_results["Q1"].dt.total_seconds()
q2_seconds = qualifying_results["Q2"].dt.total_seconds()
q3_seconds = qualifying_results["Q3"].dt.total_seconds()
quali_seconds = q3_seconds.fillna(q2_seconds.fillna(q1_seconds))

# Qualifying table: full name, chosen time, and the 3-letter FastF1 code
qualifying_2025 = pd.DataFrame({
    "Driver": qualifying_results["FullName"],
    "QualifyingTime (s)": quali_seconds,
    "DriverCode": qualifying_results["Abbreviation"],
})

# Left-join the historical sector averages onto the qualifying table;
# drivers without history keep NaN sector columns.
merged_data = pd.merge(
    qualifying_2025,
    sector_times_historical,
    how="left",
    left_on="DriverCode",
    right_on="Driver",
)
print(merged_data)


req            INFO 	No cached data found for season_schedule. Loading data...
_api           INFO 	Fetching season schedule...
req            INFO 	Data has been written to cache!


No Saudi GP in 2024, skipping...


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 

Successfully loaded 2023 Saudi GP data from Jeddah


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 

Successfully loaded 2022 Saudi GP data from Jeddah


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 

Successfully loaded 2021 Saudi GP data from Jeddah
No Saudi GP in 2020, skipping...
No Saudi GP in 2019, skipping...
No Saudi GP in 2018, skipping...

Data available for years:
[2023 2022 2021]


core           INFO 	Loading data for Saudi Arabian Grand Prix - Qualifying [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req           

             Driver_x  QualifyingTime (s) DriverCode Driver_y  \
0      Max Verstappen              87.294        VER      VER   
1       Oscar Piastri              87.304        PIA      PIA   
2      George Russell              87.407        RUS      RUS   
3     Charles Leclerc              87.670        LEC      LEC   
4      Kimi Antonelli              87.866        ANT      NaN   
5        Carlos Sainz              88.164        SAI      SAI   
6      Lewis Hamilton              88.201        HAM      HAM   
7        Yuki Tsunoda              88.204        TSU      TSU   
8        Pierre Gasly              88.367        GAS      GAS   
9        Lando Norris              87.481        NOR      NOR   
10    Alexander Albon              88.109        ALB      ALB   
11        Liam Lawson              88.191        LAW      NaN   
12    Fernando Alonso              88.303        ALO      ALO   
13       Isack Hadjar              88.418        HAD      NaN   
14     Oliver Bearman    

In [4]:
# Define feature set (Qualifying + Sector Times) and include DriverCode for reference.
# Drivers without historical laps have NaN sector times after the left merge;
# they are replaced by 0 here.
# NOTE(review): 0 s is not a plausible sector time and differs from the mean
# imputation used for the target below - confirm this placeholder is intended.
X = merged_data[["DriverCode", "QualifyingTime (s)", "Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]].fillna(0)
print(X)
# Remove DriverCode column before training (it is an identifier, not a feature)
X_model = X.drop(columns=["DriverCode"])

# Target: each driver's mean historical lap time, laid out in the same row
# order as X. Reindex on DriverCode rather than on the merge's "Driver_y"
# column: "Driver_y" is NaN for every driver without history, which would put
# duplicate NaN labels into the index. Drivers without history still come out
# as NaN values here and are filled below.
driver_codes = merged_data["DriverCode"]
y = combined_laps[combined_laps["Driver"].isin(driver_codes)]
y = y.groupby("Driver")["LapTime (s)"].mean()
y = y.reindex(driver_codes)

# --- Set NaN in y (new drivers) to mean lap time ---
mean_lap_time = y.mean()
y = y.fillna(mean_lap_time)

# Train Gradient Boosting Model
X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size=0.2, random_state=38)
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=38)
model.fit(X_train, y_train)

# Predict race times using 2025 qualifying and sector data
predicted_race_times = model.predict(X_model)

# Attach predictions by driver code instead of by position. The predictions
# follow merged_data's row order, while qualifying_2025 is re-sorted at the end
# of this cell; a positional assignment would pair times with the wrong
# drivers whenever the cell is executed a second time.
prediction_by_code = dict(zip(driver_codes, predicted_race_times))
qualifying_2025["PredictedRaceTime (s)"] = qualifying_2025["DriverCode"].map(prediction_by_code)

# Rank drivers by predicted race time
qualifying_2025 = qualifying_2025.sort_values(by="PredictedRaceTime (s)")

# Print final predictions
print("\n🏁 Predicted 2025 GP Winner with New Drivers and Sector Times 🏁\n")
print(qualifying_2025[["Driver", "PredictedRaceTime (s)"]])

# Evaluate Model on the held-out rows only
y_pred = model.predict(X_test)
print(f"\n🔍 Model Error (MAE): {mean_absolute_error(y_test, y_pred):.2f} seconds")

   DriverCode  QualifyingTime (s)  Sector1Time (s)  Sector2Time (s)  \
0         VER              87.294        35.422193        30.443356   
1         PIA              87.304        36.869429        30.763694   
2         RUS              87.407        35.633417        30.700447   
3         LEC              87.670        36.001569        30.756051   
4         ANT              87.866         0.000000         0.000000   
5         SAI              88.164        36.054927        30.873788   
6         HAM              88.201        35.965890        30.844346   
7         TSU              88.204        36.913854        30.822899   
8         GAS              88.367        36.732123        31.183609   
9         NOR              87.481        36.773108        31.484835   
10        ALB              88.109        37.367406        32.046203   
11        LAW              88.191         0.000000         0.000000   
12        ALO              88.303        36.456358        30.845683   
13    