In [None]:
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression


fastf1.Cache.enable_cache("f1_cache")

def get_historical_data(gp_name, years_range):
    """Collect race lap and sector times for one Grand Prix over several seasons.

    For each year the season schedule is searched for an event whose name
    contains ``gp_name`` (case-insensitive). If one is found, its race
    session is loaded and the per-lap lap/sector times are extracted.
    Seasons without a matching event, or whose data fails to load, are
    reported on stdout and skipped.

    Args:
        gp_name: Substring of the event name to look for, e.g. ``"Saudi"``.
        years_range: Iterable of season years to try, in the desired order.

    Returns:
        A list with one DataFrame per successfully loaded season. Each frame
        holds ``Driver``, the four original timedelta columns, a float
        ``"<column> (s)"`` companion for each of them, and ``Year``. The list
        is empty when no season could be loaded.
    """
    time_columns = ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]
    all_laps_data = []
    for year in years_range:
        try:
            schedule = fastf1.get_event_schedule(year)
            gp = schedule[schedule['EventName'].str.contains(gp_name, case=False, na=False)]
            if gp.empty:
                print(f"No {gp_name} GP in {year}, skipping...")
                continue
            venue = gp['Location'].iloc[0]
            # Load the session by the round number of the row that was
            # actually matched. Resolving it again from the country string is
            # a second fuzzy lookup that can land on a different event when a
            # country hosts more than one round in a season.
            round_number = int(gp['RoundNumber'].iloc[0])
            session = fastf1.get_session(year, round_number, "R")
            # Only lap timing is used below, so skip telemetry and weather.
            session.load(telemetry=False, weather=False)

            laps = session.laps[["Driver"] + time_columns].copy()
            # Drop laps with any missing lap or sector time.
            laps.dropna(inplace=True)

            # Convert times to seconds
            for col in time_columns:
                laps[f"{col} (s)"] = laps[col].dt.total_seconds()

            laps["Year"] = year
            all_laps_data.append(laps)
            print(f"Successfully loaded {year} {gp_name} GP data from {venue}")
        except Exception as e:
            # Best effort: one unavailable season must not abort the others.
            print(f"Error loading {year} data: {str(e)}")

    return all_laps_data

# --- NEW: Get constructor points for 2025 before the Saudi GP ---
def _accumulate_session_points(points_by_team, year, rnd, identifier):
    """Add the points scored in one session to ``points_by_team`` in place."""
    session = fastf1.get_session(year, rnd, identifier)
    # Only the classification is read, so skip laps, telemetry, weather and
    # race control messages.
    session.load(laps=False, telemetry=False, weather=False, messages=False)
    results = session.results
    # Missing points count as zero.
    for team, pts in zip(results['TeamName'], results['Points'].fillna(0)):
        points_by_team[team] = points_by_team.get(team, 0) + pts


def get_constructor_points_before_event(year, target_gp):
    """Sum each team's points from all rounds held before a given Grand Prix.

    The target event is the first schedule entry whose name contains
    ``target_gp`` (case-insensitive). Points are accumulated over rounds
    ``1 .. target_round - 1``. Race points are always counted; for rounds
    whose schedule format mentions "sprint", the sprint session's points are
    counted as well. Sessions that fail to load are reported on stdout and
    skipped, so the totals may be partial.

    Args:
        year: Season to read.
        target_gp: Substring of the target event's name, e.g. ``"Saudi"``.

    Returns:
        Dict mapping team name to accumulated points. Empty when the target
        is the first round or nothing could be loaded.

    Raises:
        ValueError: If no event in ``year`` matches ``target_gp``.
    """
    schedule = fastf1.get_event_schedule(year)
    # Find the round number of the target GP
    gp_row = schedule[schedule['EventName'].str.contains(target_gp, case=False, na=False)]
    if gp_row.empty:
        raise ValueError(f"No {target_gp} GP in {year}")
    target_round = int(gp_row['RoundNumber'].iloc[0])

    # Round number -> event format, used to detect sprint weekends.
    formats = {
        int(rnd): str(fmt)
        for rnd, fmt in zip(schedule['RoundNumber'], schedule['EventFormat'])
    }

    points_by_team = {}
    for rnd in range(1, target_round):
        identifiers = ["R"]
        if 'sprint' in formats.get(rnd, ''):
            identifiers.append("S")
        for identifier in identifiers:
            try:
                _accumulate_session_points(points_by_team, year, rnd, identifier)
            except Exception as e:
                # Best effort: keep going with the remaining sessions.
                print(f"Error loading round {rnd} session {identifier}: {e}")
    return points_by_team


# Event to predict and the past seasons to learn from (newest first).
target_gp = 'Saudi'
years = range(2024, 2017, -1)

# Historical race laps, one DataFrame per season that could be loaded.
all_laps_data = get_historical_data(target_gp, years)
if len(all_laps_data) == 0:
    raise ValueError(f"No historical data could be loaded for {target_gp} GP")

combined_laps = pd.concat(all_laps_data, ignore_index=True)

# Per-driver mean sector times over all loaded seasons.
sector_times_historical = (
    combined_laps
    .groupby("Driver")[["Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]]
    .mean()
    .reset_index()
)

print("\nData available for years:")
print(combined_laps["Year"].unique())

# Qualifying session of the event being predicted.
target_quali_session = fastf1.get_session(2025, target_gp, "Q")
target_quali_session.load()
qualifying_results = target_quali_session.results

# Time from the last qualifying segment each driver reached: Q3, else Q2,
# else Q1.
q1_seconds = qualifying_results["Q1"].dt.total_seconds()
q2_seconds = qualifying_results["Q2"].dt.total_seconds()
q3_seconds = qualifying_results["Q3"].dt.total_seconds()
latest_segment_seconds = q3_seconds.fillna(q2_seconds.fillna(q1_seconds))

# Qualifying frame: full name, time in seconds, 3-letter code and team.
qualifying_2025 = pd.DataFrame({
    "Driver": qualifying_results["FullName"],
    "QualifyingTime (s)": latest_segment_seconds,
    "DriverCode": qualifying_results["Abbreviation"],
    "TeamName": qualifying_results["TeamName"],
})

# Constructor performance index: each team's points so far this season,
# scaled so the leading team scores 100.
constructor_points = get_constructor_points_before_event(2025, target_gp)
if constructor_points:
    max_points = max(constructor_points.values())

    def _constructor_index(team_name):
        # Teams without recorded points score 0; so does everyone when no
        # team has scored yet.
        if max_points > 0:
            return constructor_points.get(team_name, 0) / max_points * 100
        return 0

    qualifying_2025["ConstructorPerformanceIndex"] = qualifying_2025["TeamName"].map(_constructor_index)
else:
    qualifying_2025["ConstructorPerformanceIndex"] = 0

# Attach historical sector averages by driver code. Drivers with no history
# keep NaN sector columns because this is a left join.
merged_data = qualifying_2025.merge(
    sector_times_historical,
    left_on="DriverCode",
    right_on="Driver",
    how="left",
)
print(merged_data)


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req

Successfully loaded 2024 Saudi GP data from Jeddah


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req

Successfully loaded 2023 Saudi GP data from Jeddah


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req

Successfully loaded 2022 Saudi GP data from Jeddah


core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req

Successfully loaded 2021 Saudi GP data from Jeddah
No Saudi GP in 2020, skipping...
No Saudi GP in 2020, skipping...
No Saudi GP in 2019, skipping...
No Saudi GP in 2019, skipping...
No Saudi GP in 2018, skipping...

Data available for years:
[2024 2023 2022 2021]
No Saudi GP in 2018, skipping...

Data available for years:
[2024 2023 2022 2021]


core           INFO 	Loading data for Saudi Arabian Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for car_data

             Driver_x  QualifyingTime (s) DriverCode         TeamName  \
0      Max Verstappen              87.294        VER  Red Bull Racing   
1       Oscar Piastri              87.304        PIA          McLaren   
2      George Russell              87.407        RUS         Mercedes   
3     Charles Leclerc              87.670        LEC          Ferrari   
4      Kimi Antonelli              87.866        ANT         Mercedes   
5        Carlos Sainz              88.164        SAI         Williams   
6      Lewis Hamilton              88.201        HAM          Ferrari   
7        Yuki Tsunoda              88.204        TSU  Red Bull Racing   
8        Pierre Gasly              88.367        GAS           Alpine   
9        Lando Norris              87.481        NOR          McLaren   
10    Alexander Albon              88.109        ALB         Williams   
11        Liam Lawson              88.191        LAW     Racing Bulls   
12    Fernando Alonso              88.303        AL

In [None]:
# Train two regressors (gradient boosting and linear regression) that map
# qualifying pace, historical sector averages and constructor form to each
# driver's historical mean race lap time, then rank the 2025 grid by the
# predicted time.

# Define feature set (Qualifying + Sector Times) and include DriverCode for reference
feature_columns = [
    "QualifyingTime (s)",
    "Sector1Time (s)",
    "Sector2Time (s)",
    "Sector3Time (s)",
    "ConstructorPerformanceIndex",
]
X = merged_data[["DriverCode"] + feature_columns].copy()
# Drivers without history at this track have NaN sector times. Fill them with
# the column mean, as is done for the target below: a sector time of 0 s is
# not a possible value and would make those drivers extreme outliers. The
# trailing fillna(0) only applies when a whole column is missing.
X[feature_columns] = X[feature_columns].fillna(X[feature_columns].mean()).fillna(0)
print(X)
# Remove DriverCode column before training (identifier, not a feature)
X_model = X.drop(columns=["DriverCode"])

# Mean historical race lap time for each driver in the qualifying data
y = combined_laps[combined_laps["Driver"].isin(merged_data["DriverCode"])]
y = y.groupby("Driver")["LapTime (s)"].mean()
# Put y in the same row order as X. Aligning on DriverCode rather than the
# merge's Driver_y column avoids NaN index labels for drivers without history.
y = y.reindex(merged_data["DriverCode"])

# --- Set NaN in y (new drivers) to mean lap time ---
mean_lap_time = y.mean()
y = y.fillna(mean_lap_time)

# Train Gradient Boosting Model
X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size=0.2, random_state=38)
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=38)
model.fit(X_train, y_train)

# Print feature importances
print("Feature importances:", model.feature_importances_)
print("Feature names:", X_model.columns.tolist())

# Predict race times using 2025 qualifying and sector data.
# Predictions follow merged_data's row order. qualifying_2025 is re-sorted
# further down, so assign by driver code instead of by position; otherwise
# re-running this cell would attach predictions to the wrong drivers.
predicted_race_times = model.predict(X_model)
qualifying_2025["PredictedRaceTime (s)"] = qualifying_2025["DriverCode"].map(
    dict(zip(merged_data["DriverCode"], predicted_race_times))
)

# Rank drivers by predicted race time
qualifying_2025 = qualifying_2025.sort_values(by="PredictedRaceTime (s)")

# Print final predictions
print("\n🏁 Predicted 2025 GP Winner with New Drivers and Sector Times 🏁\n")
print(qualifying_2025[["Driver", "PredictedRaceTime (s)"]])

# Evaluate Model on the held-out rows
y_pred = model.predict(X_test)
print(f"\n🔍 Model Error (MAE): {mean_absolute_error(y_test, y_pred):.2f} seconds")

# Train Linear Regression Model
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Print feature importances (coefficients) for linear regression
print("Linear Regression coefficients:", linreg.coef_)
print("Feature names:", X_model.columns.tolist())

# Predict race times using linear regression (assigned by driver code, as above)
predicted_race_times_linreg = linreg.predict(X_model)
qualifying_2025["PredictedRaceTime_LinReg (s)"] = qualifying_2025["DriverCode"].map(
    dict(zip(merged_data["DriverCode"], predicted_race_times_linreg))
)

# Rank drivers by predicted race time (linear regression)
qualifying_2025 = qualifying_2025.sort_values(by="PredictedRaceTime_LinReg (s)")

# Print final predictions for linear regression
print("\n🏁 Linear Regression Predicted 2025 GP Winner with New Drivers and Sector Times 🏁\n")
print(qualifying_2025[["Driver", "PredictedRaceTime_LinReg (s)"]])

# Evaluate Linear Regression Model on the held-out rows
y_pred_linreg = linreg.predict(X_test)
print(f"\n🔍 Linear Regression Model Error (MAE): {mean_absolute_error(y_test, y_pred_linreg):.2f} seconds")

   DriverCode  QualifyingTime (s)  Sector1Time (s)  Sector2Time (s)  \
0         VER              87.294        35.099885        30.198665   
1         PIA              87.304        35.790333        30.256833   
2         RUS              87.407        35.419227        30.382480   
3         LEC              87.670        35.595592        30.489891   
4         ANT              87.866         0.000000         0.000000   
5         SAI              88.164        36.054927        30.873788   
6         HAM              88.201        35.792869        30.525093   
7         TSU              88.204        36.679721        30.710279   
8         GAS              88.367        36.732123        31.183609   
9         NOR              87.481        36.259435        31.069452   
10        ALB              88.109        36.821111        31.362214   
11        LAW              88.191         0.000000         0.000000   
12        ALO              88.303        35.943335        30.544743   
13    