In [None]:
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression


fastf1.Cache.enable_cache("f1_cache")

def get_historical_data(gp_name, years_range):
    """Collect race lap and sector times for one Grand Prix over several seasons.

    For each year the season schedule is searched for an event whose name
    contains ``gp_name`` (case-insensitive). The race ("R") session of the
    first matching event is loaded and its lap/sector times are converted to
    seconds.

    Args:
        gp_name: Substring of the event name to look for, e.g. "Japanese".
        years_range: Iterable of season years to try.

    Returns:
        A list with one DataFrame per successfully loaded year. Each frame has
        the columns Driver, LapTime, Sector1Time, Sector2Time, Sector3Time,
        a "<column> (s)" float column for each of the four time columns, and
        Year. Laps with any missing value in the selected columns are dropped.
        Years without a matching event, or whose load fails, are skipped with
        a printed message, so the list may be empty.
    """
    time_columns = ["LapTime", "Sector1Time", "Sector2Time", "Sector3Time"]
    all_laps_data = []
    for year in years_range:
        try:
            schedule = fastf1.get_event_schedule(year)
            gp = schedule[schedule['EventName'].str.contains(gp_name, case=False, na=False)]
            if gp.empty:
                print(f"No {gp_name} GP in {year}, skipping...")
                continue
            venue = gp['Location'].iloc[0]
            # Load by round number rather than by country: a country string is
            # matched by name and is ambiguous when one country hosts several
            # events in a season, whereas the round number identifies exactly
            # the event that was matched above.
            round_number = int(gp['RoundNumber'].iloc[0])
            session = fastf1.get_session(year, round_number, "R")
            session.load()

            # Keep only complete laps; dropna() returns a new frame so the
            # session's own lap table is never modified.
            laps = session.laps[["Driver"] + time_columns].dropna().copy()

            # Convert times to seconds
            for col in time_columns:
                laps[f"{col} (s)"] = laps[col].dt.total_seconds()

            laps["Year"] = year
            all_laps_data.append(laps)
            print(f"Successfully loaded {year} {gp_name} GP data from {venue}")
        except Exception as e:
            # Deliberately best-effort: one bad season must not abort the
            # remaining years, so the error is reported and the loop goes on.
            print(f"Error loading {year} data: {str(e)}")

    return all_laps_data

# --- Get each driver's race points from the rounds before the target GP ---
def get_driver_points_before_event(year, target_gp):
    """Total the race points each driver scored before a given event.

    The season schedule is searched for the first event whose name contains
    ``target_gp`` (case-insensitive). Every round from 1 up to, but not
    including, that event's round is then loaded and the ``Points`` column of
    its race results is summed per driver abbreviation.

    Only the race ("R") session of each round is read, so points awarded in
    any other session of a weekend are not part of the totals.

    Args:
        year: Season to read.
        target_gp: Substring of the target event's name.

    Returns:
        dict mapping driver abbreviation to accumulated points. Rounds that
        fail to load are reported with a printed message and skipped.

    Raises:
        ValueError: If no event in ``year`` matches ``target_gp``.
    """
    season = fastf1.get_event_schedule(year)
    name_matches = season['EventName'].str.contains(target_gp, case=False, na=False)
    matching_events = season[name_matches]
    if matching_events.empty:
        raise ValueError(f"No {target_gp} GP in {year}")
    target_round = matching_events['RoundNumber'].iloc[0]

    points_by_driver = {}
    for rnd in range(1, target_round):
        try:
            race = fastf1.get_session(year, rnd, "R")
            race.load()
            for record in race.results.to_dict("records"):
                code = record['Abbreviation']
                scored = record['Points']
                # A missing points value counts as zero.
                if pd.isnull(scored):
                    scored = 0
                points_by_driver[code] = points_by_driver.get(code, 0) + scored
        except Exception as e:
            # Best effort: report the failed round and continue with the rest.
            print(f"Error loading round {rnd}: {e}")
    return points_by_driver


# Event to predict and the past seasons used as history (2024 down to 2018).
target_gp = "Japanese"
years = range(2024, 2017, -1)

all_laps_data = get_historical_data(target_gp, years)
if not all_laps_data:
    raise ValueError(f"No historical data could be loaded for {target_gp} GP")

# One long table holding every historical lap of every loaded season.
combined_laps = pd.concat(all_laps_data, ignore_index=True)

# Per-driver mean of each sector time over all loaded seasons.
sector_columns = ["Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)"]
sector_times_historical = (
    combined_laps
    .groupby("Driver")[sector_columns]
    .mean()
    .reset_index()
)

print("\nData available for years:")
print(combined_laps["Year"].unique())

# Qualifying session of the target GP
target_quali_session = fastf1.get_session(2025, target_gp, "Q")
target_quali_session.load()
qualifying_results = target_quali_session.results

# Qualifying time in seconds: the Q3 time when present, otherwise the Q2
# time, otherwise the Q1 time.
q1_seconds = qualifying_results["Q1"].dt.total_seconds()
q2_seconds = qualifying_results["Q2"].dt.total_seconds()
q3_seconds = qualifying_results["Q3"].dt.total_seconds()

qualifying_2025 = pd.DataFrame({
    "Driver": qualifying_results["FullName"],
    "QualifyingTime (s)": q3_seconds.fillna(q2_seconds).fillna(q1_seconds),
    # Driver abbreviation, used as the join key against the historical laps
    "DriverCode": qualifying_results["Abbreviation"],
    "TeamName": qualifying_results["TeamName"],
})

# --- Driver points so far, scaled by the highest total among all drivers ---
driver_points = get_driver_points_before_event(2025, target_gp)
if not driver_points:
    qualifying_2025["DriverPerformanceIndex"] = 0
else:
    max_driver_points = max(driver_points.values())

    def _relative_points(code):
        # Drivers without an entry count as zero points; the guard avoids a
        # division by zero when the highest total is not positive.
        if max_driver_points > 0:
            return driver_points.get(code, 0) / max_driver_points
        return 0

    qualifying_2025["DriverPerformanceIndex"] = qualifying_2025["DriverCode"].map(_relative_points)

# Merge qualifying data with historical sector times. Both frames have a
# "Driver" column, so the result carries "Driver_x" (full name from
# qualifying) and "Driver_y" (code from history, NaN when a driver has no
# historical laps).
print("qualifying_2025 DriverCode values:")
print(qualifying_2025["DriverCode"].unique())
print("sector_times_historical Driver values:")
print(sector_times_historical["Driver"].unique())
merged_data = qualifying_2025.merge(
    sector_times_historical,
    left_on="DriverCode",
    right_on="Driver",
    how="left",
)
print (merged_data)


In [21]:
# Feature set: qualifying time, historical mean sector times and the points
# based performance index. DriverCode is kept in X for reference only.
# NOTE(review): fillna(0) gives drivers without historical laps sector times
# of 0 seconds — confirm this is the intended imputation.
X = merged_data[["DriverCode", "QualifyingTime (s)", "Sector1Time (s)", "Sector2Time (s)", "Sector3Time (s)", "DriverPerformanceIndex"]].fillna(0)
print(X)
# DriverCode is an identifier, not a numeric feature, so it is removed before training
X_model = X.drop(columns=["DriverCode"])

# Target: each driver's mean historical race lap time.
# "Driver_y" is the historical driver code produced by the merge that built
# merged_data; it is NaN for drivers without historical laps.
y = combined_laps[combined_laps["Driver"].isin(merged_data["Driver_y"])]
y = y.groupby("Driver")["LapTime (s)"].mean()
# Put y in the same row order as X
y = y.reindex(merged_data["Driver_y"])

# --- Drivers without history get the mean lap time of the other drivers ---
mean_lap_time = y.mean()
y = y.fillna(mean_lap_time)

# Train Gradient Boosting Model
X_train, X_test, y_train, y_test = train_test_split(X_model, y, test_size=0.2, random_state=38)
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=38)
model.fit(X_train, y_train)

# Print feature importances
print("Feature importances:", model.feature_importances_)
print("Feature names:", X_model.columns.tolist())

# Predict race times using 2025 qualifying and sector data.
# The prediction array is in merged_data row order. It is attached to
# qualifying_2025 through the driver code instead of by position, so the
# values stay with the right driver even when qualifying_2025 has been
# re-sorted (by the code further down, or by an earlier run of this cell).
predicted_race_times = model.predict(X_model)
qualifying_2025["PredictedRaceTime (s)"] = qualifying_2025["DriverCode"].map(
    pd.Series(predicted_race_times, index=merged_data["DriverCode"].to_numpy())
)

# Rank drivers by predicted race time
qualifying_2025 = qualifying_2025.sort_values(by="PredictedRaceTime (s)")

# Print final predictions
print("\n🏁 Predicted 2025 GP Winner with New Drivers and Sector Times 🏁\n")
print(qualifying_2025[["Driver", "PredictedRaceTime (s)"]])

# Evaluate Model
y_pred = model.predict(X_test)
print(f"\n🔍 Model Error (MAE): {mean_absolute_error(y_test, y_pred):.2f} seconds")

# Train Linear Regression Model
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Print feature importances (coefficients) for linear regression
print("Linear Regression coefficients:", linreg.coef_)
print("Feature names:", X_model.columns.tolist())

# Predict race times using linear regression.
# qualifying_2025 was re-ordered by the sort above, so assigning the raw
# array by position would give drivers each other's predictions; align on
# the driver code instead.
predicted_race_times_linreg = linreg.predict(X_model)
qualifying_2025["PredictedRaceTime_LinReg (s)"] = qualifying_2025["DriverCode"].map(
    pd.Series(predicted_race_times_linreg, index=merged_data["DriverCode"].to_numpy())
)

# Rank drivers by predicted race time (linear regression)
qualifying_2025 = qualifying_2025.sort_values(by="PredictedRaceTime_LinReg (s)")

# Print final predictions for linear regression
print("\n🏁 Linear Regression Predicted 2025 GP Winner with New Drivers and Sector Times 🏁\n")
print(qualifying_2025[["Driver", "PredictedRaceTime_LinReg (s)"]])

# Evaluate Linear Regression Model
y_pred_linreg = linreg.predict(X_test)
print(f"\n🔍 Linear Regression Model Error (MAE): {mean_absolute_error(y_test, y_pred_linreg):.2f} seconds")

   DriverCode  QualifyingTime (s)  Sector1Time (s)  Sector2Time (s)  \
0         LEC              70.270        21.718097        38.685772   
1         PIA              70.424        21.334500        38.625671   
2         SAI              70.518        21.277443        37.875198   
3         NOR              70.542        21.392872        38.161166   
4         RUS              70.543        21.587591        38.243409   
5         VER              70.567        21.225331        37.713270   
6         HAM              70.621        21.350744        37.833709   
7         TSU              70.858        21.977302        38.878757   
8         ALB              70.948        22.239785        39.220764   
9         GAS              71.311        21.419611        37.856984   
10        OCO              71.285        21.775818        38.324623   
11        RIC              71.482        21.581886        37.799120   
12        STR              71.563        21.402887        37.873829   
13    