In [7]:
import fastf1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
import warnings

# Suppress specific warnings from FastF1 and others if needed
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Enable FastF1 caching
try:
    fastf1.Cache.enable_cache("f1_cache")
except Exception as e:
    print(f"Error enabling cache: {e}. Check cache directory permissions.")

def get_driver_standings_before_round(year, target_round):
    """Calculate driver points standings *before* a specific round."""
    standings = {}
    max_points = 0
    try:
        schedule = fastf1.get_event_schedule(year, include_testing=False)
        # Filter rounds before the target round
        races_before = schedule[schedule['RoundNumber'] < target_round]
        
        if races_before.empty:
            return {}, 0 # No races before this one
            
        for index, race in races_before.iterrows():
            try:
                session = fastf1.get_session(year, race['EventName'], 'R')
                session.load(laps=False, telemetry=False, weather=False, messages=False)
                results = session.results
                if results is not None and not results.empty:
                    for _, driver_result in results.iterrows():
                        driver = driver_result['Abbreviation']
                        points = driver_result['Points']
                        standings[driver] = standings.get(driver, 0) + points
            except Exception as e:
                # print(f"Warning: Could not load results for {year} {race['EventName']}: {e}")
                # Silently continue if a past race fails to load
                continue
                
        if standings:
            max_points = max(standings.values()) if standings else 0
            
    except Exception as e:
        print(f"Error calculating standings for {year} before round {target_round}: {e}")
        return {}, 0
        
    return standings, max_points

def get_race_data(year, event_identifier):
    """Get Race and Qualifying data for a specific event."""
    try:
        session_r = fastf1.get_session(year, event_identifier, 'R')
        session_r.load(laps=False, telemetry=False, weather=False, messages=False)
        results_r = session_r.results

        if results_r is None or results_r.empty:
            print(f"No race results found for {year} {event_identifier}. Skipping.")
            return None

        # Ensure Finishing Position is numeric, handle non-finishers (e.g., DNF, DSQ)
        results_r['FinishingPosition'] = pd.to_numeric(results_r['Position'], errors='coerce')
        # Optional: Fill NaNs (non-finishers) with a high number or drop
        results_r.dropna(subset=['FinishingPosition'], inplace=True)
        results_r['FinishingPosition'] = results_r['FinishingPosition'].astype(int)

        # Get Qualifying Data
        session_q = fastf1.get_session(year, event_identifier, 'Q')
        session_q.load(laps=False, telemetry=False, weather=False, messages=False)
        results_q = session_q.results

        if results_q is None or results_q.empty:
            print(f"No qualifying results found for {year} {event_identifier}. Proceeding without Quali times.")
            qualifying_times = pd.DataFrame({'Abbreviation': results_r['Abbreviation'], 'QualifyingTime (s)': np.nan})
        else:
            # Get best qualifying time (Q1, Q2, or Q3)
            results_q['QualifyingTime'] = results_q[['Q1', 'Q2', 'Q3']].min(axis=1)
            results_q['QualifyingTime (s)'] = results_q['QualifyingTime'].dt.total_seconds()
            qualifying_times = results_q[['Abbreviation', 'QualifyingTime (s)']].copy()
        
        # Merge Race results with Qualifying times
        race_data = pd.merge(results_r[['Abbreviation', 'TeamName', 'FinishingPosition']], 
                             qualifying_times, on='Abbreviation', how='left')
        
        # Add Year and RoundNumber
        race_data['Year'] = year
        race_data['RoundNumber'] = session_r.event['RoundNumber']
        race_data['EventName'] = session_r.event['EventName']
        
        # Calculate Points Index *before* this race
        standings, max_points = get_driver_standings_before_round(year, race_data['RoundNumber'].iloc[0])
        
        if max_points > 0:
            race_data['PointsIndex'] = race_data['Abbreviation'].apply(lambda x: standings.get(x, 0) / max_points)
        else:
            race_data['PointsIndex'] = 0.0
            
        # Rename columns for clarity
        race_data.rename(columns={'Abbreviation': 'Driver', 'TeamName': 'Team'}, inplace=True)
        
        # Handle missing Quali times (e.g., fill with mean/median or a penalty value)
        # Simple fill with mean for now:
        mean_quali_time = race_data['QualifyingTime (s)'].mean()
        race_data['QualifyingTime (s)'].fillna(mean_quali_time, inplace=True)
        # Ensure no NaNs remain in core features
        race_data.dropna(subset=['Driver', 'Team', 'FinishingPosition', 'QualifyingTime (s)', 'PointsIndex'], inplace=True)
        
        return race_data[['Year', 'RoundNumber', 'EventName', 'Driver', 'Team', 'QualifyingTime (s)', 'PointsIndex', 'FinishingPosition']]
        
    except Exception as e:
        print(f"Error processing {year} {event_identifier}: {e}")
        return None

# --- Configuration ---
TARGET_YEAR = 2025
TARGET_GP_NAME = 'Saudi' # Example: 'Monaco', 'British', 'Saudi Arabian Grand Prix'
HISTORICAL_YEARS = range(TARGET_YEAR - 1, 2018, -1) # e.g., 2024 down to 2019

# --- Data Gathering ---
all_race_data = []

# 1. Get Target GP Info
try:
    schedule_target_year = fastf1.get_event_schedule(TARGET_YEAR, include_testing=False)
    target_event = schedule_target_year[schedule_target_year['EventName'].str.contains(TARGET_GP_NAME, case=False, na=False)]
    
    if target_event.empty:
        raise ValueError(f"Target GP '{TARGET_GP_NAME}' not found in {TARGET_YEAR} schedule.")
        
    target_round = target_event['RoundNumber'].iloc[0]
    target_venue = target_event['Location'].iloc[0]
    target_event_name = target_event['EventName'].iloc[0] # Official name for consistency
    print(f"Target Event: {TARGET_YEAR} {target_event_name} (Round {target_round}, Venue: {target_venue})")

except Exception as e:
    raise ValueError(f"Error finding target GP info: {e}")

# 2. Get Data from Current Season (up to target GP)
print(f"\n--- Fetching {TARGET_YEAR} data before Round {target_round} ---")
schedule_current_year_filtered = schedule_target_year[schedule_target_year['RoundNumber'] < target_round]
for index, race in schedule_current_year_filtered.iterrows():
    print(f"Processing {TARGET_YEAR} {race['EventName']}...")
    data = get_race_data(TARGET_YEAR, race['EventName'])
    if data is not None:
        all_race_data.append(data)

# 3. Get Historical Data from the Same Venue
print(f"\n--- Fetching Historical Data for Venue: {target_venue} ---")
for year in HISTORICAL_YEARS:
    try:
        schedule_hist = fastf1.get_event_schedule(year, include_testing=False)
        venue_event = schedule_hist[schedule_hist['Location'] == target_venue]
        if not venue_event.empty:
            event_name_hist = venue_event['EventName'].iloc[0]
            print(f"Processing {year} {event_name_hist} (Venue: {target_venue})...")
            data = get_race_data(year, event_name_hist)
            if data is not None:
                all_race_data.append(data)
        else:
            print(f"No race found at {target_venue} in {year}.")
    except Exception as e:
        print(f"Error getting schedule for {year}: {e}")

# 4. Combine Data
if not all_race_data:
    raise ValueError("No data could be collected. Check FastF1 setup and GP/Year validity.")

combined_data = pd.concat(all_race_data, ignore_index=True)

# Drop the RoundNumber column
if 'RoundNumber' in combined_data.columns:
    combined_data.drop(columns=['RoundNumber'], inplace=True)
    print("Dropped 'RoundNumber' column.")
else:
    print("'RoundNumber' column not found in combined_data.")

print(f"\n--- Combined Data Shape after dropping RoundNumber: {combined_data.shape} ---")
print(combined_data.head())
print(combined_data.info())
print(combined_data.describe())

# Export the combined dataset to CSV
try:
    csv_filename = 'combined_f1_data.csv'
    combined_data.to_csv(csv_filename, index=False)
    print(f"\n--- Full Combined Dataset exported to {csv_filename} ---")
except Exception as e:
    print(f"\nError exporting data to CSV: {e}")

# 5. Prepare Data for Target Race Prediction
print(f"\n--- Preparing data for {TARGET_YEAR} {target_event_name} prediction ---")
try:
    # Get Qualifying data for the actual target race
    target_q_session = fastf1.get_session(TARGET_YEAR, target_event_name, 'Q')
    target_q_session.load(laps=False, telemetry=False, weather=False, messages=False)
    target_q_results = target_q_session.results
    
    if target_q_results is None or target_q_results.empty:
         raise ValueError(f"Could not load Qualifying results for target event: {TARGET_YEAR} {target_event_name}")
         
    target_q_results['QualifyingTime'] = target_q_results[['Q1', 'Q2', 'Q3']].min(axis=1)
    target_q_results['QualifyingTime (s)'] = target_q_results['QualifyingTime'].dt.total_seconds()
    
    # Get points index before the target race
    target_standings, target_max_points = get_driver_standings_before_round(TARGET_YEAR, target_round)
    
    # Create DataFrame for prediction
    predict_df = target_q_results[['Abbreviation', 'TeamName', 'QualifyingTime (s)']].copy()
    predict_df.rename(columns={'Abbreviation': 'Driver', 'TeamName': 'Team'}, inplace=True)
    
    if target_max_points > 0:
         predict_df['PointsIndex'] = predict_df['Driver'].apply(lambda x: target_standings.get(x, 0) / target_max_points)
    else:
         predict_df['PointsIndex'] = 0.0
         
    # Handle potential missing Quali times in prediction set (e.g., pit lane start)
    # Use mean from the combined *training* data to fill
    mean_quali_train = combined_data['QualifyingTime (s)'].mean()
    predict_df['QualifyingTime (s)'].fillna(mean_quali_train, inplace=True)
    
    # Keep track of drivers for final output
    predict_drivers = predict_df['Driver'].tolist()
    
    print(f"Prediction input data shape: {predict_df.shape}")
    print(predict_df)
    
except Exception as e:
    raise ValueError(f"Error preparing prediction data: {e}")

core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
core           INFO 	Loading data for Australian Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Loading data for Australian Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	

Target Event: 2025 Saudi Arabian Grand Prix (Round 5, Venue: Jeddah)

--- Fetching 2025 data before Round 5 ---
Processing 2025 Australian Grand Prix...
Processing 2025 Chinese Grand Prix...


core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
core           INFO 	Loading data for Japanese Grand Prix - Race [v3.5.3]
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
core           INFO 	Loading data for Japanese Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '12',

Processing 2025 Japanese Grand Prix...
Processing 2025 Bahrain Grand Prix...


core           INFO 	Loading data for Bahrain Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['81', '63', '16', '12', '10', '4', '1', '55', '44', '22', '7', '6', '14', '31', '23', '27', '30', '5', '18', '87']
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['81', '63', '16', '12', '10', '4', '1', '55', '44', '22', '7', '6', '14', '31', '23', '27', '30', '5', '18', '87']
core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Loading data for Australian Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cac


--- Fetching Historical Data for Venue: Jeddah ---
Processing 2024 Saudi Arabian Grand Prix (Venue: Jeddah)...
Processing 2023 Saudi Arabian Grand Prix (Venue: Jeddah)...


core           INFO 	Loading data for Saudi Arabian Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['11', '16', '14', '63', '55', '18', '31', '44', '81', '10', '27', '24', '20', '77', '1', '22', '23', '21', '4', '2']
core           INFO 	Finished loading data for 20 drivers: ['11', '16', '14', '63', '55', '18', '31', '44', '81', '10', '27', '24', '20', '77', '1', '22', '23', '21', '4', '2']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using

Processing 2022 Saudi Arabian Grand Prix (Venue: Jeddah)...
Processing 2021 Saudi Arabian Grand Prix (Venue: Jeddah)...


core           INFO 	Finished loading data for 20 drivers: ['44', '77', '33', '16', '11', '10', '4', '22', '31', '99', '3', '7', '14', '63', '55', '6', '5', '18', '47', '9']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '4', '11', '16', '3', '55', '22', '18', '7', '99', '31', '63', '5', '47', '10', '6', '14', '9']
core           INFO 	Finished loading data for 20 drivers: ['44', '33', '77', '4', '11', '16', '3', '55', '22', '18', '7', '99', '31', '63', '5', '47', '10', '6', '14', '9']
core           INFO 	Loading data for Emilia Romagna Grand Prix - Race [v3.5.3]
req            INFO 	Using cac

No race found at Jeddah in 2020.
No race found at Jeddah in 2019.

--- Combined Data Shape: (159, 8) ---
   Year  RoundNumber              EventName Driver             Team  \
0  2025            1  Australian Grand Prix    NOR          McLaren   
1  2025            1  Australian Grand Prix    VER  Red Bull Racing   
2  2025            1  Australian Grand Prix    RUS         Mercedes   
3  2025            1  Australian Grand Prix    ANT         Mercedes   
4  2025            1  Australian Grand Prix    ALB         Williams   

   QualifyingTime (s)  PointsIndex  FinishingPosition  
0              75.096          0.0                  1  
1              75.481          0.0                  2  
2              75.546          0.0                  3  
3              76.525          0.0                  4  
4              75.737          0.0                  5  
<class 'fastf1.core.SessionResults'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 8 columns):
 #   Column              Non-

core           INFO 	Finished loading data for 20 drivers: ['1', '4', '81', '16', '63', '12', '44', '6', '23', '87', '14', '22', '10', '55', '7', '27', '30', '31', '5', '18']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['81', '63', '4', '16', '44', '1', '10', '31', '22', '87', '12', '23', '6', '7', '14', '30', '18', '5', '55', '27']
core           INFO 	Finished loading data for 20 drivers: ['81', '63', '4', '16', '44', '1', '10', '31', '22', '87', '12', '23', '6', '7', '14', '30', '18', '5', '55', '27']


Prediction input data shape: (20, 4)
   Driver             Team  QualifyingTime (s)  PointsIndex
1     VER  Red Bull Racing              87.294     0.828947
81    PIA          McLaren              87.304     0.881579
63    RUS         Mercedes              87.407     0.763158
16    LEC          Ferrari              87.670     0.368421
12    ANT         Mercedes              87.798     0.368421
55    SAI         Williams              88.024     0.013158
44    HAM          Ferrari              88.102     0.223684
22    TSU  Red Bull Racing              87.990     0.026316
10    GAS           Alpine              88.025     0.078947
4     NOR          McLaren              87.481     1.000000
23    ALB         Williams              88.109     0.236842
30    LAW     Racing Bulls              88.191     0.000000
14    ALO     Aston Martin              88.303     0.000000
6     HAD     Racing Bulls              88.418     0.052632
87    BEA     Haas F1 Team              88.536     0.078947
18 

In [9]:
# --- Feature Engineering & Model Training ---

# Define features and target
features = ['QualifyingTime (s)', 'PointsIndex', 'Driver', 'Team']
target = 'FinishingPosition'

# Separate features (X) and target (y) from combined data
X = combined_data[features].copy()
y = combined_data[target].copy()

# Handle Categorical Features (Driver, Team) using One-Hot Encoding
categorical_features = ['Driver', 'Team']
X_categorical = X[categorical_features]
X_numerical = X.drop(columns=categorical_features)

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # handle_unknown='ignore' is crucial for new drivers/teams in prediction set
X_encoded = encoder.fit_transform(X_categorical)

# Create DataFrame from encoded features
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_features))

# Combine numerical and encoded categorical features for training data
X_processed = pd.concat([X_numerical.reset_index(drop=True), X_encoded_df.reset_index(drop=True)], axis=1)

print(f"\n--- Processed Training Data Shape: {X_processed.shape} ---")
print(X_processed.head())

# --- Prepare Prediction Data (Apply same encoding) ---
predict_X_categorical = predict_df[categorical_features]
predict_X_numerical = predict_df.drop(columns=categorical_features)

# Use the *fitted* encoder to transform the prediction data
predict_X_encoded = encoder.transform(predict_X_categorical)
predict_X_encoded_df = pd.DataFrame(predict_X_encoded, columns=encoder.get_feature_names_out(categorical_features))

# Combine numerical and encoded categorical features for prediction data
predict_X_processed = pd.concat([predict_X_numerical.reset_index(drop=True), predict_X_encoded_df.reset_index(drop=True)], axis=1)

# Ensure columns match exactly between training and prediction sets
# (This handles cases where a driver/team was only in the prediction set or vice-versa due to handle_unknown='ignore')
train_cols = set(X_processed.columns)
predict_cols = set(predict_X_processed.columns)

# Add missing columns to predict_X_processed (filled with 0)
for col in train_cols - predict_cols:
    predict_X_processed[col] = 0

# Remove extra columns from predict_X_processed (shouldn't happen with handle_unknown='ignore')
for col in predict_cols - train_cols:
    predict_X_processed.drop(columns=[col], inplace=True)

# Reorder prediction columns to match training columns
predict_X_processed = predict_X_processed[X_processed.columns]

print(f"\n--- Processed Prediction Data Shape: {predict_X_processed.shape} ---")
print(predict_X_processed.head())

# --- Train Gradient Boosting Model ---
# Split the historical/current season data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=4, random_state=42)
model.fit(X_train, y_train)

# --- Predict Finishing Positions for Target Race ---
predicted_positions = model.predict(predict_X_processed)

# Create results DataFrame
results_df = pd.DataFrame({'Driver': predict_drivers, 'PredictedFinishingPosition_Raw': predicted_positions})

# Round predictions and convert to integer rank
# Note: Simple rounding might produce duplicate ranks. A more robust ranking could be used.
results_df['PredictedFinishingPosition'] = results_df['PredictedFinishingPosition_Raw'].round().astype(int)
# Ensure positions are at least 1
results_df['PredictedFinishingPosition'] = results_df['PredictedFinishingPosition'].clip(lower=1)
# Sort by predicted position
results_df = results_df.sort_values(by='PredictedFinishingPosition')

# Print final predictions
print(f"\n🏁 Predicted Finishing Order for {TARGET_YEAR} {target_event_name} 🏁\n")
print(results_df[['Driver', 'PredictedFinishingPosition']])

# --- Evaluate Model on Test Set ---
y_pred_test = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_test)
print(f"\n🔍 Model Evaluation (MAE on test set): {mae:.2f} positions")

# --- Feature Importance ---
print("\n📊 Feature Importances:")
try:
    feature_names = X_processed.columns
    importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    # Display top N features or all features
    with pd.option_context('display.max_rows', None): # Ensure all rows are printed
        print(importance_df)
except Exception as e:
    print(f"Could not calculate or display feature importances: {e}")


--- Processed Training Data Shape: (159, 49) ---
   QualifyingTime (s)  PointsIndex  Driver_ALB  Driver_ALO  Driver_ANT  \
0              75.096          0.0         0.0         0.0         0.0   
1              75.481          0.0         0.0         0.0         0.0   
2              75.546          0.0         0.0         0.0         0.0   
3              76.525          0.0         0.0         0.0         1.0   
4              75.737          0.0         1.0         0.0         0.0   

   Driver_BEA  Driver_BOR  Driver_BOT  Driver_DEV  Driver_DOO  ...  \
0         0.0         0.0         0.0         0.0         0.0  ...   
1         0.0         0.0         0.0         0.0         0.0  ...   
2         0.0         0.0         0.0         0.0         0.0  ...   
3         0.0         0.0         0.0         0.0         0.0  ...   
4         0.0         0.0         0.0         0.0         0.0  ...   

   Team_Aston Martin  Team_Ferrari  Team_Haas F1 Team  Team_Kick Sauber  \
0        