In [None]:
import os
import time  # Adding time module for implementing delays
import warnings

import fastf1
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Suppress specific warnings from FastF1 and others if needed
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Add rate limiting for API calls
# Pause applied after every wrapped call; kept slightly above the 0.25s
# spacing this notebook aims to respect.
_API_CALL_DELAY_SECONDS = 0.3


def api_call_with_rate_limit(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and pause briefly afterwards.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.

    Raises:
        Any exception raised by ``func`` is propagated unchanged (after the
        pause has elapsed).
    """
    try:
        return func(*args, **kwargs)
    finally:
        # Sleep in ``finally`` so that failing calls are throttled as well;
        # otherwise a loop that catches the error and moves on would issue
        # its next request immediately.
        time.sleep(_API_CALL_DELAY_SECONDS)

# Enable FastF1 caching
CACHE_DIR = "f1_cache"
try:
    # FastF1 refuses to enable a cache in a directory that does not exist,
    # so create it first (a no-op when it is already there).
    os.makedirs(CACHE_DIR, exist_ok=True)
    fastf1.Cache.enable_cache(CACHE_DIR)
except Exception as e:
    # Best effort: the notebook can still run without the explicit cache.
    print(f"Error enabling cache: {e}. Check cache directory permissions.")

# Hand-assigned driver ratings, built once at definition time instead of on
# every lookup. Values are normalized between 0-1 where appropriate.
_DRIVER_CHARACTERISTICS = {
    # Driver Code: {features}
    # Current drivers (2025 season)
    'VER': {'WetWeatherSkill': 0.95, 'QualifyingPace': 0.95, 'RaceCraft': 0.95, 'Consistency': 0.90, 'Aggression': 0.90, 'TireManagement': 0.85},
    'NOR': {'WetWeatherSkill': 0.85, 'QualifyingPace': 0.92, 'RaceCraft': 0.88, 'Consistency': 0.88, 'Aggression': 0.75, 'TireManagement': 0.87},
    'PIA': {'WetWeatherSkill': 0.82, 'QualifyingPace': 0.88, 'RaceCraft': 0.84, 'Consistency': 0.85, 'Aggression': 0.78, 'TireManagement': 0.85},
    'LEC': {'WetWeatherSkill': 0.82, 'QualifyingPace': 0.95, 'RaceCraft': 0.85, 'Consistency': 0.80, 'Aggression': 0.85, 'TireManagement': 0.80},
    'HAM': {'WetWeatherSkill': 0.95, 'QualifyingPace': 0.90, 'RaceCraft': 0.95, 'Consistency': 0.92, 'Aggression': 0.80, 'TireManagement': 0.90},
    'RUS': {'WetWeatherSkill': 0.80, 'QualifyingPace': 0.90, 'RaceCraft': 0.85, 'Consistency': 0.85, 'Aggression': 0.83, 'TireManagement': 0.82},
    'ANT': {'WetWeatherSkill': 0.78, 'QualifyingPace': 0.85, 'RaceCraft': 0.78, 'Consistency': 0.76, 'Aggression': 0.85, 'TireManagement': 0.75},
    'SAI': {'WetWeatherSkill': 0.84, 'QualifyingPace': 0.88, 'RaceCraft': 0.87, 'Consistency': 0.87, 'Aggression': 0.82, 'TireManagement': 0.85},
    'ALO': {'WetWeatherSkill': 0.92, 'QualifyingPace': 0.90, 'RaceCraft': 0.95, 'Consistency': 0.90, 'Aggression': 0.88, 'TireManagement': 0.90},
    'STR': {'WetWeatherSkill': 0.85, 'QualifyingPace': 0.75, 'RaceCraft': 0.80, 'Consistency': 0.75, 'Aggression': 0.75, 'TireManagement': 0.78},
    'TSU': {'WetWeatherSkill': 0.78, 'QualifyingPace': 0.85, 'RaceCraft': 0.82, 'Consistency': 0.75, 'Aggression': 0.90, 'TireManagement': 0.78},
    'HAD': {'WetWeatherSkill': 0.75, 'QualifyingPace': 0.80, 'RaceCraft': 0.78, 'Consistency': 0.74, 'Aggression': 0.85, 'TireManagement': 0.74},
    'ALB': {'WetWeatherSkill': 0.80, 'QualifyingPace': 0.85, 'RaceCraft': 0.84, 'Consistency': 0.82, 'Aggression': 0.75, 'TireManagement': 0.83},
    'GAS': {'WetWeatherSkill': 0.83, 'QualifyingPace': 0.84, 'RaceCraft': 0.85, 'Consistency': 0.83, 'Aggression': 0.80, 'TireManagement': 0.82},
    'OCO': {'WetWeatherSkill': 0.80, 'QualifyingPace': 0.82, 'RaceCraft': 0.83, 'Consistency': 0.80, 'Aggression': 0.85, 'TireManagement': 0.78},
    'HUL': {'WetWeatherSkill': 0.85, 'QualifyingPace': 0.82, 'RaceCraft': 0.80, 'Consistency': 0.83, 'Aggression': 0.75, 'TireManagement': 0.80},
    'BOR': {'WetWeatherSkill': 0.74, 'QualifyingPace': 0.76, 'RaceCraft': 0.76, 'Consistency': 0.74, 'Aggression': 0.82, 'TireManagement': 0.75},
    'DOO': {'WetWeatherSkill': 0.74, 'QualifyingPace': 0.78, 'RaceCraft': 0.75, 'Consistency': 0.73, 'Aggression': 0.85, 'TireManagement': 0.74},
    'BEA': {'WetWeatherSkill': 0.75, 'QualifyingPace': 0.78, 'RaceCraft': 0.77, 'Consistency': 0.75, 'Aggression': 0.80, 'TireManagement': 0.76},
    'LAW': {'WetWeatherSkill': 0.76, 'QualifyingPace': 0.79, 'RaceCraft': 0.78, 'Consistency': 0.75, 'Aggression': 0.85, 'TireManagement': 0.75},
    # Historical drivers (for reference)
    # Numeric string keys (presumably car numbers - TODO confirm); their values
    # duplicate the HAM, VER, LEC and SAI entries above.
    '44': {'WetWeatherSkill': 0.95, 'QualifyingPace': 0.90, 'RaceCraft': 0.95, 'Consistency': 0.92, 'Aggression': 0.80, 'TireManagement': 0.90},
    '33': {'WetWeatherSkill': 0.95, 'QualifyingPace': 0.95, 'RaceCraft': 0.95, 'Consistency': 0.90, 'Aggression': 0.90, 'TireManagement': 0.85},
    '16': {'WetWeatherSkill': 0.82, 'QualifyingPace': 0.95, 'RaceCraft': 0.85, 'Consistency': 0.80, 'Aggression': 0.85, 'TireManagement': 0.80},
    '55': {'WetWeatherSkill': 0.84, 'QualifyingPace': 0.88, 'RaceCraft': 0.87, 'Consistency': 0.87, 'Aggression': 0.82, 'TireManagement': 0.85},
}

# Ratings used for any code that is not in the table above
_DEFAULT_DRIVER_FEATURES = {
    'WetWeatherSkill': 0.75,
    'QualifyingPace': 0.75,
    'RaceCraft': 0.75,
    'Consistency': 0.75,
    'Aggression': 0.75,
    'TireManagement': 0.75,
}

# Normalized codes that have already triggered the "not found" notice, so the
# same unknown driver is reported once instead of once per result row.
_UNKNOWN_DRIVERS_REPORTED = set()


def get_driver_features(driver_code):
    """Return driver-specific features based on driver code.

    Args:
        driver_code: Driver identifier. It is converted with ``str()``,
            stripped of surrounding whitespace and upper-cased before the
            lookup, so ``'ver'``, ``' VER '`` and ``44`` (int) all resolve.

    Returns:
        dict: A fresh dict mapping the six feature names
        (``WetWeatherSkill``, ``QualifyingPace``, ``RaceCraft``,
        ``Consistency``, ``Aggression``, ``TireManagement``) to floats.
        Unknown codes get the default ratings (0.75 each). The dict is a
        copy, so callers may modify it without affecting later lookups.

    Side effects:
        Prints a notice the first time a given unknown code is seen.
    """
    lookup_key = str(driver_code).strip().upper()

    ratings = _DRIVER_CHARACTERISTICS.get(lookup_key)
    if ratings is not None:
        return dict(ratings)

    if lookup_key not in _UNKNOWN_DRIVERS_REPORTED:
        _UNKNOWN_DRIVERS_REPORTED.add(lookup_key)
        print(f"Driver characteristics not found for '{driver_code}'. Using default values.")
    return dict(_DEFAULT_DRIVER_FEATURES)

# Per-race points memo, keyed by (year, event name). get_standings_before_round
# is called once per race, and each call needs every earlier race of that
# season; without the memo the same sessions would be loaded again and again.
_RACE_POINTS_CACHE = {}


def _get_race_points(year, event_name):
    """Return ``[(driver, team, points), ...]`` for one race session ('R')."""
    cache_key = (year, event_name)
    if cache_key in _RACE_POINTS_CACHE:
        return _RACE_POINTS_CACHE[cache_key]

    session = api_call_with_rate_limit(fastf1.get_session, year, event_name, 'R')
    session.load(laps=False, telemetry=False, weather=False, messages=False)
    results = session.results
    if results is None or results.empty:
        # Nothing usable; do not memoise so a later call can try again.
        return []

    # Missing/non-numeric points count as 0 so one bad cell cannot turn a
    # driver's (and team's) running total into NaN.
    points = pd.to_numeric(results['Points'], errors='coerce').fillna(0.0)
    rows = [
        (driver, team, float(pts))
        for driver, team, pts in zip(results['Abbreviation'], results['TeamName'], points)
    ]
    _RACE_POINTS_CACHE[cache_key] = rows
    return rows


def get_standings_before_round(year, target_round):
    """Calculate driver and constructor points standings *before* a specific round.

    Points are summed over the race ('R') session results of every event in
    the season schedule whose ``RoundNumber`` is lower than ``target_round``.
    NOTE(review): only the 'R' session is read, so points awarded in any other
    session are not included.

    Args:
        year: Season to read the schedule and results for.
        target_round: Round number; only earlier rounds are counted.

    Returns:
        tuple: ``(driver_standings, max_driver_points, constructor_standings,
        max_constructor_points)`` where the standings are dicts keyed by
        driver abbreviation / team name and the max values are the highest
        totals in each dict (0 when the dict is empty). Returns
        ``({}, 0, {}, 0)`` when there are no earlier rounds or when the
        schedule cannot be retrieved.

    Side effects:
        Prints a message when the schedule or an individual race cannot be
        loaded; a race that fails to load is skipped.
    """
    driver_standings = {}
    constructor_standings = {}
    try:
        schedule = api_call_with_rate_limit(fastf1.get_event_schedule, year, include_testing=False)
        races_before = schedule[schedule['RoundNumber'] < target_round]

        if races_before.empty:
            return {}, 0, {}, 0  # No races before this one

        for event_name in races_before['EventName']:
            try:
                race_points = _get_race_points(year, event_name)
            except Exception as e:
                # Best effort: keep going, but say so - the totals will be
                # missing this race.
                print(f"  Warning: could not load {year} {event_name} results for standings: {e}")
                continue

            for driver, team, points in race_points:
                driver_standings[driver] = driver_standings.get(driver, 0) + points
                constructor_standings[team] = constructor_standings.get(team, 0) + points

    except Exception as e:
        print(f"Error calculating standings for {year} before round {target_round}: {e}")
        return {}, 0, {}, 0

    max_driver_points = max(driver_standings.values(), default=0)
    max_constructor_points = max(constructor_standings.values(), default=0)
    return driver_standings, max_driver_points, constructor_standings, max_constructor_points

def get_race_data(year, event_identifier):
    """Get Race and Qualifying data for a specific event.

    Loads the race ('R') and qualifying ('Q') session results, merges them per
    driver and adds the driver ratings plus the driver/constructor points
    indices computed from the standings before this round.

    Args:
        year: Season of the event.
        event_identifier: Event name (or other identifier accepted by
            ``fastf1.get_session``).

    Returns:
        pandas.DataFrame or None: One row per driver with columns ``Year``,
        ``EventName``, ``Driver``, ``Team``, ``QualifyingTime (s)``,
        ``PointsIndex``, ``ConstructorPointsIndex``, ``GridPosition``,
        ``FinishingPosition`` and the driver feature columns. ``None`` when
        the race has no usable results or when any step raises. The frame can
        have zero rows: when no qualifying time is available for anyone, the
        fill value is NaN and every row is removed by the final ``dropna``.

    Side effects:
        Prints a message when data is missing or an error occurs.
    """
    try:
        session_r = api_call_with_rate_limit(fastf1.get_session, year, event_identifier, 'R')
        session_r.load(laps=False, telemetry=False, weather=False, messages=False)
        results_r = session_r.results

        if results_r is None or results_r.empty:
            print(f"No race results found for {year} {event_identifier}. Skipping.")
            return None

        # Build a separate frame instead of adding columns to session.results,
        # so the session object is left untouched.
        race_results = pd.DataFrame({
            'Abbreviation': results_r['Abbreviation'],
            'TeamName': results_r['TeamName'],
            'FinishingPosition': pd.to_numeric(results_r['Position'], errors='coerce'),
            'GridPosition': pd.to_numeric(results_r['GridPosition'], errors='coerce'),
        })
        # Ensure both positions are valid before casting to int
        race_results = race_results.dropna(subset=['FinishingPosition', 'GridPosition'])
        if race_results.empty:
            print(f"No race results with valid positions for {year} {event_identifier}. Skipping.")
            return None
        race_results = race_results.astype({'FinishingPosition': int, 'GridPosition': int})

        session_q = api_call_with_rate_limit(fastf1.get_session, year, event_identifier, 'Q')
        session_q.load(laps=False, telemetry=False, weather=False, messages=False)
        results_q = session_q.results

        if results_q is None or results_q.empty:
            print(f"No qualifying results found for {year} {event_identifier}. Proceeding without Quali times.")
            qualifying_times = pd.DataFrame({'Abbreviation': race_results['Abbreviation'], 'QualifyingTime (s)': np.nan})
        else:
            # Best lap across the three qualifying segments, in seconds
            best_lap = results_q[['Q1', 'Q2', 'Q3']].min(axis=1)
            qualifying_times = pd.DataFrame({
                'Abbreviation': results_q['Abbreviation'],
                'QualifyingTime (s)': best_lap.dt.total_seconds(),
            })

        race_data = pd.merge(race_results, qualifying_times, on='Abbreviation', how='left')

        round_number = session_r.event['RoundNumber']
        race_data['Year'] = year
        race_data['RoundNumber'] = round_number
        race_data['EventName'] = session_r.event['EventName']

        # Add driver characteristics: one ratings dict per row, turned into
        # columns in a single index-aligned concat.
        feature_rows = [get_driver_features(code) for code in race_data['Abbreviation']]
        feature_frame = pd.DataFrame(feature_rows, index=race_data.index)
        driver_feature_keys = list(feature_frame.columns)
        race_data = pd.concat([race_data, feature_frame], axis=1)

        # Calculate Points Index *before* this race
        driver_standings, max_driver_points, constructor_standings, max_constructor_points = (
            get_standings_before_round(year, round_number)
        )

        # Driver Points Index: share of the leading driver's points
        if max_driver_points > 0:
            race_data['PointsIndex'] = race_data['Abbreviation'].map(
                lambda code: driver_standings.get(code, 0) / max_driver_points
            )
        else:
            race_data['PointsIndex'] = 0.0

        # Constructor Points Index: share of the leading team's points
        if max_constructor_points > 0:
            race_data['ConstructorPointsIndex'] = race_data['TeamName'].map(
                lambda team: constructor_standings.get(team, 0) / max_constructor_points
            )
        else:
            race_data['ConstructorPointsIndex'] = 0.0

        race_data = race_data.rename(columns={'Abbreviation': 'Driver', 'TeamName': 'Team'})

        # Fill missing qualifying times with this race's mean. Assign the
        # result back: an in-place fillna on a column selection does not
        # update the frame when pandas Copy-on-Write is active.
        mean_quali_time = race_data['QualifyingTime (s)'].mean()
        race_data['QualifyingTime (s)'] = race_data['QualifyingTime (s)'].fillna(mean_quali_time)

        # Ensure no NaNs remain in core features
        core_features = ['Driver', 'Team', 'FinishingPosition', 'QualifyingTime (s)',
                         'PointsIndex', 'ConstructorPointsIndex', 'GridPosition']
        core_features.extend(driver_feature_keys)
        race_data = race_data.dropna(subset=core_features)

        # Include driver features in the returned columns
        return_columns = ['Year', 'EventName', 'Driver', 'Team', 'QualifyingTime (s)',
                          'PointsIndex', 'ConstructorPointsIndex', 'GridPosition', 'FinishingPosition']
        return_columns.extend(driver_feature_keys)

        return race_data[return_columns]

    except Exception as e:
        print(f"Error processing {year} {event_identifier}: {e}")
        return None

# --- Configuration ---
TARGET_YEAR = 2025
TARGET_GP_NAME = 'Japanese'
FIRST_HISTORICAL_YEAR = 2018  # earliest season used as training data (inclusive)
# Seasons before the target year, newest first:
# TARGET_YEAR - 1 down to FIRST_HISTORICAL_YEAR.
HISTORICAL_YEARS = range(TARGET_YEAR - 1, FIRST_HISTORICAL_YEAR - 1, -1)

# --- Data Gathering ---
all_race_data = []

# 1. Get Target GP Info
try:
    schedule_target_year = api_call_with_rate_limit(fastf1.get_event_schedule, TARGET_YEAR, include_testing=False)
    # Plain, case-insensitive substring match. regex=False keeps characters
    # such as '(' or '.' in TARGET_GP_NAME from being read as a pattern.
    name_matches = schedule_target_year['EventName'].str.contains(
        TARGET_GP_NAME, case=False, na=False, regex=False
    )
    target_event = schedule_target_year[name_matches]

    if target_event.empty:
        raise ValueError(f"Target GP '{TARGET_GP_NAME}' not found in {TARGET_YEAR} schedule.")

    # When several events match, the first one in schedule order is used.
    target_round = target_event['RoundNumber'].iloc[0]
    target_venue = target_event['Location'].iloc[0]
    target_event_name = target_event['EventName'].iloc[0]  # Official name for consistency
    print(f"Target Event: {TARGET_YEAR} {target_event_name} (Round {target_round}, Venue: {target_venue})")

except Exception as e:
    # Stop the notebook here: everything below needs the target event.
    raise ValueError(f"Error finding target GP info: {e}") from e

# 2. Get Data from Current Season (up to target GP)
# Every round of the target season that comes before the target round is
# fetched and, when usable, added to the training pool.
print(f"\n--- Fetching {TARGET_YEAR} data before Round {target_round} ---")
is_earlier_round = schedule_target_year['RoundNumber'] < target_round
schedule_current_year_filtered = schedule_target_year[is_earlier_round]
for index, race in schedule_current_year_filtered.iterrows():
    current_event_name = race['EventName']
    print(f"Processing {TARGET_YEAR} {current_event_name}...")
    data = get_race_data(TARGET_YEAR, current_event_name)
    if data is None:
        # get_race_data already reported why this event was skipped
        continue
    all_race_data.append(data)

# 3. Get Historical Data
# Walk through each past season in HISTORICAL_YEARS and fetch every event on
# its schedule. A failure while handling one season is reported and the loop
# moves on to the next season; races already appended are kept.
print(f"\n--- Fetching Historical Data ---")
for year in HISTORICAL_YEARS:
    try:
        schedule_hist = api_call_with_rate_limit(
            fastf1.get_event_schedule, year, include_testing=False
        )
        print(f"Processing all races for {year}...")
        for index, race_hist in schedule_hist.iterrows():
            event_name_hist = race_hist['EventName']
            print(f"  Processing {year} {event_name_hist}...")
            data = get_race_data(year, event_name_hist)
            if data is None:
                continue
            all_race_data.append(data)
    except Exception as e:
        print(f"Error getting schedule or processing races for {year}: {e}")

# 4. Combine Data
# get_race_data may hand back frames with zero rows (everything removed by its
# final dropna); leave those out so "no data" is detected reliably and concat
# only sees frames that contribute rows.
non_empty_race_data = [frame for frame in all_race_data if frame is not None and not frame.empty]
if not non_empty_race_data:
    raise ValueError("No data could be collected. Check FastF1 setup and GP/Year validity.")

combined_data = pd.concat(non_empty_race_data, ignore_index=True)

# Drop the RoundNumber column
if 'RoundNumber' in combined_data.columns:
    combined_data = combined_data.drop(columns=['RoundNumber'])
    print("Dropped 'RoundNumber' column.")
else:
    print("'RoundNumber' column not found in combined_data.")

print(f"\n--- Combined Data Shape after dropping RoundNumber: {combined_data.shape} ---")
print(combined_data.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line).
combined_data.info()
print(combined_data.describe())

# Export the combined dataset to CSV
try:
    csv_filename = 'combined_f1_data.csv'
    combined_data.to_csv(csv_filename, index=False)
    print(f"\n--- Full Combined Dataset exported to {csv_filename} ---")
except Exception as e:
    # Export is a convenience; a failure here should not stop the analysis.
    print(f"\nError exporting data to CSV: {e}")

# 5. Prepare Data for Target Race Prediction
# Builds predict_df (one row per driver in the target qualifying session, with
# the same feature columns as the training data) and predict_drivers (driver
# codes in the same row order).
print(f"\n--- Preparing data for {TARGET_YEAR} {target_event_name} prediction ---")
try:
    # Get Qualifying data for the actual target race
    target_q_session = api_call_with_rate_limit(fastf1.get_session, TARGET_YEAR, target_event_name, 'Q')
    target_q_session.load(laps=False, telemetry=False, weather=False, messages=False)
    target_q_results = target_q_session.results

    if target_q_results is None or target_q_results.empty:
        raise ValueError(f"Could not load Qualifying results for target event: {TARGET_YEAR} {target_event_name}")

    # Best lap across the three qualifying segments, then converted to seconds
    target_q_results['QualifyingTime'] = target_q_results[['Q1', 'Q2', 'Q3']].min(axis=1)
    target_q_results['QualifyingTime (s)'] = target_q_results['QualifyingTime'].dt.total_seconds()

    # Get points index before the target race
    (target_driver_standings, target_max_driver_points,
     target_constructor_standings, target_max_constructor_points) = get_standings_before_round(TARGET_YEAR, target_round)

    # Create DataFrame for prediction
    predict_df = target_q_results[['Abbreviation', 'TeamName', 'QualifyingTime (s)']].copy()
    predict_df = predict_df.rename(columns={'Abbreviation': 'Driver', 'TeamName': 'Team'})

    # Calculate Driver Points Index for prediction set
    if target_max_driver_points > 0:
        predict_df['PointsIndex'] = predict_df['Driver'].map(
            lambda code: target_driver_standings.get(code, 0) / target_max_driver_points
        )
    else:
        predict_df['PointsIndex'] = 0.0

    # Calculate Constructor Points Index for prediction set
    if target_max_constructor_points > 0:
        predict_df['ConstructorPointsIndex'] = predict_df['Team'].map(
            lambda team: target_constructor_standings.get(team, 0) / target_max_constructor_points
        )
    else:
        predict_df['ConstructorPointsIndex'] = 0.0

    # Handle potential missing Quali times in prediction set (e.g., pit lane start)
    # Use mean from the combined *training* data to fill.
    # NOTE(review): that mean spans every event in combined_data, while
    # get_race_data fills gaps with the mean of the single race - confirm the
    # cross-event mean is the intended fill value here.
    # The result is assigned back because an in-place fillna on a column
    # selection does not update the frame when pandas Copy-on-Write is active.
    mean_quali_train = combined_data['QualifyingTime (s)'].mean()
    predict_df['QualifyingTime (s)'] = predict_df['QualifyingTime (s)'].fillna(mean_quali_train)

    # Sort the qualifying results by time to determine grid positions.
    # A stable sort keeps the original order for drivers that share the same
    # (filled-in) time.
    predict_df = predict_df.sort_values('QualifyingTime (s)', kind='mergesort')
    predict_df['GridPosition'] = range(1, len(predict_df) + 1)

    # Add driver features to the prediction dataframe for the target race:
    # one ratings dict per row, joined on the (shared) index.
    feature_rows = [get_driver_features(code) for code in predict_df['Driver']]
    driver_feature_frame = pd.DataFrame(feature_rows, index=predict_df.index)
    predict_df = predict_df.join(driver_feature_frame)

    # Ensure no NaNs in prediction features
    predict_df = predict_df.dropna(subset=['QualifyingTime (s)', 'PointsIndex', 'ConstructorPointsIndex'])

    # Keep track of drivers for final output
    predict_drivers = predict_df['Driver'].tolist()

    print(f"Prediction input data shape: {predict_df.shape}")
    print(predict_df)

except Exception as e:
    # Stop the notebook here: the model cell needs predict_df.
    raise ValueError(f"Error preparing prediction data: {e}") from e

In [None]:


# Columns fed to the model and the column it learns to predict.
features = [
    'GridPosition', 'QualifyingTime (s)', 'PointsIndex', 'ConstructorPointsIndex',
    'QualifyingPace', 'RaceCraft', 'Consistency', 'Aggression', 'TireManagement',
]
target = 'FinishingPosition'

# Training matrix / target vector from the historical data, and the feature
# matrix for the race to be predicted.
X = combined_data.loc[:, features].copy()
y = combined_data.loc[:, target].copy()
predict_X = predict_df.loc[:, features].copy()

print(predict_X.head())

# Hold out 20% of the historical rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model.fit(X_train, y_train)

predicted_scores = model.predict(predict_X)

# Create Ranked Results
# Pair each driver with the raw predicted score (lower score is better), order
# by score, then number the rows 1, 2, 3... in that order.
results_df = pd.DataFrame(
    {'Driver': predict_drivers, 'PredictedScore': predicted_scores}
).sort_values(by='PredictedScore')
results_df['PredictedFinishingPosition'] = np.arange(1, len(results_df) + 1)

print(f"\n🏁 Predicted Finishing Order for {TARGET_YEAR} {target_event_name} 🏁\n")
print(results_df[['Driver', 'PredictedFinishingPosition']].to_string(index=False))

# --- Evaluate Model on Test Set ---
print("\n--- Evaluating Model Performance ---")
y_pred_test = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_test)
print(f"🔍 Model Evaluation (Mean Absolute Error on test set): {mae:.2f} positions")
print("(Lower MAE is better. This indicates the average error in predicted position on unseen historical data)")

# --- Feature Importance ---
print("\nFeature Importances:")
try:
    feature_names = X.columns
    importances = model.feature_importances_
    importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(importance_df.to_string(index=False))
except Exception as e:
    print(f"Could not calculate or display feature importances: {e}")