# FastF1 Data Processing

In [1]:
#%pip install fastf1 pandas scikit-learn torch
import fastf1
import pandas as pd
import torch

In [6]:
# event_schedule -> event -> session -> laps -> telemetry
def process_event(event, year):
    """process_event builds the per-lap race table for a single event.

    event: event object exposing get_qualifying(), get_race() and EventName
    year: value written verbatim into the "Year" column

    Loads the qualifying and race sessions, joins the per-lap weather data onto
    the race laps, keeps a fixed subset of columns and adds:
      - "FastestQualifyingLapTime": the driver's fastest qualifying lap in
        milliseconds (NaN when no qualifying lap is available for the driver)
      - "Event": the event name
      - "Year": the given year

    "LapTime" is returned unchanged (not converted to milliseconds).

    Returns a Pandas dataframe with one row per race lap
    """
    qualifying = event.get_qualifying()
    qualifying.load()

    fastest_qualifying_lap_times = {}
    for driver in pd.unique(qualifying.results["Abbreviation"]):
        # pick_drivers() is the replacement for the deprecated pick_driver().
        fastest_lap = qualifying.laps.pick_drivers(driver).pick_fastest()
        # A driver may have no usable qualifying lap; record a missing value
        # instead of failing on `.LapTime`.
        fastest_qualifying_lap_times[driver] = (
            pd.NaT if fastest_lap is None else fastest_lap.LapTime
        )

    race = event.get_race()
    race.load()

    # Both frames get a fresh 0..N-1 index so they can be joined column-wise.
    race_laps = race.laps.reset_index(drop=True)
    weather_data = race.laps.get_weather_data().reset_index(drop=True)
    race_laps = pd.concat(
        [race_laps, weather_data.drop(columns="Time", errors="ignore")],
        axis=1,
    )

    # .copy() so the columns added below are written to an independent frame.
    race_laps = race_laps[[
        "LapTime",
        "Driver",
        "LapNumber", # TODO does this need to be normalized
        "Stint",
        "Compound",
        "TyreLife",
        "Team",
        "TrackStatus",
        "Position",
        "Rainfall",
        "AirTemp",
        "TrackTemp"
    ]].copy()

    # Qualifying results: map() leaves drivers without a qualifying entry as
    # missing values rather than raising KeyError.
    qualifying_times = race_laps["Driver"].map(fastest_qualifying_lap_times)
    race_laps["FastestQualifyingLapTime"] = (
        pd.to_timedelta(qualifying_times).dt.total_seconds() * 1000
    )

    # Circuit and year
    race_laps["Event"] = event.EventName
    race_laps["Year"] = year

    return race_laps


# Run the pipeline on a single race (Bahrain 2022) and write the result to CSV.
event = fastf1.get_event(2022, 'Bahrain')
data = process_event(event, year=2022)
data.to_csv(path_or_buf="bahrain-2022.csv")


core           INFO 	Loading data for Bahrain Grand Prix - Qualifying [v3.3.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['16', '1', '55', '11', '44', '77', '20', '14', '63', '10', '31', '47', '4', '23', '24', '22', '27', '3', '18', '6']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.3.0]
req            INFO 	Using cached da

In [8]:
# Collect the lap tables of every race round from 2019 through 2023.
season_frames = []

for year in range(2019, 2024):
    event_schedule = fastf1.get_event_schedule(year, include_testing=False)

    event_round = 0
    while True:
        event_round += 1

        try:
            event = event_schedule.get_event_by_round(event_round)
        except ValueError:
            # Raised once the round number runs past the end of the schedule.
            break

        try:
            season_frames.append(process_event(event, year))
        except Exception as exc:
            # Best effort: report the failing event and carry on with the next
            # round instead of silently dropping the rest of the season.
            print(f"Skipping {year} round {event_round}: {exc!r}")

# Concatenate once at the end; growing a frame inside the loop copies it
# again on every iteration.
data = pd.concat(season_frames) if season_frames else pd.DataFrame()

event_data = data

# Process time data before saving as CSV. Only columns that still hold
# timedeltas are converted, so a column that is already in milliseconds is
# neither converted twice nor fails on the `.dt` accessor.
for time_column in ("LapTime", "FastestQualifyingLapTime"):
    if time_column in event_data.columns and pd.api.types.is_timedelta64_dtype(event_data[time_column]):
        event_data[time_column] = event_data[time_column].dt.total_seconds() * 1000

event_data.to_csv("event-data.csv")

In [28]:
# Reload the raw lap table; the first CSV column is dropped by position.
event_data = pd.read_csv("event-data.csv").iloc[:, 1:]
processed_data = event_data

# Keep the raw driver label: the one-hot encoding below removes "Driver".
processed_data["Driver_Orig"] = processed_data["Driver"]

# One-hot encode driver, team, position and compound, one column at a time so
# each set of indicator columns is appended in this order.
for categorical in ("Driver", "Team", "Position", "Compound"):
    processed_data = pd.get_dummies(processed_data, columns=[categorical], dtype=int)

# Keep the raw event label, then one-hot encode the event.
processed_data["Event_Orig"] = processed_data["Event"]
processed_data = pd.get_dummies(processed_data, columns=["Event"], dtype=int)

# Convert Rainfall boolean to int
processed_data["Rainfall"] = processed_data["Rainfall"].astype(int)

# Encode track status
def track_status_apply(row):
    """track_status_apply converts a row's "TrackStatus" value into six flags.

    The value is turned into a string and checked for the presence of each
    status digit. The returned list holds 1.0 (digit present) or 0.0 (absent)
    in this order: "1" clear, "2" yellow flag, "4" safety car, "5" red flag,
    "6" virtual safety car deployed, "7" virtual safety car ending.
    """
    status_text = str(row["TrackStatus"])
    status_digits = ("1", "2", "4", "5", "6", "7")
    return [float(digit in status_text) for digit in status_digits]

# Expand TrackStatus into one indicator column per status code.
# Only the TrackStatus column is handed to apply(): track_status_apply reads
# nothing else, and building a full-width row Series for every lap of the
# one-hot encoded frame is needlessly slow.
track_status_columns = [
    "TrackStatus_Clear",
    "TrackStatus_YellowFlag",
    "TrackStatus_SafetyCar",
    "TrackStatus_RedFlag",
    "TrackStatus_VirtualSafetyCarDeployed",
    "TrackStatus_VirtualSafetyCarEnding"
]
processed_data[track_status_columns] = processed_data[["TrackStatus"]].apply(
    track_status_apply, axis=1, result_type="expand"
)
processed_data = processed_data.drop(columns=["TrackStatus"])

# Save processed data
processed_data.to_csv("processed-data.csv")

In [2]:
# Reload the processed laps and drop the first CSV column by position.
df = pd.read_csv("processed-data.csv")
df = df.iloc[:, 1:]
    
def generate_ngrams(df, n, columns_to_drop=('Driver_Orig', 'Event_Orig')):
    """generate_ngrams creates n-grams from the input dataframe of consecutive laps
    from a driver at a single event.

    df: dataframe containing at least the 'Year', 'Event_Orig', 'Driver_Orig'
        and 'LapNumber' columns; laps are grouped by the first three
    n: number of consecutive laps per n-gram (must be >= 1)
    columns_to_drop: columns to drop from the n-gram (not tensor friendly);
        an empty value (e.g. [] or None) keeps every column

    Returns a Pandas dataframe of the n-grams: one row per n-gram, with columns
    named '<column>_<k>' for the k-th lap (1-based) of the n-gram. When no
    n-gram can be built the result is empty but still has these columns.

    Raises ValueError if n is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Normalise to a list: a tuple handed to drop() could be read as a single
    # (multi-level) label, and a bare string must stay one label.
    if not columns_to_drop:
        dropped = []
    elif isinstance(columns_to_drop, str):
        dropped = [columns_to_drop]
    else:
        dropped = list(columns_to_drop)

    # Derive the output columns from the input frame rather than from the last
    # group visited, so an input without any group still works.
    feature_columns = [col for col in df.columns if col not in dropped]

    ngrams_list = []

    grouped = df.groupby(['Year', 'Event_Orig', 'Driver_Orig'])
    for _, group in grouped:
        if dropped:
            group = group.drop(columns=dropped, errors='ignore')

        # Rows without a lap number can never be part of a consecutive run.
        sorted_group = group.dropna(subset=['LapNumber']).sort_values(by='LapNumber')

        # Pull the data out once per group instead of slicing the frame for
        # every window.
        lap_numbers = sorted_group['LapNumber'].to_list()
        rows = sorted_group.to_numpy()

        for start in range(len(lap_numbers) - n + 1):
            window = lap_numbers[start:start + n]
            # Sorted ascending, so the first entry is the window's minimum.
            first_lap = int(window[0])
            # Check if the n-gram laps are consecutive
            if window == list(range(first_lap, first_lap + n)):
                ngrams_list.append(rows[start:start + n].flatten())

    col_names = [f'{col}_{i+1}' for i in range(n) for col in feature_columns]
    ngrams_df = pd.DataFrame(ngrams_list, columns=col_names)

    return ngrams_df


# Build 5-lap n-grams from the processed laps and save them to CSV.
ngrams_df = generate_ngrams(df, n=5)
ngrams_df.to_csv(path_or_buf="ngrams.csv")

In [3]:
# Load the saved n-grams, drop the first CSV column by position, and pack the
# remaining values into a single float32 tensor.
ngrams_df = pd.read_csv("ngrams.csv")
ngrams_df = ngrams_df.iloc[:, 1:]
tensor = torch.tensor(ngrams_df.to_numpy(), dtype=torch.float32)
print(tensor.size())

torch.Size([105664, 620])
