### Building a Formula 1 Podium Prediction Model from Scratch in PyTorch

Using PyTorch, we can build a modular machine learning model. We can scale up the model to run on a GPU, or multiple. Furthermore, we can swap in and out different data transformation methods, neural network structures, and operationalize model predictions.

---

#### Dependencies

Let's load our dependencies. It's best to import the specific PyTorch modules we need; a full `import torch` could take a while and use up precious memory. Always be concious of time and memory when training models - renting GPUs ain't cheap.

In [11]:
# Dependencies
import pandas as pd
import fastf1 as ff1
import pandas as pd

from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from preprocessing import F1Session

#### Load Data from FastF1
Enabling the cache ensures that race data isn't loaded from scratch if you need to restart the kernel. As a proof of concept, we'll use the 2022 season to train and the 2023 season (so far) to test. All of the data loading and preprocessing logic is in the __preprocessing.py__ file to keep this notebook as straightforward as possible.

In [12]:
# Enable fastf1 cache
ff1.Cache.enable_cache('cache')

In [13]:
training_races = [
    {'year': 2022, 'circuit': 1, 'session_type': 'R'},
    {'year': 2022, 'circuit': 2, 'session_type': 'R'},
    {'year': 2022, 'circuit': 3, 'session_type': 'R'},
    {'year': 2022, 'circuit': 4, 'session_type': 'R'},
    {'year': 2022, 'circuit': 5, 'session_type': 'R'},
    {'year': 2022, 'circuit': 6, 'session_type': 'R'},
    {'year': 2022, 'circuit': 7, 'session_type': 'R'},
    {'year': 2022, 'circuit': 8, 'session_type': 'R'},
    {'year': 2022, 'circuit': 9, 'session_type': 'R'},
    {'year': 2022, 'circuit': 10, 'session_type': 'R'},
    {'year': 2022, 'circuit': 11, 'session_type': 'R'},
    {'year': 2022, 'circuit': 12, 'session_type': 'R'},
    {'year': 2022, 'circuit': 13, 'session_type': 'R'},
    {'year': 2022, 'circuit': 14, 'session_type': 'R'},
    {'year': 2022, 'circuit': 15, 'session_type': 'R'},
    {'year': 2022, 'circuit': 16, 'session_type': 'R'},
    {'year': 2022, 'circuit': 17, 'session_type': 'R'},
    {'year': 2022, 'circuit': 18, 'session_type': 'R'},
    {'year': 2022, 'circuit': 19, 'session_type': 'R'},
    {'year': 2022, 'circuit': 20, 'session_type': 'R'},
    {'year': 2022, 'circuit': 21, 'session_type': 'R'},
    {'year': 2022, 'circuit': 22, 'session_type': 'R'},
]

testing_races = training_races = [
    {'year': 2023, 'circuit': 1, 'session_type': 'R'},
    {'year': 2023, 'circuit': 2, 'session_type': 'R'},
    {'year': 2023, 'circuit': 3, 'session_type': 'R'},
    {'year': 2023, 'circuit': 4, 'session_type': 'R'},
    {'year': 2023, 'circuit': 5, 'session_type': 'R'},
    {'year': 2023, 'circuit': 6, 'session_type': 'R'},
    {'year': 2023, 'circuit': 7, 'session_type': 'R'},
    {'year': 2023, 'circuit': 8, 'session_type': 'R'},
]

In [14]:
# Load the races into a session
training_session = F1Session(training_races)

# Iterate through the session to get the preprocessed data for each race
data_list = [
    preprocessed_data for preprocessed_data in training_session if preprocessed_data is not None]

# Concatenate all the dataframes into a single dataframe
training_data = pd.concat(data_list, ignore_index=True)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.0.4]
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.0.4]
req            INFO 	Using cached data for driver_info

In [15]:
# Load the races into a session
testing_session = F1Session(testing_races)

# Iterate through the session to get the preprocessed data for each race
data_list = [
    preprocessed_data for preprocessed_data in testing_session if preprocessed_data is not None]

# Concatenate all the dataframes into a single dataframe
testing_data = pd.concat(data_list, ignore_index=True)

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.0.4]
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data


req            INFO 	Using cached data for timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.0.4]
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for timing_data
req            INFO 	Using cached data for timing_app_data
core         

In [1]:
training_data.head()

NameError: name 'training_data' is not defined

#### Label Encoding & Normalization


In [16]:
le = LabelEncoder()
for col in ['Driver', 'Team']:
    training_data[col] = le.fit_transform(training_data[col])
    # use the same encoder to transform testing data
    testing_data[col] = le.transform(testing_data[col])

In [17]:
# Normalizing numerical variables
scaler = MinMaxScaler()
for col in ['Year', 'Circuit', 'LapTime']:
    training_data[col] = scaler.fit_transform(
        training_data[col].values.reshape(-1, 1))
    testing_data[col] = scaler.transform(
        testing_data[col].values.reshape(-1, 1))

In [18]:
from torch import FloatTensor

# Define Dataset
class F1Dataset(Dataset):
    def __init__(self, data):
        self.X = torch.tensor(
            data.drop('Podium', axis=1).values, dtype=torch.float32)
        self.y = torch.tensor(data['Podium'].values, dtype=torch.float32)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    def __len__(self):
        return len(self.X)

In [20]:
from torch.utils.data import DataLoader

train_dataset = F1Dataset(training_data)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

test_dataset = F1Dataset(testing_data)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [21]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")


Using cpu device


In [50]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(5, 512),   # input size is 5 based on your dataset
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1),   # we're predicting a single output
            nn.Sigmoid()   # add Sigmoid activation function
        )


    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        output = torch.sigmoid(logits)  # Apply sigmoid activation
        return output


In [51]:
# Define loss and optimizer
model = NeuralNetwork().to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [52]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred.squeeze(), y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


In [53]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred.squeeze(), y).item()
            # Convert predictions to binary labels
            pred_labels = (pred > 0.5).type(torch.float)
            correct += (pred_labels == y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size
    print(
        f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [54]:
# Run the training and test loop
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------


RuntimeError: all elements of input should be between 0 and 1

In [55]:
training_data['Podium'].value_counts()

Podium
0    136
1     24
Name: count, dtype: int64