In [89]:
import requests, json, os, joblib
from bs4 import BeautifulSoup
import pandas as pd, numpy as np
from datetime import date

In [90]:
import kagglehub

# Download latest version
kaggle_path = kagglehub.dataset_download("rohanrao/formula-1-world-championship-1950-2020")

print("Path to dataset files:", kaggle_path)

Path to dataset files: C:\Users\yarno\.cache\kagglehub\datasets\rohanrao\formula-1-world-championship-1950-2020\versions\24


In [91]:
with open('data/links.json', 'r', encoding='utf-8') as f:
    LINKS = json.load(f)
print(LINKS)

{'Australia': {'id': 1, 'day': 75, 'url_quali': 'https://www.formula1.com/en/results/2025/races/1254/australia/qualifying', 'url_race': 'https://www.formula1.com/en/results/2025/races/1254/australia/race-result'}, 'China': {'id': 17, 'day': 82, 'url_quali': 'https://www.formula1.com/en/results/2025/races/1255/china/qualifying', 'url_race': 'https://www.formula1.com/en/results/2025/races/1255/china/race-result'}, 'Japan': {'id': 22, 'day': 96, 'url_quali': 'https://www.formula1.com/en/results/2025/races/1256/japan/qualifying', 'url_race': 'https://www.formula1.com/en/results/2025/races/1256/japan/race-result'}, 'Bahrain': {'id': 3, 'day': 103, 'url_quali': 'https://www.formula1.com/en/results/2025/races/1257/bahrain/qualifying', 'url_race': 'https://www.formula1.com/en/results/2025/races/1257/bahrain/race-result'}, 'Saudi Arabia': {'id': 77, 'day': 110, 'url_quali': 'https://www.formula1.com/en/results/2025/races/1258/saudi-arabia/qualifying', 'url_race': 'https://www.formula1.com/en/re

In [92]:
class Scraper:
  def __init__(self):
    self.qualifying_2025 = pd.DataFrame(columns=["TrackId", "Code", "Team", "Q1", "Q2", "Q3", "Grid"])
    self.races_2025 = pd.DataFrame(columns=["TrackId", "Code", "Position"])
    self.last_scraped = 0

    if os.path.exists("metadata.json"):
       with open("data/metadata.json", "r", encoding="utf-8") as f:
         self.today = json.load(f)
    else:
       self.today = 0

  def extract_qualifying(self):
      for race in LINKS.values():
          if race['day'] > self.today or race['day'] <= self.last_scraped:
             break
          path = race["url_quali"]
          soup = BeautifulSoup(requests.get(path).text, "lxml")
          table = soup.find("tbody")
          rows = table.find_all("tr")

          for row in rows:
            grid, _, name, team, q1, q2, q3, _ = [val.text for val in row.find_all("td")]
            name, code = name[:-3], name[-3:]
            self.qualifying_2025.loc[len(self.qualifying_2025)] = [race['id'], code, team, q1, q2, q3, grid]

    
  def extract_races(self):
      for race in LINKS.values():
          if race['day'] > self.today or race['day'] <= self.last_scraped:
             break
          path = race["url_race"]
          soup = BeautifulSoup(requests.get(path).text, "lxml")
          table = soup.find("tbody")
          rows = table.find_all("tr")

          for row in rows:
            position, _, name, _, _, _, _= [val.text for val in row.find_all("td")]
            code = name[-3:]
            self.races_2025.loc[len(self.races_2025)] = [int(race['id']), code, position]

  def scrape(self, today):
     self.today = today
     self.extract_qualifying()
     self.extract_races()
     self.last_scraped = today
     print(self.last_scraped)

  def merge_current(self):
     return pd.merge(self.qualifying_2025, self.races_2025, how="right", on=["TrackId", "Code"])
  
  def load(self, df):
     self.old_df = df

  def save(self):
     pd.concat([self.old_df, self.merge_current()]).to_csv("data/season_2025.csv", index=False)
     with open("data/metadata.json", "w", encoding="utf-8") as f:
        json.dump(self.last_scraped, f)

  def check(self):
      try:
         df = pd.read_csv("data/season_2025.csv")
      except FileNotFoundError:
         df = pd.DataFrame()
         
      self.load(df)
      self.scrape(int(date.today().strftime("%j")))
      self.save()

def remove_2025_season():
    if os.path.exists("data/metadata.json"):
        os.remove("data/metadata.json")
    if os.path.exists("data/season_2025.csv"):
        os.remove("data/season_2025.csv")

remove_2025_season()

# Update 2025 Database

In [93]:
Scraper().check()

235


# Las Vegas

In [94]:

lv_qualis = pd.DataFrame(columns=["TrackId", "Code", "Team", "Q1", "Q2", "Q3", "Grid"])
lv_races = pd.DataFrame(columns=["TrackId", "Code", "Position"])

for path in ["https://www.formula1.com/en/results/2023/races/1225/las-vegas/qualifying", "https://www.formula1.com/en/results/2024/races/1250/las-vegas/qualifying"]:
    soup = BeautifulSoup(requests.get(path).text, "lxml")
    table = soup.find("tbody")
    rows = table.find_all("tr")

    for row in rows:
        grid, _, name, team, q1, q2, q3, _ = [val.text for val in row.find_all("td")]
        code = name[-3:]
        lv_qualis.loc[len(lv_qualis)] = [44, code, team, q1, q2, q3, grid]


for path in ["https://www.formula1.com/en/results/2023/races/1225/las-vegas/race-result", "https://www.formula1.com/en/results/2024/races/1250/las-vegas/race-result"]:
    soup = BeautifulSoup(requests.get(path).text, "lxml")
    table = soup.find("tbody")
    rows = table.find_all("tr")

    for row in rows:
        position, _, name, _, _, _, _= [val.text for val in row.find_all("td")]
        code = name[-3:]
        lv_races.loc[len(lv_races)] = [44, code, position]

season_2025 = pd.read_csv("data/season_2025.csv")
lv = pd.merge(lv_qualis, lv_races, how="right", on=["TrackId", "Code"])
lv = lv.loc[lv.Code.isin(season_2025.Code.unique())]

pd.concat([season_2025, lv]).to_csv("data/season_2025.csv", index=False)

# From Kaggle

In [95]:
all_qualis = pd.read_csv(f"{kaggle_path}/qualifying.csv", usecols=['raceId', 'driverId', 'constructorId', 'q1', 'q2', 'q3'])
races = pd.read_csv(f"{kaggle_path}/races.csv", usecols=['raceId', 'circuitId'])
drivers = pd.read_csv(f"{kaggle_path}/drivers.csv", usecols=['driverId', 'code'])
results = pd.read_csv(f"{kaggle_path}/results.csv", usecols=['raceId', 'driverId', 'position', 'grid'])
constructors = pd.read_csv(f"{kaggle_path}/constructors.csv", usecols=["constructorId", "name"])
circuits = pd.read_csv(f"{kaggle_path}/circuits.csv")

w = pd.merge(all_qualis, results, how="left", on=["raceId", "driverId"])
x = pd.merge(w, races, how="left", on=["raceId"]).drop(["raceId"], axis=1).rename(columns={
    "circuitId":"TrackId",
    "q1":"Q1",
    "q2":"Q2",
    "q3":"Q3",
    "grid":"Grid",
    "position":"Position"
})
y = pd.merge(x, drivers, how="left", on=["driverId"]).rename(columns={"code":"Code"})
z = pd.merge(y, constructors, how="right", on=["constructorId"]).drop(["constructorId"], axis=1).rename(columns={"name":"Team"})
z[["TrackId", "Code", "Team", "Q1", "Q2", "Q3", "Grid", "Position", "driverId"]]
z = z.loc[~((z.driverId == 818) & (z.Code == "VER"))]


z[["TrackId", "Code", "Team", "Q1", "Q2", "Q3", "Grid", "Position"]].loc[z.Code.isin(pd.read_csv("data/season_2025.csv").Code)].reset_index(drop=True).to_csv("data/previous_seasons.csv", index=False)


In [97]:
if os.path.exists("data/previous_seasons.csv"):
    pd.concat([pd.read_csv("data/previous_seasons.csv"), pd.read_csv("data/season_2025.csv")]).to_csv("data/all_seasons.csv", index=False)
    os.remove("data/previous_seasons.csv")

## Standardize Team Names

In [98]:
path = "data/all_seasons.csv"
df = pd.read_csv(path)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def convert_time(s):
    if isinstance(s, float):
        return s
    elif not isinstance(s, str) or len(s) < 3:
        return None
    else :
        if len(s) == len("1:00:000"):
            return float(int(s[-3:]))/1000 + int(s[-6:-4]) + 60 * int(s[0])
        
categorical_features = ["TrackId", "Code", "Team"]
numerical_features = ["Q1", "Q2", "Q3", "Grid"]

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder())
])

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_pipeline, categorical_features),
        ("num", numeric_pipeline, numerical_features),
    ]
)

def preprocess(df, fit = False):
    team_map = {
        'Alpine F1 Team': "Alpine",
        'Haas F1 Team': "Haas",
        'Toro Rosso': "Racing Bulls",
        'Red Bull Racing': "Red Bull",
        'RB F1 Team': "Red Bull",
        'Racing Point': "Aston Martin",
        'Red Bull Racing Honda RBPT': "Red Bull",
        'Alpine Renault': "Alpine",
        'Aston Martin Aramco Mercedes':'Aston Martin',
        'McLaren Mercedes':"McLaren",
        'Williams Mercedes': "Williams",
        'AlphaTauri Honda RBPT': "Racing Bulls",
        'Haas Ferrari':"Ferrari",
        'RB Honda RBPT':"Red Bull"
    }
    
    df.Team = df.Team.replace(team_map)
    if fit:
        df.Position = df.Position.replace({'\\N':21, 'DQ':21, "NC":21}).astype(int)
    df.TrackId = df.TrackId.astype(int)
    df.Grid = df.Grid.replace({'\\N':21, 'DQ':21, "NC":21}).astype(float).astype(int)
    df.Q1 = df.Q1.apply(convert_time)
    df.Q2 = df.Q2.apply(convert_time)
    df.Q3 = df.Q3.apply(convert_time)

    if fit:
        df.Position = df.Position.astype(int).astype(float)
        X, y = df.drop(["Position"], axis=1), df.Position
        return preprocessor.fit_transform(X), y
    else :
        return preprocessor.transform(df)
    
X, y = preprocess(df, fit=True)

joblib.dump(preprocessor, "models/preprocessor_post.pkl")

['models/preprocessor_post.pkl']

In [99]:
path = "data/all_seasons.csv"
df = pd.read_csv(path, usecols=["TrackId", "Code", "Team", "Position"])
        
categorical_features = ["TrackId", "Code", "Team"]

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder())
])

preprocessor2 = ColumnTransformer(
    transformers=[
        ("cat", categorical_pipeline, categorical_features)
    ]
)


def preprocess2(df, fit = False):
    team_map = {
        'Alpine F1 Team': "Alpine",
        'Haas F1 Team': "Haas",
        'Toro Rosso': "Racing Bulls",
        'Red Bull Racing': "Red Bull",
        'RB F1 Team': "Red Bull",
        'Racing Point': "Aston Martin",
        'Red Bull Racing Honda RBPT': "Red Bull",
        'Alpine Renault': "Alpine",
        'Aston Martin Aramco Mercedes':'Aston Martin',
        'McLaren Mercedes':"McLaren",
        'Williams Mercedes': "Williams",
        'AlphaTauri Honda RBPT': "Racing Bulls",
        'Haas Ferrari':"Ferrari",
        'RB Honda RBPT':"Red Bull"
    }
    
    df.Team = df.Team.replace(team_map)
    if fit:
        df.Position = df.Position.replace({'\\N':21, 'DQ':21, "NC":21}).astype(int)
    df.TrackId = df.TrackId.astype(int)

    if fit:
        df.Position = df.Position.astype(int).astype(float)
        X, y = df.drop(["Position"], axis=1), df.Position
        return preprocessor2.fit_transform(X), y
    else :
        return preprocessor2.transform(df)
    
X_no_time, y_no_time = preprocess2(df, fit=True)

joblib.dump(preprocessor2, "models/preprocessor_pre.pkl")

['models/preprocessor_pre.pkl']

In [100]:
nn_post_quali = joblib.load("models/nn_post_quali.pkl")
nn_pre_quali = joblib.load("models/nn_pre_quali.pkl")

rf_post_quali = joblib.load("models/rf_post_quali.pkl")
rf_pre_quali = joblib.load("models/rf_pre_quali.pkl")

# Validation

In [101]:
data_pre = pd.read_csv("data/season_2025.csv", usecols=["TrackId", "Code", "Team", "Position"])
data_post = pd.read_csv("data/season_2025.csv")

def accuracy_pre(model, TrackId, data):
    pred = model.predict(preprocess2(data.loc[data.TrackId == TrackId].drop(["Position"], axis=1)))
    ranking = data.loc[data.TrackId == TrackId][["Code", "Position"]].reset_index(drop=True).join(pd.DataFrame(pred, columns=["Pred"]).reset_index(drop=True), how="inner").sort_values(by="Pred").reset_index(drop=True)
    accu = 0
    for index, row in ranking.iterrows():
        pos = int(row.Position) if row.Position not in ["NC", "DQ"] else 21
        accu += abs(pos - int(index))
    return accu / 20

def accuracy_post(model, TrackId, data):
    pred = model.predict(preprocess(data.loc[data.TrackId == TrackId].drop(["Position"], axis=1)))
    ranking = data.loc[data.TrackId == TrackId][["Code", "Position"]].reset_index(drop=True).join(pd.DataFrame(pred, columns=["Pred"]).reset_index(drop=True), how="inner").sort_values(by="Pred").reset_index(drop=True)
    accu = 0
    for index, row in ranking.iterrows():
        pos = int(row.Position) if row.Position not in ["NC", "DQ"] else 21
        accu += abs(pos - int(index))
    return accu / 20


with open("data/links.json", "r", encoding="utf-8") as f:
    LINKS = json.load(f)

today = int(date.today().strftime("%j"))

def compute_net_accu(model, accu_fct, dataset):
    vals = []
    for link_set in LINKS.values():
        if link_set['day'] > today:
            break
        vals.append(accu_fct(model, link_set['id'], dataset))
    return np.array(vals).mean()

print("Neural Network")
print(compute_net_accu(nn_pre_quali, accuracy_pre, data_pre))
print(compute_net_accu(nn_post_quali, accuracy_post, data_post))
print("Random Forest")
print(compute_net_accu(rf_pre_quali, accuracy_pre, data_pre))
print(compute_net_accu(rf_post_quali, accuracy_post, data_post))

Neural Network
2.792857142857143
1.6642857142857144
Random Forest
2.6785714285714293
1.4214285714285713
