In [None]:
!pip install -q pandas numpy xgboost scikit-learn

In [None]:
#import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
#data sources:
#  statsf1.com for tracks and driver data
#  https://www.formula1points.com/season for driver points graph
#sample dataset: charles leclerc, one row per race, bahrain through monaco 2024
#NOTE(review): the leading 0 in both "last 5 races" averages presumably stands for
#"no previous race this season" - confirm; a model reads 0 as better than 1st place
#NOTE(review): 'austrailia' is misspelled but kept as-is because it is a category label
data = dict(
    #driver season performance
    #pre-race values: they describe what had happened before each race started
    Track=['bahrain', 'jeddah', 'austrailia', 'japan', 'china', 'miami', 'imola', 'monaco'],
    Qualifying_Position=[2, 2, 4, 8, 6, 2, 3, 1],
    Driver_Points_This_Season=[0, 12, 28, 47, 59, 76, 98, 113],
    Average_Qualifying_Position_Last_5_Races=[0, 2, 2, 2.6, 4, 4.4, 4.4, 4.6],
    Average_Finishing_Position_Last_5_Races=[0, 4, 3.5, 3, 3.25, 3.4, 3.2, 3.2],

    #track specific information
    Weather=['clear'] * 8,
    Track_Length=[5.412, 6.174, 5.278, 5.807, 5.451, 5.412, 4.909, 3.337],  #length in kilometers
    Number_of_Turns=[15, 27, 14, 18, 16, 19, 19, 19],
    Track_Speed=['medium', 'high', 'medium', 'high', 'medium', 'medium', 'medium', 'low'],
    Elevation_Changes=[15, 10, 6, 40, 11, 0, 30, 22],  #elevation change in meters
    Surface_Grip=['high', 'medium', 'medium', 'high', 'high', 'medium', 'high', 'high'],
    Track_Type=['permanent', 'street', 'street', 'permanent', 'permanent', 'street', 'permanent', 'street'],

    #driver historic track performance
    Driver_Wins_At_Track=[1, 0, 1, 0, 0, 0, 0, 0],
    Driver_Podiums_At_Track=[2, 1, 1, 1, 0, 1, 0, 0],
    Driver_Avg_Qualifying_At_Track=[6.2, 6, 7.75, 4.5, 11.5, 4, 4.3, 7.4],
    Driver_Avg_Finishing_Position_At_Track=[6.4, 5.3, 6.3, 4.3, 12, 4.5, 5, 5],
    Driver_Races_At_Track=[7, 3, 4, 4, 2, 2, 3, 5],

    #target variable
    #since this is the target, the values reflect what happened after the race
    Finishing_Position=[4, 3, 2, 4, 4, 3, 3, 1],
)

In [None]:
#build the data frame: one column per key of the sample dict, one row per race
df = pd.DataFrame(data=data)

In [None]:
#separate the target (y, the finishing position) from the features (X, every other column)
y = df['Finishing_Position']
X = df.drop('Finishing_Position', axis='columns')

In [None]:
#column groups used by the preprocessing step
#string-valued columns
categorical = [
    'Track',
    'Weather',
    'Track_Speed',
    'Surface_Grip',
    'Track_Type',
]
#number-valued columns
numeric = [
    #driver season performance
    'Qualifying_Position',
    'Driver_Points_This_Season',
    'Average_Qualifying_Position_Last_5_Races',
    'Average_Finishing_Position_Last_5_Races',
    #track specific information
    'Track_Length',
    'Number_of_Turns',
    'Elevation_Changes',
    #driver historic track performance
    'Driver_Wins_At_Track',
    'Driver_Podiums_At_Track',
    'Driver_Avg_Qualifying_At_Track',
    'Driver_Avg_Finishing_Position_At_Track',
    'Driver_Races_At_Track',
]

In [None]:
#preprocessing: mean-impute the numeric columns and one-hot encode the categorical ones
numeric_step = ('num', SimpleImputer(strategy='mean'), numeric)
#handle_unknown='ignore' because a track name in the testing data may not be present in the training data
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
#the estimator: an XGBoost regressor trained with the squared-error objective
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    objective='reg:squarederror',
)

In [None]:
#chain preprocessing and the regressor so both are fitted and applied together
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [None]:
#hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#fit the preprocessing steps and the regressor on the training rows
pipeline.fit(X_train, y=y_train)

In [None]:
#predict finishing positions for the held-out test rows
y_pred = pipeline.predict(X=X_test)

In [None]:
#evaluate the model using Root Mean Squared Error (rmse)
#the root is taken with numpy instead of passing squared=False: that argument is deprecated
#and newer scikit-learn releases reject it, while sqrt(MSE) works on every version
#example output: "RMSE: 1.002960205078125" means the predictions are off by about 1 place
#in the root-mean-square sense; individual predictions can miss by more or less than that
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print(f'RMSE: {rmse}')

RMSE: 0.9971239566802979




In [None]:
#make a new dataset for new race prediction
#charles leclerc abu dhabi 2024
#template for a single race: every column holds exactly one entry, because pd.DataFrame
#raises ValueError when the lists passed for its columns differ in length
#columns mirror the training features exactly (no extras such as tire choice)
#TODO: replace every placeholder with the real pre-race value before trusting the prediction
#  - np.nan numeric placeholders are filled by the pipeline's mean imputer
#  - '' categorical placeholders are unseen categories, which the one-hot encoder ignores
new_race = pd.DataFrame({
    #driver season performance
    'Track': [''],
    'Qualifying_Position': [np.nan],
    'Driver_Points_This_Season': [np.nan],
    'Average_Qualifying_Position_Last_5_Races': [np.nan],
    'Average_Finishing_Position_Last_5_Races': [np.nan],

    #track specific information
    'Weather': [''],
    'Track_Length': [np.nan],  #length in kilometers
    'Number_of_Turns': [np.nan],
    'Track_Speed': [''],
    'Elevation_Changes': [np.nan],  #elevation change in meters
    'Surface_Grip': [''],
    'Track_Type': [''],

    #driver historic track performance
    'Driver_Wins_At_Track': [np.nan],
    'Driver_Podiums_At_Track': [np.nan],
    'Driver_Avg_Qualifying_At_Track': [np.nan],
    'Driver_Avg_Finishing_Position_At_Track': [np.nan],
    'Driver_Races_At_Track': [np.nan]
})

In [None]:
#run the fitted pipeline on the new race and report the prediction for its first row
predicted_position = pipeline.predict(X=new_race)
print(f'Predicted Finishing Position: {predicted_position[0]}')