In [51]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt

# Read the two CSV exports from the notebook's working directory, in this order:
#   og_data    <- 'f1_data_og.csv'
#   cleaned_df <- 'cleaned.csv' (the frame the following cells filter and encode)
og_data, cleaned_df = (
    pd.read_csv(csv_name) for csv_name in ('f1_data_og.csv', 'cleaned.csv')
)

# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which look like
# row indexes written out by earlier to_csv calls -- confirm they are dropped before modelling.
# Bare last expression so the notebook renders a (truncated) preview of the frame.
cleaned_df


Unnamed: 0.1,Unnamed: 0,season,round,GP_circuit,grid,team_name,driver,driver_age,driver_wins,constructor_wins,pos_delta,driver_dnf_ratio,position
0,0,2009,1,Albert Park Grand Prix Circuit,1,Brawn,Jenson Button,29.0,1,1,0,0.072797,1
1,1,2009,1,Albert Park Grand Prix Circuit,2,Brawn,Rubens Barrichello,37.0,1,0,0,0.089623,2
2,2,2007,1,Albert Park Grand Prix Circuit,14,Honda,Jenson Button,27.0,0,0,0,0.072797,14
3,3,2007,1,Albert Park Grand Prix Circuit,16,Honda,Rubens Barrichello,35.0,0,0,-1,0.089623,17
4,4,2006,3,Albert Park Grand Prix Circuit,1,Honda,Jenson Button,26.0,0,0,0,0.072797,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9557,9557,2021,20,Losail International Circuit,18,Alfa Romeo,Antonio Giovinazzi,28.0,0,0,0,0.081967,18
9558,9558,2021,20,Losail International Circuit,19,Haas F1 Team,Mick Schumacher,23.0,0,0,0,0.069767,19
9559,9559,2021,20,Losail International Circuit,20,Haas F1 Team,Nikita Mazepin,23.0,0,0,0,0.136364,20
9560,9560,2021,20,Losail International Circuit,2,AlphaTauri,Pierre Gasly,26.0,0,0,-2,0.084034,4


In [63]:
# Distinct circuits found in the 2023-season rows, in order of first appearance.
# Kept for trying out the prediction function the notebook says it will define later.
circuit_list = cleaned_df.loc[cleaned_df['season'] == 2023, 'GP_circuit'].unique()

# Show as a plain Python list for readability.
list(circuit_list)


['Albert Park Grand Prix Circuit',
 'Bahrain International Circuit',
 'Circuit de Barcelona-Catalunya',
 'Circuit de Monaco',
 'Silverstone Circuit',
 'Hungaroring',
 'Circuit de Spa-Francorchamps',
 'Circuit Gilles Villeneuve',
 'Red Bull Ring',
 'Baku City Circuit',
 'Jeddah Corniche Circuit',
 'Miami International Autodrome']

## Cleaned Data Frame Dictionary

|Feature          |Definition                                             |
|:---------------:|:------------------------------------------------------|
|season (int) | The year in which the race takes place|
|round (int) | The round number of the race within the season|
|GP_circuit (str) | The circuit at which the race occurs|
|grid (int) | Starting position of each driver|
|team_name (str) | Constructor name|
|driver (str) | The name of each driver|
|driver_age (float) | The age of each driver|
|driver_wins (int) | The number of wins a driver has in that season at the time of the race|
|constructor_wins (int) | The number of wins a team has at that point in the season|
|pos_delta (int) | How many positions were gained or lost in the race (negative values mean places lost)|
|driver_dnf_ratio (float) | The ratio of unfinished races to total races entered|
|position (int) | Target variable - finishing place of each driver|



In [64]:
# Relabel the two win-count columns:
#   'driver_wins'      -> 'team_wins'
#   'constructor_wins' -> 'driver_wins'
# NOTE(review): in the cleaned.csv preview both Brawn drivers show driver_wins == 1 for the
# same race while constructor_wins differs per driver, so the CSV labels look swapped and
# this rename appears to correct them -- confirm against the raw data.
#
# The guard keeps the cell safe to re-run. `cleaned_df` is overwritten here, and
# DataFrame.rename ignores mapping keys that are missing, so an unguarded second run would
# rename the already-corrected 'driver_wins' to 'team_wins' and silently leave two
# 'team_wins' columns and no 'driver_wins'.
if 'constructor_wins' in cleaned_df.columns:
    cleaned_df = cleaned_df.rename(columns={
        'driver_wins': 'team_wins',
        'constructor_wins': 'driver_wins',
    })

In [65]:
# Keep only seasons from 2000 onwards. The stated reason for this cut-off is the
# V10 engine era.
cleaned_df = cleaned_df.loc[cleaned_df['season'].ge(2000)]

# Data Pre-Processing

## One-Hot encoding 

In [66]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 

# Step 1: encode the categorical features.
#
# 'GP_circuit' and 'team_name' are expanded into one-hot indicator columns; the two
# source columns are replaced by the indicators in the returned frame.
# NOTE: the result overwrites `cleaned_df`, so re-running this cell without reloading the
# data raises a KeyError (the source columns no longer exist after the first run).
cleaned_df = pd.get_dummies(
    data=cleaned_df,
    columns=['GP_circuit', 'team_name'],
)


## Data Splitting

In [67]:
# Chronological split on `season`, so later seasons are never used to fit the model:
#   train_data      -> seasons 2000 through 2018
#   validation_data -> seasons 2019 through 2022
#   test_data       -> season 2023 only

# Training rows: 2000 <= season < 2019.
train_data = cleaned_df.loc[
    cleaned_df['season'].ge(2000) & cleaned_df['season'].lt(2019)
]

# Validation rows: 2019 <= season <= 2022.
validation_data = cleaned_df.loc[
    cleaned_df['season'].ge(2019) & cleaned_df['season'].le(2022)
]

# Held-out 2023 Grands Prix, to be evaluated with a function defined after model selection.
test_data = cleaned_df.loc[cleaned_df['season'].eq(2023)]