## Cellule
La cellule suivante est dédiée aux imports de fonction

In [1]:
import pandas as pd
import os
from fastparquet import ParquetFile

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score


## Description de la cellule
La cellule suivant permet d'importer un fichier .csv

In [3]:
def read_csv(file):
    return pd.read_csv(file)

## Description
Importation des fichiers

In [4]:
csv_data = {}

for file in os.listdir("csv"):
    csv_data[file.split('.')[0]]=read_csv("csv/"+file)

## Description de la cellule
Importation du fichier parquet

In [5]:
dp = ParquetFile('weather.parquet')
weather = dp.to_pandas()

# ATTENTION
On va maintenant rassembler les données et construire le dataSet

### Construction principale

In [6]:
data_set = pd.DataFrame({
        "driverId": csv_data['results']['driverId'],
        "constructorId": csv_data['results']['constructorId'],
        "position": csv_data['results']['position'],
        "positionOrder": csv_data['results']['positionOrder'],
        "circuitId": csv_data['circuits']['circuitId'],
        "circuitName": csv_data['circuits']['name'],
        "circuitsLocation": csv_data['circuits']['location'],
        "lat": csv_data['circuits']['lat'],
        "lng": csv_data['circuits']['lng'],
        "raceId": csv_data['races']['raceId']
})

## Ajout et adaptation

In [7]:
csv_data['constructor_results']['constructor_points_moy'] = csv_data['constructor_results'].groupby('constructorId')['points'].transform('mean')

In [8]:
csv_data['circuits'].drop(columns=['circuitRef', 'location', 'country', 'alt', 'url'], axis=1, inplace=True)
csv_data['constructors'].drop(columns=['constructorRef', 'nationality', 'url'], axis=1, inplace=True)
csv_data['constructor_results'].drop(columns=['points', 'status'], axis=1, inplace=True)
csv_data['constructor_standings'].drop(columns=['points', 'positionText',  'wins'], axis=1, inplace=True)
csv_data['drivers'].drop(columns=['driverRef', 'number', 'nationality', 'dob', 'forename', 'code', 'url'], axis=1, inplace=True)
csv_data['driver_standings'].drop(columns=['positionText', 'points', 'wins'], axis=1, inplace=True)
csv_data['lap_times'].drop(columns=['time', 'milliseconds'], axis=1, inplace=True)
csv_data['pit_stops'].drop(columns=['time', 'duration', 'milliseconds'], axis=1, inplace=True)
csv_data['qualifying'].drop(columns=['number', 'position', 'q1', 'q2', 'q3'], axis=1, inplace=True)
csv_data['races'].drop(columns=['date', 'time', 'url', 'fp1_date','fp1_time','fp2_date','fp2_time','fp3_date','fp3_time','quali_date','quali_time','sprint_date','sprint_time'], axis=1, inplace=True)

In [9]:
pd.merge(data_set, csv_data['circuits'], on='circuitId', how='inner')
pd.merge(data_set, csv_data['constructors'], on='constructorId', how='inner')
pd.merge(data_set, csv_data['constructor_results'], on='constructorId', how='inner')
pd.merge(data_set, csv_data['constructor_standings'], on='constructorId', how='inner')
pd.merge(data_set, csv_data['drivers'], on='driverId', how='inner')
pd.merge(data_set, csv_data['driver_standings'], on='driverId', how='inner')
pd.merge(data_set, csv_data['lap_times'], on='driverId', how='inner')
pd.merge(data_set, csv_data['pit_stops'], on='driverId', how='inner')
pd.merge(data_set, csv_data['qualifying'], on='driverId', how='inner')
pd.merge(data_set, csv_data['races'], on='raceId', how='inner')

Unnamed: 0,driverId,constructorId,position,positionOrder,circuitId_x,circuitName,circuitsLocation,lat,lng,raceId,year,round,circuitId_y,name
0,1,1,1,1,1.0,Albert Park Grand Prix Circuit,Melbourne,-37.84970,144.96800,1.0,2009,1,1,Australian Grand Prix
1,2,2,2,2,2.0,Sepang International Circuit,Kuala Lumpur,2.76083,101.73800,2.0,2009,2,2,Malaysian Grand Prix
2,3,3,3,3,3.0,Bahrain International Circuit,Sakhir,26.03250,50.51060,3.0,2009,3,17,Chinese Grand Prix
3,4,4,4,4,4.0,Circuit de Barcelona-Catalunya,Montmeló,41.57000,2.26111,4.0,2009,4,3,Bahrain Grand Prix
4,5,1,5,5,5.0,Istanbul Park,Istanbul,40.95170,29.40500,5.0,2009,5,4,Spanish Grand Prix
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,8,1,5,5,,,,,,1140.0,2024,20,32,Mexico City Grand Prix
1121,21,4,6,6,,,,,,1141.0,2024,21,18,São Paulo Grand Prix
1122,22,11,7,7,,,,,,1142.0,2024,22,80,Las Vegas Grand Prix
1123,37,1,8,8,,,,,,1143.0,2024,23,78,Qatar Grand Prix


In [10]:
data_set.drop_duplicates(inplace=True)

### On retire les pilotes qui ne sont plus en activité

In [11]:
active_driver_ids = [1, 3, 4, 5, 6, 7, 10, 14, 16, 18, 20, 22, 23, 24, 27, 31, 44, 55, 63, 77]
data_set = data_set[data_set['driverId'].isin(active_driver_ids)]

On converti les valeurs qui nécessite de l'être

In [12]:
data_set['position'] = data_set['position'].apply(lambda x: 0 if pd.isna(x) or isinstance(x, str) else x)

## On mets tous les identifiants en tant que chaîne de caractère

In [13]:
data_set['driverId'] = data_set['driverId'].astype('str')
data_set['constructorId'] = data_set['constructorId'].astype('str')

## On encode les valeurs str

In [14]:
data_set

Unnamed: 0,driverId,constructorId,position,positionOrder,circuitId,circuitName,circuitsLocation,lat,lng,raceId
0,1,1,0,1,1.0,Albert Park Grand Prix Circuit,Melbourne,-37.8497,144.96800,1.0
2,3,3,0,3,3.0,Bahrain International Circuit,Sakhir,26.0325,50.51060,3.0
3,4,4,0,4,4.0,Circuit de Barcelona-Catalunya,Montmeló,41.5700,2.26111,4.0
4,5,1,0,5,5.0,Istanbul Park,Istanbul,40.9517,29.40500,5.0
5,6,3,0,6,6.0,Circuit de Monaco,Monte-Carlo,43.7347,7.42056,6.0
...,...,...,...,...,...,...,...,...,...,...
26217,4,117,0,18,,,,,,
26417,4,117,0,19,,,,,,
26429,4,117,0,11,,,,,,
26470,4,117,0,12,,,,,,


In [15]:
y_bis = data_set.loc[:,:"position"]
y_bis.drop(["driverId", "constructorId"], axis=1, inplace=True)

In [16]:
target_encoder = TargetEncoder()
data_set['driverId_encoded'] = target_encoder.fit_transform(data_set['driverId'], y_bis['position'])
data_set['constructorId_encoded'] = target_encoder.fit_transform(data_set['constructorId'], y_bis['position'])

## On défini x et y

In [17]:
X = data_set[['driverId_encoded', 'constructorId_encoded', 'position', 'positionOrder']]
y = data_set['position']

# Division


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [20]:
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

In [21]:
accuracy_score_test = accuracy_score(y_test, y_test_pred)
accuracy_score_train = accuracy_score(y_train, y_train_pred)

In [22]:
print(accuracy_score_train)
print(accuracy_score_test)

1.0
1.0


In [24]:
x_test

NameError: name 'x_test' is not defined