## Cellule
La cellule suivante est dédiée aux imports de fonction

In [15]:
import pandas as pd
import os
from fastparquet import ParquetFile

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score


## Description de la cellule
La cellule suivant permet d'importer un fichier .csv

In [17]:
def read_csv(file):
    return pd.read_csv(file)

## Description
Importation des fichiers

In [18]:
csv_data = {}

for file in os.listdir("csv"):
    csv_data[file.split('.')[0]]=read_csv("csv/"+file)

## Description de la cellule
Importation du fichier parquet

In [19]:
dp = ParquetFile('weather.parquet')
weather = dp.to_pandas()

# ATTENTION
On va maintenant rassembler les données et construire le dataSet

### Construction principale

In [20]:
data_set = pd.DataFrame({
        "driverId": csv_data['results']['driverId'],
        "constructorId": csv_data['results']['constructorId'],
        "position": csv_data['results']['position'],
        "positionOrder": csv_data['results']['positionOrder'],
        "circuitId": csv_data['circuits']['circuitId']
        "circuitName": csv_data['circuits']['name'],
        "circuitsLocation": csv_data['circuits']['location'],
        "lat": csv_data['circuits']['lat'],
        "lng": csv_data['circuits']['lng']
})

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1166327473.py, line 6)

## Ajout et adaptation

In [7]:
csv_data['constructor_results']['constructor_points_moy'] = csv_data['constructor_results'].groupby('constructorId')['points'].transform('mean')

In [8]:
data_set = data_set.merge(csv_data['constructor_results'][['constructorId', 'raceId', 'constructor_points_moy']], on='constructorId', how="outer")
data_set = data_set.merge(csv_data['constructor_standings'][['raceId', 'constructorId', 'wins']], on=['raceId', 'constructorId'], how="outer")
data_set = data_set.merge(csv_data['constructors'][['constructorId', 'name', 'nationality']], on=['constructorId'], how="outer")
data_set = data_set.merge(csv_data['drivers'][['driverId', 'surname']], on=['driverId'], how="outer")
data_set = data_set.merge(csv_data['driver_standings'][['driverStandingsId', 'raceId', 'driverId', 'points']], on=['raceId', 'driverId'], how="outer")
data_set = data_set.merge(csv_data['lap_times'][['raceId', 'driverId', 'time']], on=['raceId', 'driverId'], how="outer")
data_set = data_set.merge(csv_data['pit_stops'][['raceId', 'driverId', 'stop', 'lap']], on=['raceId', 'driverId'], how="outer")
data_set = data_set.merge(csv_data['qualifying'][['qualifyId', 'raceId', 'driverId']], on=['raceId', 'driverId'], how="outer")
data_set = data_set.merge(csv_data['races'][['raceId', 'circuitId', 'name']], on=['raceId', 'circuitId'], how="outer")

KeyError: 'circuitId'

In [9]:
data_set['position']=data_set['position'].apply(lambda x: 0 if isinstance(x, str) else x)
data_set['driverId'].astype('str')

0              1.0
1              1.0
2              1.0
3              1.0
4              1.0
             ...  
135163767    798.0
135163768    801.0
135163769    803.0
135163770    805.0
135163771      nan
Name: driverId, Length: 135163772, dtype: object

In [10]:
data_set.drop_duplicates(inplace=True)

### On retire les pilotes qui ne sont plus en activité

In [11]:
active_driver_ids = [1, 3, 4, 5, 6, 7, 10, 14, 16, 18, 20, 22, 23, 24, 27, 31, 44, 55, 63, 77]
data_set = data_set[data_set['driverId'].isin(active_driver_ids)]

## On mets tous les identifiants en tant que chaîne de caractère

In [12]:
data_set['driverId'].astype('str')
data_set['constructorId'].astype('str')


0            1.0
1            1.0
2            1.0
3            1.0
4            1.0
            ... 
135155994    6.0
135155996    6.0
135156002    6.0
135156018    6.0
135156023    6.0
Name: constructorId, Length: 11153544, dtype: object

## On encode les valeurs str

In [13]:
y_bis = data_set.loc[:,:"position"]
y_bis.drop(["driverId", "constructorId"], axis=1, inplace=True)

In [14]:
target_encoder = TargetEncoder()
data_set['driverId_encoded'] = target_encoder.fit_transform(data_set['driverId'], y_bis['position'])
data_set['constructorId_encoded'] = target_encoder.fit_transform(data_set['constructorId'], y_bis['position'])



## On défini x et y

In [15]:
X = data_set[['driverId_encoded', 'constructorId_encoded', 'position', 'positionOrder', 'constructor_points_moy']]
y = data_set['position']

# Division


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [18]:
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

In [19]:
accuracy_score_test = accuracy_score(y_test, y_test_pred)
accuracy_score_train = accuracy_score(y_train, y_train_pred)

In [20]:
print(accuracy_score_train)
print(accuracy_score_test)

1.0
1.0
