## Cellule
La cellule suivante est dédiée aux imports de fonction

In [163]:
import pandas as pd
import os
from fastparquet import ParquetFile

In [164]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score


## Description de la cellule
La cellule suivant permet d'importer un fichier .csv

In [165]:
def read_csv(file):
    return pd.read_csv(file)

## Description
Importation des fichiers

In [166]:
csv_data = {}

for file in os.listdir("csv"):
    csv_data[file.split('.')[0]]=read_csv("csv/"+file)

## Description de la cellule
Importation du fichier parquet

In [167]:
dp = ParquetFile('weather.parquet')
weather = dp.to_pandas()

# ATTENTION
On va maintenant rassembler les données et construire le dataSet

### Construction principale

In [168]:
data_set = pd.DataFrame({
        "driverId": csv_data['results']['driverId'],
        "constructorId": csv_data['results']['constructorId'],
        "position": csv_data['results']['position'],
        "positionOrder": csv_data['results']['positionOrder']
})

## Ajout et adaptation

In [170]:
csv_data['constructor_results']['constructor_points_moy'] = csv_data['constructor_results'].groupby('constructorId')['points'].transform('mean')

In [171]:
data_set = data_set.merge(csv_data['constructor_results'][['constructorId','constructor_points_moy']], on='constructorId', how="outer")

In [172]:
data_set['position']=data_set['position'].apply(lambda x: 0 if isinstance(x, str) else x)
data_set['driverId'].astype('str')

0            1
1            1
2            1
3            1
4            1
          ... 
9721552    817
9721553    817
9721554    817
9721555    817
9721556    817
Name: driverId, Length: 9721557, dtype: object

In [173]:
data_set.drop_duplicates(inplace=True)

### On retire les pilotes qui ne sont plus en activité

In [174]:
active_driver_ids = [1, 3, 4, 5, 6, 7, 10, 14, 16, 18, 20, 22, 23, 24, 27, 31, 44, 55, 63, 77]
data_set = data_set[data_set['driverId'].isin(active_driver_ids)]

## On mets tous les identifiants en tant que chaîne de caractère

In [175]:
data_set['driverId'].astype('str')
data_set['constructorId'].astype('str')

0            1
917          1
1834         1
2751         1
4585         1
          ... 
9711519    214
9711597    214
9712065    214
9713937    214
9715887    214
Name: constructorId, Length: 1129, dtype: object

## On encode les valeurs str

In [176]:
y_bis = data_set.loc[:,:"position"]
y_bis.drop(["driverId", "constructorId"], axis=1, inplace=True)

In [177]:
target_encoder = TargetEncoder()
data_set['driverId_encoded'] = target_encoder.fit_transform(data_set['driverId'], y_bis['position'])
data_set['constructorId_encoded'] = target_encoder.fit_transform(data_set['constructorId'], y_bis['position'])



## On défini x et y

In [178]:
X = data_set[['driverId_encoded', 'constructorId_encoded', 'position', 'positionOrder', 'constructor_points_moy']]
y = data_set['position']

# Division


In [179]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [180]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [181]:
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

In [182]:
accuracy_score_test = accuracy_score(y_test, y_test_pred)
accuracy_score_train = accuracy_score(y_train, y_train_pred)

In [183]:
print(accuracy_score_train)
print(accuracy_score_test)

1.0
1.0
