In [49]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [50]:
data = pd.read_csv('../data/voiture_model.csv')
data

Unnamed: 0,etat_de_route,turbo,type_vehicule,roues_motrices,emplacement_moteur,marque,modele,empattement,longueur_voiture,largeur_voiture,poids_vehicule,taille_moteur,taux_alésage,chevaux,consommation_ville,consommation_autoroute,prix
0,3,standard,cabriolet,propulsion,avant,ALFA-ROMERO,GIULIA,225.0,428.8,162.8,1155.752416,130,88.1,111,11.20,8.71,13495.0
1,3,standard,cabriolet,propulsion,avant,ALFA-ROMERO,STELVIO,225.0,428.8,162.8,1155.752416,130,88.1,111,11.20,8.71,16500.0
2,1,standard,hayon,propulsion,avant,ALFA-ROMERO,QUADRIFOGLIO,240.0,434.8,166.4,1280.490216,152,68.1,154,12.38,9.05,16500.0
3,2,standard,berline,traction,avant,AUDI,100LS,253.5,448.6,168.1,1060.044504,109,81.0,102,9.80,7.84,13950.0
4,2,standard,berline,quatre_roues_motrices,avant,AUDI,100LS,252.5,448.6,168.7,1280.943808,136,81.0,115,13.07,10.69,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,-1,standard,berline,propulsion,avant,VOLVO,145E(SW),277.1,479.6,175.0,1339.003584,141,96.0,114,10.23,8.40,16845.0
199,-1,turbo,berline,propulsion,avant,VOLVO,144EA,277.1,479.6,174.8,1383.002008,141,96.0,160,12.38,9.41,19045.0
200,-1,standard,berline,propulsion,avant,VOLVO,244DL,277.1,479.6,175.0,1366.219104,173,90.9,134,13.07,10.23,21485.0
201,-1,turbo,berline,propulsion,avant,VOLVO,246,277.1,479.6,175.0,1459.205464,145,76.5,106,9.05,8.71,22470.0


In [51]:
# Préparez vos données (séparez les caractéristiques et la cible)
X = data.drop(['prix'], axis=1)
y = data['prix']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 , random_state=42)

# Identifiez les colonnes numériques et catégorielles
numerical_columns = ['empattement', 'longueur_voiture', 'largeur_voiture', 'poids_vehicule', 'taille_moteur', 'taux_alésage', 'chevaux', 'consommation_ville', 'consommation_autoroute']
categorical_columns = ['etat_de_route', 'turbo', 'type_vehicule', 'roues_motrices', 'emplacement_moteur', 'marque', 'modele']


In [52]:
# Créer un transformateur pour les variables numériques
num_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))
])

# Créer un transformateur pour les variables catégorielles
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Créer un préprocesseur pour combiner les deux transformateurs
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_columns),
        ('cat', cat_transformer, categorical_columns)
    ])

In [53]:
ridge_params = {
    'ridge_regression__alpha': np.logspace(-4, 4, 20),
    "ridge_regression__max_iter": [100, 200, 300],
}

random_forest_params = {
    'random_forest__n_estimators': [100, 200, 300],
    'random_forest__max_depth': [None, 10, 20, 30],
    'random_forest__min_samples_split': [2, 5, 10],
    'random_forest__min_samples_leaf': [1, 2, 4],
    'random_forest__max_features': ['auto', 'sqrt', 'log2'],
}

lasso_params = {
    'lasso_regression__alpha': np.logspace(-4, 4, 20),
    'lasso_regression__max_iter': [100, 200, 300],
    'lasso_regression__tol': [0.0001, 0.001, 0.01, 0.1, 1, 10],
}


In [54]:
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ridge_regression', Ridge(random_state=42))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestRegressor(random_state=42))
])

lasso_regression_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lasso_regression', Lasso(random_state=42))
])

In [55]:
ridge_grid = GridSearchCV(ridge_pipeline, param_grid=ridge_params, cv=5, scoring='r2')
random_forest_grid = GridSearchCV(random_forest_pipeline, param_grid=random_forest_params, cv=5, scoring='r2')
lasso_grid = GridSearchCV(lasso_regression_pipeline, param_grid=lasso_params, cv=5, scoring='r2')

In [56]:
ridge_grid.fit(X_train, y_train)
random_forest_grid.fit(X_train, y_train)
lasso_grid.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [57]:
print("Best parameters for Ridge Regression:", ridge_grid.best_params_)
print("Best score for Ridge Regression:", ridge_grid.best_score_)
print("Best parameters for Random Forest:", random_forest_grid.best_params_)
print("Best score for Random Forest:", random_forest_grid.best_score_)
print("Best parameters for Lasso Regression:", lasso_grid.best_params_)
print("Best score for Lasso Regression:", lasso_grid.best_score_)

Best parameters for Ridge Regression: {'ridge_regression__alpha': 0.615848211066026, 'ridge_regression__max_iter': 100}
Best score for Ridge Regression: 0.9041915587010028
Best parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__max_features': 'sqrt', 'random_forest__min_samples_leaf': 1, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 200}
Best score for Random Forest: 0.888530464000214
Best parameters for Lasso Regression: {'lasso_regression__alpha': 4.281332398719396, 'lasso_regression__max_iter': 300, 'lasso_regression__tol': 0.001}
Best score for Lasso Regression: 0.9117984806303692


In [58]:
y_pred_ridge_best = ridge_grid.predict(X_test)
y_pred_forest_best = random_forest_grid.predict(X_test)
y_pred_lasso_best = lasso_grid.predict(X_test)

rmse_forest_best = np.sqrt(mean_squared_error(y_test, y_pred_forest_best))
rmse_ridge_best = np.sqrt(mean_squared_error(y_test, y_pred_ridge_best))
rmse_lasso_best = np.sqrt(mean_squared_error(y_test, y_pred_lasso_best))

r2_forest_best = r2_score(y_test, y_pred_forest_best)
r2_ridge_best = r2_score(y_test, y_pred_ridge_best)
r2_lasso_best = r2_score(y_test, y_pred_lasso_best)

mae_forest_best = mean_absolute_error(y_test, y_pred_forest_best)
mae_ridge_best = mean_absolute_error(y_test, y_pred_ridge_best)
mae_lasso_best = mean_absolute_error(y_test, y_pred_lasso_best)

In [59]:
print("\nRandom Forest with best parameters:")
print("RMSE:", rmse_forest_best)
print("R2:", r2_forest_best)
print("MAE:", mae_forest_best)

print("\nRidge Regression with best parameters:")
print("RMSE:", rmse_ridge_best)
print("R2:", r2_ridge_best)
print("MAE:", mae_ridge_best)

print("\nLasso Regression with best parameters:")
print("RMSE:", rmse_lasso_best)
print("R2:", r2_lasso_best)
print("MAE:", mae_lasso_best)


Random Forest with best parameters:
RMSE: 2996.551095749178
R2: 0.8823431554138917
MAE: 1995.6688134165554

Ridge Regression with best parameters:
RMSE: 3202.754855955446
R2: 0.8655932070331988
MAE: 2201.1029526068282

Lasso Regression with best parameters:
RMSE: 2987.4493571481735
R2: 0.883056812854772
MAE: 2018.777073880075


In [60]:
results_df_best = pd.DataFrame({"Random Forest (Best)": y_pred_forest_best, "Ridge Regression (Best)": y_pred_ridge_best, "Lasso Regression (Best)": y_pred_lasso_best, "Actual": y_test})
results_df_best.head(10)

Unnamed: 0,Random Forest (Best),Ridge Regression (Best),Lasso Regression (Best),Actual
15,28959.80225,27024.298856,28179.410669,30760.0
9,18548.754856,20134.461692,20935.974709,17859.167
115,13954.983599,14842.646108,14760.325722,16630.0
78,6922.250923,5268.967423,5807.773885,6669.0
66,10577.366372,13390.470509,12702.35022,18344.0
45,7297.535023,6848.554884,7104.825352,8916.5
143,10774.369927,11698.649497,11361.430309,11259.0
177,17644.186563,19238.812654,18408.804732,15998.0
200,19730.488723,22081.427825,21067.932995,21485.0
180,8051.481043,8429.11208,8303.853868,7775.0
