In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the preprocessed training features.
X_train = pd.read_csv("../data/processed/X_train.csv")
# Load the target file and flatten it to a 1-D array.
y_train = (
    pd.read_csv("../data/processed/y_train.csv")
    .to_numpy()
    .ravel()
)

### Test sur la régression linéaire, Ridge, Lasso et Random Forest

In [3]:
# Candidate models to compare.
def _with_scaling(estimator):
    """Return a two-step pipeline: StandardScaler ("scaler") then `estimator` ("model")."""
    return Pipeline([("scaler", StandardScaler()), ("model", estimator)])


# The three linear models are standardized first; the random forest is used as-is.
models = {
    "Linear Regression": _with_scaling(LinearRegression()),
    "Ridge": _with_scaling(Ridge(alpha=10)),
    "Lasso": _with_scaling(Lasso(alpha=0.01)),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
}

In [4]:
# Cross-validation splitter: 5 shuffled folds with a fixed seed.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [5]:
# Helper used to evaluate each candidate model.
def evaluate_model(model, X, y, cv=None):
    """Cross-validate `model` on (X, y) and summarize R2, RMSE and MAE.

    All three metrics come from a single `cross_validate` call, so the model
    is fitted once per fold (instead of once per fold and per metric) and
    every metric is computed on the same train/test splits.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to `cross_validate`.
    X : features passed to `cross_validate`.
    y : target passed to `cross_validate`.
    cv : optional
        Cross-validation splitter. When None (the default), the
        notebook-level `kf` is used, as before.

    Returns
    -------
    dict
        Mean and standard deviation across folds for each metric, under the
        keys "R2_mean", "R2_std", "RMSE_mean", "RMSE_std", "MAE_mean",
        "MAE_std".
    """
    splitter = kf if cv is None else cv
    scoring = {
        "r2": "r2",
        "rmse": "neg_root_mean_squared_error",
        "mae": "neg_mean_absolute_error",
    }
    scores = cross_validate(model, X, y, cv=splitter, scoring=scoring)

    r2 = scores["test_r2"]
    # The error scorers are negated ("neg_..."), so flip the sign to report
    # positive error values.
    rmse = -scores["test_rmse"]
    mae = -scores["test_mae"]

    return {
        "R2_mean": r2.mean(), "R2_std": r2.std(),
        "RMSE_mean": rmse.mean(), "RMSE_std": rmse.std(),
        "MAE_mean": mae.mean(), "MAE_std": mae.std()
    }

In [6]:
# Run the cross-validated evaluation for every candidate model,
# collecting one metrics dict per model name.
results = dict()
for name in models:
    model = models[name]
    print(f"🔹 Entraînement : {name}")
    results[name] = evaluate_model(model, X_train, y_train)

🔹 Entraînement : Linear Regression
🔹 Entraînement : Ridge
🔹 Entraînement : Lasso
🔹 Entraînement : Random Forest


In [7]:
# Build the summary table: one row per model, sorted by mean R2 (highest first).
results_df = (
    pd.DataFrame(results)
    .T
    .sort_values(by="R2_mean", ascending=False)
)

print("\n📊 Résultats Cross-Validation (5-folds) :\n")
print(results_df)



📊 Résultats Cross-Validation (5-folds) :

                    R2_mean    R2_std      RMSE_mean       RMSE_std  \
Ridge              0.510863  0.169782  291404.317979  106640.062685   
Lasso              0.510607  0.169972  291436.597195  106594.381618   
Linear Regression  0.510607  0.169972  291436.597502  106594.381070   
Random Forest      0.427884  0.165840  314236.284497  101799.074947   

                        MAE_mean       MAE_std  
Ridge              133837.432481   7592.989273  
Lasso              134009.451230   7557.639364  
Linear Regression  134009.455117   7557.640455  
Random Forest      126458.003588  10358.117145  


In [8]:
# Write the cross-validation summary to CSV (model names are kept as the index column).
results_df.to_csv(
    path_or_buf="../data/processed/model_selection_results.csv",
    index=True,
)
print("\n✅ Résultats sauvegardés dans ../data/processed/model_selection_results.csv")


# Last expression of the cell: shows the first rows of the table as output.
results_df.head()


✅ Résultats sauvegardés dans ../data/processed/model_selection_results.csv


Unnamed: 0,R2_mean,R2_std,RMSE_mean,RMSE_std,MAE_mean,MAE_std
Ridge,0.510863,0.169782,291404.317979,106640.062685,133837.432481,7592.989273
Lasso,0.510607,0.169972,291436.597195,106594.381618,134009.45123,7557.639364
Linear Regression,0.510607,0.169972,291436.597502,106594.38107,134009.455117,7557.640455
Random Forest,0.427884,0.16584,314236.284497,101799.074947,126458.003588,10358.117145
