In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib

In [2]:
# Load the pre-split feature matrices (kept as DataFrames).
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/X_train.csv",
        "../data/processed/X_test.csv",
    )
)

# Load the targets and flatten each one to a 1-D array.
y_train, y_test = (
    pd.read_csv(csv_path).to_numpy().ravel()
    for csv_path in (
        "../data/processed/y_train.csv",
        "../data/processed/y_test.csv",
    )
)

# Load the persisted scaler object.
# NOTE(review): `scaler` is not referenced by any cell visible here — presumably
# the CSVs were already scaled upstream; confirm before relying on it.
scaler = joblib.load("../data/processed/scaler.pkl")

In [3]:
# Hyperparameter grids, keyed by model name (looked up as param_grid[name]
# when the grid search is built).
param_grid = {}
param_grid["Linear Regression"] = {}  # no hyperparameters to tune
param_grid["Ridge"] = dict(alpha=[0.1, 1, 10, 50, 100])
param_grid["Lasso"] = dict(alpha=[0.001, 0.01, 0.1, 1, 10])
param_grid["Random Forest"] = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

In [4]:
# Candidate estimators, in the order they will be searched.
# Only the random forest takes a seed here (random_state=42).
models = dict(
    zip(
        ("Linear Regression", "Ridge", "Lasso", "Random Forest"),
        (
            LinearRegression(),
            Ridge(),
            Lasso(),
            RandomForestRegressor(random_state=42),
        ),
    )
)


In [5]:
# Run a 5-fold GridSearchCV (scored on R²) for every model, evaluate the best
# estimator of each search on the test set, and save it under ../models/.

results = {}  # model name -> {"Best Params", "R2_test", "RMSE_test", "MAE_test"}

for name, model in models.items():
    print(f"\nüîπ GridSearch pour {name} ...")

    grid = GridSearchCV(
        model,
        param_grid[name],
        cv=5,
        scoring="r2",
        n_jobs=-1
    )

    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Evaluate the selected estimator on the test set
    y_pred = best_model.predict(X_test)

    results[name] = {
        "Best Params": grid.best_params_,
        "R2_test": r2_score(y_test, y_pred),
        # RMSE computed as sqrt(MSE): the `squared=False` keyword was deprecated
        # in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
        "RMSE_test": float(np.sqrt(mean_squared_error(y_test, y_pred))),
        "MAE_test": mean_absolute_error(y_test, y_pred)
    }

    # Save the model, e.g. "Random Forest" -> ../models/random_forest_best.pkl
    joblib.dump(best_model, f"../models/{name.replace(' ', '_').lower()}_best.pkl")


üîπ GridSearch pour Linear Regression ...





üîπ GridSearch pour Ridge ...





üîπ GridSearch pour Lasso ...

üîπ GridSearch pour Random Forest ...




In [6]:
# Summarise the test-set metrics, best R² first, and export them to CSV.
# Transposing a dict-of-dicts leaves every column with object dtype;
# infer_objects() restores a float dtype on the metric columns so they are
# sorted and formatted as numbers ("Best Params" stays object).
results_df = pd.DataFrame(results).T.infer_objects()
results_df = results_df.sort_values(by="R2_test", ascending=False)
print("\nüìä R√©sultats finaux sur le jeu de test :\n")
print(results_df)

results_df.to_csv("../data/processed/final_model_results.csv", index=True)
# Only the results table is written to ../data/processed/; the fitted models
# are dumped to ../models/ by the grid-search loop, so the message says so.
print("\n‚úÖ R√©sultats sauvegard√©s dans ../data/processed/ (mod√®les dans ../models/)")


üìä R√©sultats finaux sur le jeu de test :

                                                         Best Params  \
Ridge                                                 {'alpha': 100}   
Lasso                                                  {'alpha': 10}   
Linear Regression                                                 {}   
Random Forest      {'max_depth': None, 'min_samples_split': 5, 'n...   

                    R2_test      RMSE_test       MAE_test  
Ridge              0.072371  897650.975774   156387.68502  
Lasso              0.071531  898057.137665    157737.1169  
Linear Regression   0.07153  898057.596809  157741.677229  
Random Forest      0.069757  898915.067088   147967.60848  

‚úÖ Mod√®les et r√©sultats sauvegard√©s dans ../data/processed/
