In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.metrics import mean_squared_log_error
from scipy.stats import randint, uniform

In [2]:
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error
from scipy.stats import randint

# ---------------------------------------------------------------------------
# Random-forest pipeline: load data, impute, tune hyperparameters with a
# randomized search, report training metrics, then write the submission file
# and persist the fitted model.
# ---------------------------------------------------------------------------

# Load the datasets
train = pd.read_csv("../bilel/final_data_nv.csv")
test = pd.read_csv("../bilel/final_test_data_nv.csv")
sample_submission = pd.read_csv("../home-data-for-ml-course/sample_submission.csv")

# Define features (X) and target (y)
y_train = train["SalePrice"]                 # Target variable
X_train = train.drop(columns=["SalePrice"])  # Remove target from train data

X_test = test.copy()  # Keep test dataset separate from the loaded frame

# Fill NaN values in numeric columns with the median.
# This must happen BEFORE the search / fit so the model is trained and
# evaluated on the same imputed representation it later predicts on.
# The medians are computed on the training data only and reused for the test
# set, so both frames go through exactly the same transformation.
# `numeric_only=True` keeps this working on pandas >= 2.0 if a non-numeric
# column is present.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Define the hyperparameter distributions sampled by RandomizedSearchCV
param_dist = {
    'n_estimators': randint(100, 2000),        # Number of trees
    'max_depth': randint(1, 20),               # Maximum tree depth
    'min_samples_split': randint(2, 20),       # Minimum samples required to split an internal node
    'min_samples_leaf': randint(1, 10),        # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2', None],    # Number of features to consider at each split
    'criterion': ['squared_error', 'absolute_error']  # Split-quality criterion for regression
}

# Initialize the base RandomForestRegressor (fixed seed for reproducibility)
rf = RandomForestRegressor(bootstrap=True, oob_score=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,                                  # 50 sampled candidates
    scoring='neg_root_mean_squared_error',      # Higher is better, hence the negated RMSE
    random_state=42,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
    cv=3  # 3-fold cross-validation
)

# Fit RandomizedSearchCV (with the default refit=True this also refits the best
# candidate on the full training set)
random_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_
best_rmse = -random_search.best_score_  # Undo the sign flip; this is the mean RMSE over the CV folds
print("Best Parameters Found:", best_params)
print("Best RMSE on validation set: {:.4f}".format(best_rmse))

# The search has already refitted an estimator with the best parameters (and the
# same bootstrap / oob_score / random_state settings) on all of X_train, so it
# is reused here instead of training an identical forest a second time.
final_model = best_model

# Evaluate the model on the training data (optimistic: these are in-sample metrics)
y_train_pred = final_model.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
# Clip predictions at 0 because the log-error metric rejects negative values
train_rmsle = np.sqrt(mean_squared_log_error(y_train, np.maximum(y_train_pred, 0)))

# Display model performance
print("\n Model Performance on Training Set:")
print(f" Training R² Score: {train_r2:.4f}")
print(f" Training MAE: {train_mae:.2f}")
print(f" Training RMSE: {train_rmse:.2f}")
print(f" Training RMSLE: {train_rmsle:.4f}")

# Make predictions on the test set
y_test_pred = final_model.predict(X_test)

# Prepare the submission file; the path is kept in one variable so the file
# that is written and the message that is printed cannot drift apart.
submission_path = "submission_final_nouveau.csv"
submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": y_test_pred})
submission.to_csv(submission_path, index=False)
print(f"\n Predictions saved to {submission_path} ")

# Save the trained model
joblib.dump(final_model, 'best_model.pkl')
print("Model saved as best_model.pkl")



Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END criterion=squared_error, max_depth=15, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   2.3s
[CV] END criterion=squared_error, max_depth=15, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   2.3s
[CV] END criterion=squared_error, max_depth=15, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=221; total time=   2.3s
[CV] END criterion=absolute_error, max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=3, n_estimators=443; total time=   4.0s
[CV] END criterion=absolute_error, max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=3, n_estimators=443; total time=   3.9s
[CV] END criterion=absolute_error, max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=3, n_estimators=443; total time=   3.8s
[CV] END criterion=absolute_error, max_depth=6, max_features=log2, m