In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the datasets (paths are relative to the notebook's working directory).
train = pd.read_csv("final_data.csv")
test = pd.read_csv("final_test_data.csv")
# Use a forward slash as the separator: it works on every OS, whereas the
# backslash form made "\s" an invalid string escape and only resolved on Windows.
sample_submission = pd.read_csv("home-data-for-ml-course/sample_submission.csv")

# Define features (X) and target (y).
y_train = train["SalePrice"]  # Target variable
X_train = train.drop(columns=["SalePrice"])  # Every remaining column is a feature

# Independent copy of the test frame, so later edits cannot touch `test`.
# NOTE(review): assumes the test file has exactly the same feature columns as
# X_train (no "SalePrice", no extra "Id" column) — confirm against the
# preprocessing notebook.
X_test = test.copy()

# Candidate hyperparameter values explored by the grid search
# (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid: 3-fold cross-validation scored by R²,
# using all available CPU cores. The seed is fixed for reproducible forests.
base_forest = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print("Best Parameters Found:", best_params)

# Final model with the best hyperparameters.
# GridSearchCV refits the best candidate on the full training set by default
# (refit=True), with the same hyperparameters and random_state=42 as the base
# estimator, so reuse that fitted forest instead of training an identical one
# a second time.
best_rf_model = grid_search.best_estimator_

# Evaluate the model on the training data.
# NOTE: these are in-sample metrics and therefore optimistic; the
# cross-validated score printed at the end is the more honest estimate of
# how the model generalizes.
y_train_pred = best_rf_model.predict(X_train)

# Compute metrics.
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in later
# releases, while this form works on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Display model performance.
print("\n Model Performance on Training Set:")
print(f" Training R² Score: {train_r2:.4f}")
print(f" Training MAE: {train_mae:.2f}")
print(f" Training RMSE: {train_rmse:.2f}")
# Mean held-out R² of the best grid-search candidate across the CV folds.
print(f" Cross-validated R² Score: {grid_search.best_score_:.4f}")

# Predict sale prices for the test set.
y_test_pred = best_rf_model.predict(X_test)

# Build and write the submission file.
# NOTE(review): assumes the rows of sample_submission are in the same order
# (and of the same count) as the rows of X_test — confirm.
submission_path = "submission_final_data_bilel.csv"
submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": y_test_pred})
submission.to_csv(submission_path, index=False)
# Report the path actually written (the old message named a different file).
print(f"\n Predictions saved to {submission_path} ")

✅ Best Parameters Found: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 300}

📊 Model Performance on Training Set:
📈 Training R² Score: 0.9711
📉 Training MAE: 8348.64
📉 Training RMSE: 13501.78

📁 Predictions saved to submission_dhia_notSkewed_PCA.csv ✅


