In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import joblib


In [2]:
#load the Datasets
train = pd.read_csv("../Housing-prices-competition/final_data.csv")
test = pd.read_csv("../Housing-prices-competition/final_test_data.csv")
sample_submission = pd.read_csv("../Housing-prices-competition/home-data-for-ml-course/sample_submission.csv")

#define Features (X) and Target (y)
y_train = train["SalePrice"]  # Target variable
X_train = train.drop(columns=["SalePrice"])  # Remove target from train data

X_test = test.copy()  # Keep test dataset separate

#Define Hyperparameter Grid for GridSearchCV
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15],
    "min_samples_split": [2, 5, 10],

}

#Perform Grid Search to Find Best Hyperparameters
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

#Get Best Hyperparameters
best_params = grid_search.best_params_
print("Best Parameters Found:", best_params)

#Train Model with Best Parameters
best_rf_model = RandomForestRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    random_state=42
)
best_rf_model.fit(X_train, y_train)

#Evaluate Model on Training Data
y_train_pred = best_rf_model.predict(X_train)

#Compute Metrics
from sklearn.metrics import mean_squared_log_error

train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_rmsle = np.sqrt(mean_squared_log_error(y_train, np.maximum(y_train_pred, 0)))

#Display Model Performance
print("\n Model Performance on Training Set:")
print(f" Training R² Score: {train_r2:.4f}")
print(f" Training MAE: {train_mae:.2f}")
print(f" Training RMSE: {train_rmse:.2f}")
print(f" Training RMSLE: {train_rmsle:.4f}")

#Make Predictions on Test Set
y_test_pred = best_rf_model.predict(X_test)

#Prepare Submission File
submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": y_test_pred})
submission.to_csv("submission_final_billal.csv", index=False)
print("\n Predictions saved to submission_final_billal.csv ")
joblib.dump(best_rf_model, 'best_model.pkl')


Best Parameters Found: {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 100}

 Model Performance on Training Set:
 Training R² Score: 0.9760
 Training MAE: 6997.50
 Training RMSE: 12290.69
 Training RMSLE: 0.0672

 Predictions saved to submission_final_billal.csv 


['best_model.pkl']