In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import joblib


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, r2_score, mean_absolute_error, mean_squared_error
import joblib

# ---------------------------------------------------------------------------
# Random-forest house-price model: tune with grid search, report train/CV
# metrics, write the competition submission and persist the fitted model.
# The model is trained on log1p(SalePrice); predictions are mapped back to
# prices with expm1 before they are reported or submitted.
# ---------------------------------------------------------------------------

# Settings shared by the hyperparameter search and the cross-validation report
RANDOM_STATE = 42
CV_FOLDS = 5
# The target handed to scikit-learn is already log1p(SalePrice), so the plain
# mean squared error on it IS the mean squared log error of the prices.
# Scoring with "neg_mean_squared_log_error" here would apply log1p a second
# time and optimise a log-of-log error instead of the RMSLE.
LOG_TARGET_SCORING = "neg_mean_squared_error"

# Load the datasets
train = pd.read_csv("../Housing-prices-competition/final_data.csv")
test = pd.read_csv("../Housing-prices-competition/final_test_data.csv")
sample_submission = pd.read_csv("../Housing-prices-competition/home-data-for-ml-course/sample_submission.csv")

# Define Features (X) and Target (y)
y_train = train["SalePrice"]
X_train = train.drop(columns=["SalePrice"])
# NOTE(review): assumes final_test_data.csv has exactly the feature columns of
# X_train (same names, no extras) — confirm against the preprocessing step.
X_test = test.copy()

# Log transformation of target variable
y_train_log = np.log1p(y_train)

# Define hyperparameter grid
# 4 * 4 * 3 * 3 * 2 * 2 = 576 candidates, each fitted CV_FOLDS times.
param_grid = {
    "n_estimators": [100, 300, 500, 700],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False]
}

# Perform Grid Search
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=RANDOM_STATE),
    param_grid,
    cv=CV_FOLDS,
    scoring=LOG_TARGET_SCORING,
    n_jobs=-1
)

grid_search.fit(X_train, y_train_log)

# Get best hyperparameters
best_params = grid_search.best_params_
print("Best Parameters Found:", best_params)

# Model with best hyperparameters.
# GridSearchCV uses refit=True by default, so it has already refitted the best
# configuration (same random_state) on the full training set; reuse that
# estimator instead of building and fitting an identical forest again.
best_rf_model = grid_search.best_estimator_

# Evaluate model on training data (metrics are reported in price units)
y_train_pred_log = best_rf_model.predict(X_train)
y_train_pred = np.expm1(y_train_pred_log)

train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
# Here the inputs are raw prices, so mean_squared_log_error is the right call;
# the clamp guards against tiny negative values after expm1.
train_rmsle = np.sqrt(mean_squared_log_error(y_train, np.maximum(y_train_pred, 0)))

print("\nModel Performance on Training Set:")
print(f"Training R² Score: {train_r2:.4f}")
print(f"Training MAE: {train_mae:.2f}")
print(f"Training RMSE: {train_rmse:.2f}")
print(f"Training RMSLE: {train_rmsle:.4f}")

# Cross-validation RMSLE
# Per-fold RMSE on the log1p target == per-fold RMSLE on the prices.
cv_scores = cross_val_score(
    best_rf_model,
    X_train,
    y_train_log,
    cv=CV_FOLDS,
    scoring=LOG_TARGET_SCORING,
)
cv_rmsle = np.sqrt(-cv_scores)
print(f"Cross-Validation RMSLE: {cv_rmsle.mean():.4f}")

# Make predictions on test set
y_test_pred_log = best_rf_model.predict(X_test)
y_test_pred = np.expm1(y_test_pred_log)

# Prepare submission file
# NOTE(review): Ids come from sample_submission, not from the test frame —
# assumes both files list the houses in the same row order; confirm.
submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": y_test_pred})
submission.to_csv("submission_final_billal.csv", index=False)
print("\nPredictions saved to submission_final_billal.csv")

# Save the trained model
joblib.dump(best_rf_model, 'best_model.pkl')


Best Parameters Found: {'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 100}

 Model Performance on Training Set:
 Training R² Score: 0.9760
 Training MAE: 6997.50
 Training RMSE: 12290.69
 Training RMSLE: 0.0672

 Predictions saved to submission_final_billal.csv 


['best_model.pkl']