In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score
import xgboost as xgb

# --- Data loading ---
# Read the labelled training table, the unlabelled test table, and the sample
# submission (used further down as the source of the "Id" column).
train = pd.read_csv("../Housing-prices-competition/final_data_nv.csv")
test = pd.read_csv("../Housing-prices-competition/final_test_data_nv.csv")
sample_submission = pd.read_csv("../Housing-prices-competition/home-data-for-ml-course/sample_submission.csv")

# --- Predictors / target ---
# "SalePrice" is the regression target; every other column is a predictor.
# The test frame is copied so `test` itself stays exactly as loaded.
target = train["SalePrice"]
features = train.drop(columns=["SalePrice"])
X_test = test.copy()

# --- Hold-out split ---
# 20% of the labelled rows are held out for validation; the seed is fixed so
# the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# --- Column alignment ---
# join='left' keeps exactly X_train's columns; any of those columns missing
# from the other frame is created there and filled with 0.
X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Hyperparameter search space for the Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Randomized search: 30 sampled candidates, 3-fold CV, scored by negated RMSE.
# refit=True is the default and is spelled out here because the code below
# relies on it: once CV finishes, the search retrains the winning configuration
# on all of (X_train, y_train) and exposes it as `best_estimator_`.
rf_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    rf_param_grid,
    n_iter=30,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    refit=True,
    verbose=1
)

rf_search.fit(X_train, y_train)
print("Best RF Parameters:", rf_search.best_params_)

# The refit step has already trained this estimator on (X_train, y_train), so
# it is used as-is. Fitting it again on the same data with the same fixed
# random_state would only repeat the work.
best_rf_model = rf_search.best_estimator_

# Evaluate on the held-out validation set (rows the search never saw)
val_predictions = best_rf_model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
print("RF Validation RMSE:", rmse_val)

# Retrain on train + validation rows before predicting the test set.
# NOTE: best_rf_model is the same object as rf_search.best_estimator_, so from
# here on the search object's estimator is the full-data model and the RMSE
# printed above no longer describes it.
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])
best_rf_model.fit(X_full, y_full)

# Persist the full-data model
joblib.dump(best_rf_model, 'best_rf_model.pkl')

# Predict the test set and write the submission file.
# NOTE(review): the "Id" column is taken from sample_submission, which assumes
# its rows are in the same order as the test file — confirm.
test_predictions = best_rf_model.predict(X_test)
submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": test_predictions})
submission.to_csv("submission_rf.csv", index=False)

print("Submission file 'submission_rf.csv' created successfully!")

# Hyperparameter search space for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Randomized search: 30 sampled candidates, 3-fold CV, scored by negated RMSE.
# refit=True is the default and is spelled out here because the code below
# relies on it: once CV finishes, the search retrains the winning configuration
# on all of (X_train, y_train) and exposes it as `best_estimator_`.
xgb_search = RandomizedSearchCV(
    xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    xgb_param_grid,
    n_iter=30,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    refit=True,
    verbose=1
)

xgb_search.fit(X_train, y_train)
print("Best XGBoost Parameters:", xgb_search.best_params_)

# The refit step has already trained this estimator on (X_train, y_train), so
# it is used as-is. Fitting it again on the same data with the same fixed
# random_state would only repeat the work.
best_xgb_model = xgb_search.best_estimator_

# Evaluate on the held-out validation set (rows the search never saw)
val_predictions = best_xgb_model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_predictions))
print("XGBoost Validation RMSE:", rmse_val)

# Retrain on train + validation rows (X_full / y_full were built earlier in
# this cell) and persist the model.
# NOTE: best_xgb_model is the same object as xgb_search.best_estimator_, so from
# here on the search object's estimator is the full-data model.
best_xgb_model.fit(X_full, y_full)
joblib.dump(best_xgb_model, 'best_xgb_model.pkl')

# Predict the test set and write the submission file.
# NOTE(review): the "Id" column is taken from sample_submission, which assumes
# its rows are in the same order as the test file — confirm.
test_predictions = best_xgb_model.predict(X_test)
submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": test_predictions})
submission.to_csv("submission_xgb.csv", index=False)

print("Submission file 'submission_xgb.csv' created successfully!")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best RF Parameters: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 15, 'bootstrap': True}
RF Validation RMSE: 21274.029851086365
Submission file 'submission_rf.csv' created successfully!
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best XGBoost Parameters: {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
XGBoost Validation RMSE: 18383.433522435887
Submission file 'submission_xgb.csv' created successfully!
