In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [34]:
# Load the datasets
train = pd.read_csv("train_simp.csv")
test = pd.read_csv("test_simp.csv")
sample_submission = pd.read_csv("home-data-for-ml-course/sample_submission.csv")

# Define Features (X) and Target (y)
y_train = train["SalePrice"]  # Target variable
X_train = train.drop(columns=["SalePrice"])  # Remove target from train data
X_test = test.copy()  # Test dataset
y_test = sample_submission["SalePrice"]  # Ground Truth for Test Set


In [35]:
def train_and_evaluate(model, param_grid, model_name):
    print(f"\nüöÄ Training {model_name}...\n")
    
    # Perform Grid Search with Cross-Validation
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,  # 5-fold cross-validation
        scoring="r2",
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)

    # Get best hyperparameters
    best_params = grid_search.best_params_
    print(f"‚úÖ Best Parameters for {model_name}: {best_params}")

    # Train Model with Best Parameters
    best_model = model.set_params(**best_params)
    best_model.fit(X_train, y_train)

    # Evaluate Model on Training Data
    y_train_pred = best_model.predict(X_train)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)

    print("\nüìä Model Performance on Training Set:")
    print(f"üìà Training R¬≤ Score: {train_r2:.4f}")
    print(f"üìâ Training MAE: {train_mae:.2f}")
    print(f"üìâ Training RMSE: {train_rmse:.2f}")

    # Evaluate Model on Test Set
    y_test_pred = best_model.predict(X_test)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

    print("\nüìä Model Performance on Test Set:")
    print(f"üìà Test R¬≤ Score: {test_r2:.4f}")
    print(f"üìâ Test MAE: {test_mae:.2f}")
    print(f"üìâ Test RMSE: {test_rmse:.2f}")

    # Save Predictions
    submission = pd.DataFrame({"Id": sample_submission["Id"], "SalePrice": y_test_pred})
    submission.to_csv(f"{model_name}simplified.csv", index=False)
    print(f"üìÅ Predictions saved to {model_name}simplified.csv\n")


In [36]:
xgb_params = {
    "n_estimators": [100, 300],
    "max_depth": [5, 10],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.7, 1],
    "colsample_bytree": [0.7, 1]
}

train_and_evaluate(XGBRegressor(random_state=42), xgb_params, "XGBoost")



üöÄ Training XGBoost...

‚úÖ Best Parameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}





üìä Model Performance on Training Set:
üìà Training R¬≤ Score: 0.9850
üìâ Training MAE: 7231.22
üìâ Training RMSE: 9738.18

üìä Model Performance on Test Set:
üìà Test R¬≤ Score: -10.8815
üìâ Test MAE: 46420.78
üìâ Test RMSE: 56918.29
üìÅ Predictions saved to XGBoostsimplified.csv





In [37]:
lgbm_params = {
    "n_estimators": [100, 300],
    "max_depth": [5, 10],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.7, 1],
    "colsample_bytree": [0.7, 1]
}

train_and_evaluate(LGBMRegressor(random_state=42), lgbm_params, "LightGBM")



üöÄ Training LightGBM...

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2517
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 109
[LightGBM] [Info] Start training from score 180932.919067
‚úÖ Best Parameters for LightGBM: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.7}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2517
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 109
[LightGBM] [Info] Start training from score 180932.919067

üìä Model Performance on Training Set:
üìà Training R¬≤ Score: 0.9843
üìâ



In [38]:
# catboost_params = {
#     "iterations": [100, 300],
#     "depth": [6, 10],
#     "learning_rate": [0.01, 0.1],
#     "l2_leaf_reg": [3, 5]
# }

# train_and_evaluate(CatBoostRegressor(random_state=42, verbose=0), catboost_params, "CatBoost")
