<a href="https://colab.research.google.com/github/vengottip/Work/blob/main/XGBoostRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
def remove_features(X, features_to_remove):
    """
    Removes specified features from the DataFrame.

    Args:
    - X (pd.DataFrame): The original DataFrame.
    - features_to_remove (list): List of column names to remove from X.

    Returns:
    - pd.DataFrame: DataFrame after removing specified features.
    """
    return X.drop(columns=features_to_remove, errors='ignore')


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Loop through each fold (from fold1 to fold10)
for fold_num in range(1, 11):
    print(f"Processing fold{fold_num}...")
    np.random.seed(1390)
    # Set the paths for the current fold
    folder = f'fold{fold_num}'
    train_file_path = os.path.join(folder, 'train.csv')
    test_file_path = os.path.join(folder, 'test.csv')
    test_y_file_path = os.path.join(folder, 'test_y.csv')

    # Load the train, test, and test_y datasets
    housing_data_train = pd.read_csv(train_file_path)
    housing_data_test = pd.read_csv(test_file_path)
    test_y_data = pd.read_csv(test_y_file_path)  # Load the actual Sale Price for the test set

    # EXperiment starts
    # Define the list of features to remove (this is just an example, adjust as needed)
    features_to_remove = ["Longitude", "Latitude", "Street", "Utilities", "Condition_2",
                      "Roof_Matl", "Heating", "Pool_QC", "Misc_Feature", "Misc_Val",
                        "Low_Qual_Fin_SF", "Pool_Area"]  # Specify the features you want to remove
    # Call the remove_features function to remove unwanted features from X_trn
    housing_data_train = remove_features(housing_data_train, features_to_remove)

    # Create X_train by removing the SalePrice column from housing_data_train
    X_train = housing_data_train.drop('Sale_Price', axis=1)

    # Create y_train by selecting only the SalePrice column from housing_data_train
    y_train = np.log(housing_data_train['Sale_Price'])
    #print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

    # Create X_test from the housing_data_test
    X_test = housing_data_test

    # Create y_test by selecting only the SalePrice column from the test_y_data
    y_test = test_y_data['Sale_Price']
    #print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Apply one-hot encoding to both X_train and X_test
    X_train = pd.get_dummies(X_train)
    X_test = pd.get_dummies(X_test)

    # Align the columns of X_train and X_test so they have the same structure
    X_train, X_test = X_train.align(X_test, join='left', axis=1)

    # Fill any missing columns in X_test (which may occur due to one-hot encoding) with 0
    X_test = X_test.fillna(0)

    #print(f"Aligned X_train shape: {X_train.shape}, Aligned X_test shape: {X_test.shape}")

    # Train a RandomForestRegressor
    #rfModel = RandomForestRegressor(n_estimators=400, oob_score=True, max_features=1.0/3)
    #rfModel.fit(X_train, y_train)

    # Predict on the test set
    #yhat_test_rf = rfModel.predict(X_test)

    # Calculate Test RMSE for Random Forest
    #test_rmse_rf = np.sqrt(mean_squared_error(np.log(y_test), yhat_test_rf))
    #print(f"Fold{fold_num} - Random Forest Test RMSE: {test_rmse_rf}")

    # Calculate Train RMSE for Random Forest (using the out-of-bag predictions)
    #yhat_train_oob_rf = rfModel.oob_prediction_
    #train_rmse_rf = np.sqrt(mean_squared_error(y_train, yhat_train_oob_rf))
    #print(f"Fold{fold_num} - Random Forest Train RMSE (OOB): {train_rmse_rf}")

    """
    # Define the model
    xgbModel = XGBRegressor(random_state=42)

    # Define the parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500],  # Number of trees
        'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
        'max_depth': [3, 4, 5, 6],  # Maximum depth of trees
        'subsample': [0.6, 0.8, 1.0],  # Fraction of samples to train on
        'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features used per tree
        'gamma': [0, 0.1, 0.2],  # Minimum loss reduction
        'reg_alpha': [0, 0.01, 0.1],  # L1 regularization
        'reg_lambda': [1, 0.1, 0.01]  # L2 regularization
    }

    # Randomized search with cross-validation
    random_search = RandomizedSearchCV(xgbModel, param_distributions=param_grid,
                                      n_iter=50, scoring='neg_root_mean_squared_error',
                                      cv=5, verbose=1, random_state=42, n_jobs=-1)

    # Fit the random search model
    random_search.fit(X_train, y_train)

    # Best hyperparameters and RMSE
    print("Best parameters found: ", random_search.best_params_)
    print("Best RMSE score: ", -random_search.best_score_)
    """
    # Train an XGBRegressor
    #xgbModel = XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=3, random_state=42)
    # Replace with best parameters from RandomizedSearchCV
    xgbModel = XGBRegressor(n_estimators=400,
                            learning_rate=0.05,
                            max_depth=4,
                            subsample=0.8,
                            colsample_bytree=0.6,
                            reg_lambda=0.1,
                            reg_alpha=0.01,
                            gamma=0,
                            random_state=42)
    # Accessing best parameters directly from random_search.best_params_
    #best_params = random_search.best_params_

    # Train the XGBRegressor with the best parameters found from RandomizedSearchCV
    #xgbModel = XGBRegressor(**best_params, random_state=42)

    xgbModel.fit(X_train, y_train)

    # Predict on the training set (in-sample predictions)
    yhat_train_xgb = xgbModel.predict(X_train)

    # Calculate Train RMSE for XGBoost
    train_rmse_xgb = np.sqrt(mean_squared_error(y_train, yhat_train_xgb))
    print(f"Fold{fold_num} - XGBoost Train RMSE: {train_rmse_xgb}")

    # Predict on the test set using XGBoost
    yhat_test_xgb = xgbModel.predict(X_test)

    # Calculate Test RMSE for XGBoost
    test_rmse_xgb = np.sqrt(mean_squared_error(np.log(y_test), yhat_test_xgb))
    print(f"Fold{fold_num} - XGBoost Test RMSE: {test_rmse_xgb}")

    #print(f"Finished processing fold{fold_num}.\n")

print("Completed processing all folds.")


Processing fold1...
Fold1 - XGBoost Train RMSE: 0.05193366683570815
Fold1 - XGBoost Test RMSE: 0.1153433061435926
Processing fold2...
Fold2 - XGBoost Train RMSE: 0.049750192640706646
Fold2 - XGBoost Test RMSE: 0.11791694689180016
Processing fold3...
Fold3 - XGBoost Train RMSE: 0.051731474230395945
Fold3 - XGBoost Test RMSE: 0.11613088454525501
Processing fold4...
Fold4 - XGBoost Train RMSE: 0.05140311661923465
Fold4 - XGBoost Test RMSE: 0.10989938310462323
Processing fold5...
Fold5 - XGBoost Train RMSE: 0.05248526007996803
Fold5 - XGBoost Test RMSE: 0.10775687038225883
Processing fold6...
Fold6 - XGBoost Train RMSE: 0.05140486676469129
Fold6 - XGBoost Test RMSE: 0.12788898422166814
Processing fold7...
Fold7 - XGBoost Train RMSE: 0.05048320241997869
Fold7 - XGBoost Test RMSE: 0.131695504551595
Processing fold8...
Fold8 - XGBoost Train RMSE: 0.051276160306289884
Fold8 - XGBoost Test RMSE: 0.12434747652836972
Processing fold9...
Fold9 - XGBoost Train RMSE: 0.052266051753495434
Fold9 - XGB