In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from matplotlib.gridspec import GridSpec
import lightgbm as lgb
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
imputed_data = pd.read_csv("imputed_data_handle_multicollinearity.csv")
imputed_data=imputed_data.drop('Unnamed: 0',axis=1)
imputed_data.head()

Unnamed: 0,GrLivArea,SalePrice,OverallQual,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,BsmtFinType2_Unknown,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y
0,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
1,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,0,1
2,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
3,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,0,0,0,0,0,1,1
4,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,1,0,0,0,0,0,1


In [3]:
# Separate the independent variables (features) from the dependent variable (target)
X = imputed_data.drop('SalePrice', axis=1)
y = imputed_data['SalePrice']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Set up KFold with shuffle=True
kf = KFold(n_splits=5, shuffle=True)

# Instantiate the XGBRegressor
model = XGBRegressor()

# Define the parameter distribution for RandomizedSearchCV
param_dist = {'n_estimators': [100, 200, 300, 400, 500],
              'max_depth': [3, 5, 7, 10],
              'learning_rate': [0.01, 0.1, 0.5, 1],
              'gamma': [0, 0.1, 0.5, 1],
              'min_child_weight': [1, 5, 10],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1]}


# Instantiate the RandomizedSearchCV object
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=25, cv=kf, n_jobs=-1, error_score='raise')

try:
    # Fit the RandomizedSearchCV object to the training data
    random_search.fit(X_train, y_train)

except Exception as e:
    # Handle any errors that arise during fitting
    print("Error occurred during fitting:", e)
    
# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Train a XGBRegressor with cross-validation to get an estimate of the model's performance
train_r2_scores = []
test_r2_scores = []
for train_index, val_index in kf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    model = XGBRegressor(**random_search.best_params_)
    model.fit(X_train_fold, y_train_fold)
    train_r2_scores.append(model.score(X_train_fold, y_train_fold))
    test_r2_scores.append(model.score(X_val_fold, y_val_fold))
train_r2 = np.mean(train_r2_scores)
test_r2 = np.mean(test_r2_scores)

# Calculate the feature importance scores for each feature in the model
importances = model.feature_importances_

# Rank the features based on their importance scores
indices = importances.argsort()[::-1]

# Define the number of features
num_features = X_train.shape[1]

# Create empty lists to store results
num_features_list = []
test_r2_list = []

# Loop through the number of features and fit the model
for i in tqdm(range(1, 11)):
    selected_features = X_train.columns[indices[:i]]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    model_selected = XGBRegressor(**random_search.best_params_)
    model_selected.fit(X_train_selected, y_train)
    test_r2_selected = model_selected.score(X_test_selected, y_test)
    num_features_list.append(i)
    test_r2_list.append(test_r2_selected)

Best Hyperparameters: {'subsample': 0.75, 'n_estimators': 400, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.5}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:06<00:00,  1.43it/s]


n_estimators: the number of trees in the model. Higher values generally lead to better performance, but may also increase the risk of overfitting.

max_depth: the maximum depth of each tree. Higher values may capture more complex interactions in the data, but may also increase the risk of overfitting.

learning_rate: the step size shrinkage used in each boosting iteration. Lower values generally lead to better performance, but may require a larger number of iterations to converge.

gamma: the minimum loss reduction required to make a split in a leaf node. Higher values may lead to simpler trees and reduce overfitting, but may also underfit the data.

min_child_weight: the minimum sum of instance weight required in a child node. Higher values may lead to simpler trees and reduce overfitting, but may also underfit the data.

subsample: the fraction of instances to be randomly sampled for each tree. Lower values may reduce overfitting, but may also underfit the data.

colsample_bytree: the fraction of features to be randomly sampled for each tree. Lower values may reduce overfitting, but may also underfit the data.

In [4]:
# Create a dataframe to store the results
results_df = pd.DataFrame({'importance_rank': range(1, len(selected_features)+1),
                           'feature_name': selected_features,
                           'test_r2': test_r2_list})

# Sort the dataframe by test_r2 in ascending order
results_df = results_df.sort_values(by='test_r2')
results_df['importance_rank'] = range(1, len(selected_features)+1)

# Print the ranked feature list
print(results_df[['importance_rank', 'feature_name', 'test_r2']].head(10))

   importance_rank  feature_name   test_r2
0                1   OverallQual  0.692702
1                2     YearBuilt  0.726298
2                3    Fireplaces  0.764811
3                4     GrLivArea  0.953822
4                5   TotalBsmtSF  0.965374
5                6    GarageArea  0.980969
6                7      FullBath  0.982726
9                8      1stFlrSF  0.982771
8                9  YearRemodAdd  0.984739
7               10  BsmtFullBath  0.985606


In [5]:
# Save the results in the current working directory
results_df.to_csv('results_xgboost.csv', index=False)