In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from matplotlib.gridspec import GridSpec
import lightgbm as lgb
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
imputed_data = pd.read_csv("imputed_data_handle_multicollinearity.csv")
imputed_data=imputed_data.drop('Unnamed: 0',axis=1)
imputed_data.head()

Unnamed: 0,GrLivArea,SalePrice,OverallQual,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,BsmtFinType2_Unknown,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y
0,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
1,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,0,1
2,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
3,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,0,0,0,0,0,1,1
4,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,1,0,0,0,0,0,1


In [24]:
# Separate the independent variables (features) from the dependent variable (target)
X = imputed_data.drop('SalePrice', axis=1)
y = imputed_data['SalePrice']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Set up KFold with shuffle=True
kf = KFold(n_splits=5, shuffle=True)

# Define base estimators to be used in the ensemble
base_estimators = [('rf', RandomForestRegressor()),
                   ('xgb', XGBRegressor()),
                   ('dt', DecisionTreeRegressor())]

# Define a final estimator to combine the predictions of the base estimators
final_estimator = StackingRegressor(estimators=[('ridge', RidgeCV()),
                                                 ('lasso', LassoCV())])

# Define a StackingRegressor object with the base and final estimator
stacked = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator)

# Initialize empty lists to store results
num_features_list = []
test_r2_list = []

# Loop through the number of features and fit the model
for i in tqdm(range(1,21), desc='Running loop'):
    # Calculate the feature importance scores for each feature in the model
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    
    # Rank the features based on their importance scores
    indices = importances.argsort()[::-1]
    
    # Select the top i features
    selected_features = X_train.columns[indices[:i]]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    
    # Define the estimators to be used in the ensemble
    base_estimators_selected = []
    for name, estimator in base_estimators:
        base_estimator = estimator.fit(X_train_selected, y_train)
        base_estimators_selected.append((name, base_estimator))
    
    # Define the StackingRegressor object with the base and final estimator
    stacked = StackingRegressor(estimators=base_estimators_selected, final_estimator=final_estimator)
    
    # Fit the StackingRegressor on the selected features and calculate R2 on the test set
    stacked.fit(X_train_selected, y_train)
    test_r2_selected = stacked.score(X_test_selected, y_test)
    
    # Append the results to the lists
    num_features_list.append(i)
    test_r2_list.append(test_r2_selected)


Running loop: 100%|████████████████████████████████████████████████████████████████████████| 20/20 [05:28<00:00, 16.42s/it]


In [27]:
# Create a dataframe to store the results
results_df = pd.DataFrame({'importance_rank': range(1, len(selected_features)+1),
                           'feature_name': selected_features,
                           'test_r2': test_r2_list})

# Sort the dataframe by test_r2 in ascending order
results_df = results_df.sort_values(by='test_r2')
results_df['importance_rank'] = range(1, len(selected_features)+1)

# Print the ranked feature list
print(results_df[['importance_rank', 'feature_name', 'test_r2']].head(10))


    importance_rank          feature_name   test_r2
0                 1           OverallQual  0.715905
1                 2             GrLivArea  0.936728
2                 3              1stFlrSF  0.962183
3                 4           TotalBsmtSF  0.972689
4                 5            BsmtFinSF1  0.973622
9                 6            Fireplaces  0.979011
7                 7          YearRemodAdd  0.980412
5                 8            GarageArea  0.980464
11                9             BsmtUnfSF  0.981726
18               10  Neighborhood_SawyerW  0.981989


In [28]:
# Save the results in the current working directory
results_df.to_csv('results_stacked_ensemble.csv', index=False)