In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from matplotlib.gridspec import GridSpec
import lightgbm as lgb
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

In [2]:
imputed_data = pd.read_csv("imputed_data_handle_multicollinearity.csv")
imputed_data=imputed_data.drop('Unnamed: 0',axis=1)
imputed_data.head()

Unnamed: 0,GrLivArea,SalePrice,OverallQual,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,BsmtFinType2_Unknown,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y
0,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
1,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,0,1
2,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
3,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,0,0,0,0,0,1,1
4,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,1,0,0,0,0,0,1


In [3]:
# Separate the independent variables (features) from the dependent variable (target)
X = imputed_data.drop('SalePrice', axis=1)
y = imputed_data['SalePrice']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Set up KFold with shuffle=True
kf = KFold(n_splits=5, shuffle=True)

# Define the parameter distribution for RandomizedSearchCV
param_dist = {'n_estimators': [100,200,300,400,500],
              'max_depth': [3,5,7,10],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [2,5,10]}

# Instantiate the RandomForestRegressor
model = RandomForestRegressor()

# Instantiate the RandomizedSearchCV object
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=25, cv=kf, n_jobs=-1, error_score='raise')

try:
    # Fit the RandomizedSearchCV object to the training data
    random_search.fit(X_train, y_train)

except Exception as e:
    # Handle any errors that arise during fitting
    print("Error occurred during fitting:", e)
    
# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Train the RandomForestRegressor with the best hyperparameters
best_model = random_search.best_estimator_

# Evaluate the performance of the best model using cross-validation
train_r2_scores = cross_val_score(best_model, X_train, y_train, cv=kf)
train_r2 = np.mean(train_r2_scores)

# Evaluate the performance of the best model on the test set
test_r2 = best_model.score(X_test, y_test)

# Calculate the feature importance scores for each feature in the model
importances = best_model.feature_importances_

# Rank the features based on their importance scores
indices = importances.argsort()[::-1]

# Create empty lists to store results
num_features_list = []
test_r2_list = []

# Loop through the number of features and fit the model
for num_features in tqdm(range(1, 11)):
    selected_features = X_train.columns[indices[:num_features]]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    model_selected = RandomForestRegressor(**random_search.best_params_)
    model_selected.fit(X_train_selected, y_train)
    test_r2_selected = model_selected.score(X_test_selected, y_test)
    num_features_list.append(num_features)
    test_r2_list.append(test_r2_selected)

Best Hyperparameters: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 10}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.66s/it]


n_estimators: The number of trees in the random forest. In this case, the random forest consists of 500 decision trees.

max_depth: The maximum depth of each decision tree in the random forest. A larger value can lead to a more complex and overfitted model, while a smaller value can result in underfitting. Here, the maximum depth is set to 10, which is relatively shallow.

min_samples_split: The minimum number of samples required to split an internal node. This can help prevent overfitting by avoiding splits that result in very few samples in a node. In this case, a minimum of 10 samples is required to split a node.

min_samples_leaf: The minimum number of samples required to be at a leaf node. This can also help prevent overfitting by ensuring that each leaf has a minimum number of samples. In this case, each leaf node must have at least 2 samples.

In [4]:
# Create a dataframe to store the results
results_df = pd.DataFrame({'importance_rank': range(1, len(selected_features)+1),
                           'feature_name': selected_features,
                           'test_r2': test_r2_list})

# Sort the dataframe by test_r2 in ascending order
results_df = results_df.sort_values(by='test_r2')
results_df['importance_rank'] = range(1, len(selected_features)+1)

# Print the ranked feature list
print(results_df[['importance_rank', 'feature_name', 'test_r2']].head(10))

   importance_rank  feature_name   test_r2
0                1   OverallQual  0.664705
1                2      1stFlrSF  0.826147
2                3     GrLivArea  0.891976
3                4   TotalBsmtSF  0.901805
4                5    BsmtFinSF1  0.916577
5                6    GarageArea  0.924519
6                7     YearBuilt  0.934139
7                8    Fireplaces  0.937971
8                9  YearRemodAdd  0.940865
9               10     BsmtUnfSF  0.943039


In [5]:
# Save the results in the current working directory
results_df.to_csv('results_rf.csv', index=False)