In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from matplotlib.gridspec import GridSpec
import lightgbm as lgb
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
import warnings
from sklearn.exceptions import ConvergenceWarning
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
imputed_data = pd.read_csv("imputed_data_handle_multicollinearity.csv")
imputed_data=imputed_data.drop('Unnamed: 0',axis=1)
imputed_data.head()

Unnamed: 0,GrLivArea,SalePrice,OverallQual,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,BsmtFinType2_Unknown,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y
0,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
1,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,0,1
2,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
3,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,0,0,0,0,0,1,1
4,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,1,0,0,0,0,0,1


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint



# Separate the independent variables (features) from the dependent variable (target)
X = imputed_data.drop('SalePrice', axis=1)
y = imputed_data['SalePrice']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Instantiate the LightGBM regressor
model = LGBMRegressor()

# Define the parameter distribution for RandomizedSearchCV
param_dist = {'max_depth': sp_randint(3, 8),
              'num_leaves': sp_randint(10, 31),
              'learning_rate': [0.01, 0.05, 0.1],
              'n_estimators': sp_randint(100, 301),
              'reg_alpha': [0.1, 0.5],
              'reg_lambda': [0.1, 0.5],
              'min_child_samples': sp_randint(5, 16)}

# Set up KFold with shuffle=True
kf = KFold(n_splits=5, shuffle=True)

# Instantiate the RandomizedSearchCV object
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=kf, n_jobs=-1)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Train the LightGBM regressor with the best hyperparameters
best_model = random_search.best_estimator_

# Evaluate the performance of the best model using cross-validation
train_r2_scores = cross_val_score(best_model, X_train, y_train, cv=kf)
train_r2 = np.mean(train_r2_scores)

# Evaluate the performance of the best model on the test set
test_r2 = best_model.score(X_test, y_test)

# Calculate the feature importance scores for each feature in the model
importances = best_model.feature_importances_

# Rank the features based on their importance scores
indices = importances.argsort()[::-1]

# Create empty lists to store results
num_features_list = []
test_r2_list = []

# Loop through the number of features and fit the model
for num_features in tqdm(range(1, 11)):
    selected_features = X_train.columns[indices[:num_features]]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    model_selected = LGBMRegressor(**random_search.best_params_)
    model_selected.fit(X_train_selected, y_train)
    test_r2_selected = model_selected.score(X_test_selected, y_test)
    num_features_list.append(num_features)
    test_r2_list.append(test_r2_selected)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 6, 'min_child_samples': 8, 'n_estimators': 271, 'num_leaves': 30, 'reg_alpha': 0.1, 'reg_lambda': 0.5}


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.16it/s]


n_estimators: The number of estimators is often the most critical hyperparameter in gradient boosting algorithms. Increasing the number of estimators generally improves the model's performance, but there is a tradeoff between performance and training time.

max_depth: The maximum depth of the decision tree controls the complexity of the model. A higher value may lead to overfitting, while a lower value may result in underfitting. It's generally a good practice to set this hyperparameter based on the size of the dataset and the complexity of the problem.

num_leaves: The maximum number of leaves in a decision tree is another important hyperparameter that affects the model's complexity. A higher value can improve the model's performance, but it can also lead to overfitting.

learning_rate: The learning rate controls the step size in gradient boosting algorithms. A lower learning rate requires more iterations to converge, but it can help the model generalize better to unseen data.

min_child_samples: The minimum number of samples required in each leaf node of the decision tree can prevent overfitting by forcing the model to have a minimum number of samples in each leaf.

reg_lambda and reg_alpha: The L1 and L2 regularization terms can help prevent overfitting by penalizing large weights in the model.

In [7]:
# Create a dataframe to store the results
results_df = pd.DataFrame({'importance_rank': range(1, len(selected_features)+1),
                           'feature_name': selected_features,
                           'test_r2': test_r2_list})

# Sort the dataframe by test_r2 in ascending order
results_df = results_df.sort_values(by='test_r2')
results_df['importance_rank'] = range(1, len(selected_features)+1)

# Print the ranked feature list
print(results_df[['importance_rank', 'feature_name', 'test_r2']].head(10))


   importance_rank  feature_name   test_r2
0                1    GarageArea  0.545126
1                2     GrLivArea  0.878894
2                3    BsmtFinSF1  0.932085
4                4   TotalBsmtSF  0.949142
3                5     BsmtUnfSF  0.952895
5                6     YearBuilt  0.972983
7                7  YearRemodAdd  0.974005
8                8   GarageYrBlt  0.975100
9                9      2ndFlrSF  0.975288
6               10      1stFlrSF  0.975858


In [8]:
# Save the results in the current working directory
results_df.to_csv('results_lgbm.csv', index=False)