In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from matplotlib.gridspec import GridSpec
import lightgbm as lgb
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

In [2]:
imputed_data = pd.read_csv("imputed_data_handle_multicollinearity.csv")
imputed_data=imputed_data.drop('Unnamed: 0',axis=1)
imputed_data.head()

Unnamed: 0,GrLivArea,SalePrice,OverallQual,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,BsmtFinType2_Unknown,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y
0,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
1,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,0,1
2,1049.0,139500.0,5.0,1984.0,1984.0,552.0,393.0,104.0,1049.0,1049.0,...,0,0,0,1,0,0,0,0,1,1
3,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,0,0,0,0,0,1,1
4,1001.0,124900.0,5.0,1930.0,2007.0,737.0,0.0,100.0,837.0,1001.0,...,0,0,0,1,0,0,0,0,0,1


In [3]:
# Separate the independent variables (features) from the dependent variable (target)
X = imputed_data.drop('SalePrice', axis=1)
y = imputed_data['SalePrice']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Set up KFold with shuffle=True
kf = KFold(n_splits=5, shuffle=True)

# Define the parameter grid for GridSearchCV
param_grid = {'max_depth': [5, 10, 15, 20, None],
              'min_samples_split': [2, 4, 6, 8, 10],
              'min_samples_leaf': [1, 2, 4, 8, 16],
              'max_features': [None, 'sqrt', 'log2']}

# Instantiate the decision tree regressor
model = DecisionTreeRegressor()

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=kf)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Train the decision tree regressor with the best hyperparameters
best_model = grid_search.best_estimator_

# Evaluate the performance of the best model using cross-validation
train_r2_scores = cross_val_score(best_model, X_train, y_train, cv=kf)
train_r2 = np.mean(train_r2_scores)

# Evaluate the performance of the best model on the test set
test_r2 = best_model.score(X_test, y_test)

# Calculate the feature importance scores for each feature in the model
importances = best_model.feature_importances_

# Rank the features based on their importance scores
indices = importances.argsort()[::-1]

# Create empty lists to store results
num_features_list = []
test_r2_list = []

# Loop through the number of features and fit the model
for num_features in range(1, len(X_train.columns)+1):
    selected_features = X_train.columns[indices[:num_features]]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    model_selected = DecisionTreeRegressor(**grid_search.best_params_)
    model_selected.fit(X_train_selected, y_train)
    test_r2_selected = model_selected.score(X_test_selected, y_test)
    num_features_list.append(num_features)
    test_r2_list.append(test_r2_selected)


Best Hyperparameters: {'max_depth': 15, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4}


max_depth: maximum depth of the decision tree. If None, then nodes are expanded until all the leaves contain less than min_samples_split samples, or until all the leaves are pure.

min_samples_split: the minimum number of samples required to split an internal node. If a split would result in a leaf node with fewer than min_samples_split samples, the split is not performed.
    
min_samples_leaf: the minimum number of samples required to be at a leaf node. A split point that results in a leaf node with fewer than min_samples_leaf samples is ignored.
    
max_features: the maximum number of features to consider when looking for the best split. If None, then all features are considered.

In [4]:
# Create a dataframe to store the results
results_df = pd.DataFrame({'importance_rank': range(1, len(selected_features)+1),
                           'feature_name': selected_features,
                           'test_r2': test_r2_list})

# Sort the dataframe by test_r2 in ascending order
results_df = results_df.sort_values(by='test_r2')
results_df['importance_rank'] = range(1, len(selected_features)+1)

# Print the ranked feature list
print(results_df[['importance_rank', 'feature_name', 'test_r2']].head(10))


     importance_rank         feature_name   test_r2
0                  1          OverallQual  0.730131
1                  2             1stFlrSF  0.906234
57                 3          MSZoning_RH  0.945448
44                 4  Exterior2nd_Wd Shng  0.948806
3                  5          TotalBsmtSF  0.949929
90                 6          BsmtQual_Po  0.950112
112                7  Exterior1st_CemntBd  0.950127
2                  8            GrLivArea  0.950340
74                 9         LotShape_IR3  0.950435
32                10     BsmtFinType1_Rec  0.950620


In [5]:
# Save the results in the current working directory
results_df.to_csv('results_dt.csv', index=False)