In [3]:
import warnings

import numpy as np
import pandas as pd
from boruta import BorutaPy
from scipy.stats import pearsonr, spearmanr, kendalltau
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.utils.multiclass import type_of_target
from xgboost import XGBRegressor


In [4]:
def _rank_features(feature_names, importances, n_features):
    """Return up to ``n_features`` names ordered by descending importance."""
    scores = pd.Series(np.asarray(importances, dtype=float), index=list(feature_names))
    # A stable sort keeps tied features in their original column order.
    scores = scores.sort_values(ascending=False, kind='stable')
    return scores.index[:n_features].tolist()


def _standardize(X):
    """Centre every column and scale it to unit variance.

    Constant columns have zero spread; dividing them by 1 instead leaves them
    at 0 rather than producing NaN.
    """
    spread = X.std(ddof=0).replace(0, 1)
    return (X - X.mean()) / spread


def _correlation_table(X, y, features):
    """Build a Pearson/Spearman/Kendall table of ``features`` against ``y``."""
    measures = {'Pearson': pearsonr, 'Spearman': spearmanr, 'Kendall': kendalltau}
    table = {'Feature': features}
    for label, measure in measures.items():
        # Each scipy function returns (statistic, p-value); keep the statistic.
        table[label] = [measure(X[feature], y)[0] for feature in features]
    return pd.DataFrame(table)


def feature_selection_and_correlation(data, target_col, n_features=5, random_state=1):
    """Select features with several methods and correlate them with the target.

    Every column of ``data`` except ``target_col`` is treated as a candidate
    feature. Five selectors are run: Boruta, XGBoost, Random Forest,
    L1-penalised Logistic Regression and Elastic Net.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target_col : str
        Name of the target column in ``data``.
    n_features : int, default 5
        Maximum number of features kept per method.
    random_state : int or None, default 1
        Seed passed to the stochastic estimators (Boruta, both Random
        Forests, XGBoost, Logistic Regression) so repeated runs agree.

    Returns
    -------
    dict
        ``'selected_features'`` maps a method name to its list of feature
        names; ``'correlations'`` maps the same method names to a DataFrame
        with columns ``Feature``, ``Pearson``, ``Spearman`` and ``Kendall``.
        The ``'logistic_regression'`` entry is absent when the target is not
        binary or multiclass.

    Raises
    ------
    ValueError
        If ``n_features`` is negative.
    """
    if n_features < 0:
        # A negative slice bound would silently drop features from the tail.
        raise ValueError(f"n_features must be non-negative, got {n_features}")

    # Data preparation
    X = data.drop(columns=[target_col])
    y = data[target_col]
    # Coefficient magnitudes are only comparable across features that share a
    # scale, so the linear models below are fitted on standardised inputs.
    X_scaled = _standardize(X)

    results = {}

    # Boruta: confirmed features are kept in their original column order.
    boruta_forest = RandomForestRegressor(n_jobs=-1, max_depth=5, random_state=random_state)
    boruta = BorutaPy(boruta_forest, n_estimators='auto', verbose=0, random_state=random_state)
    boruta.fit(X.values, y.values)
    results['boruta'] = X.columns[boruta.support_].tolist()[:n_features]

    # XGBoost
    xgb = XGBRegressor(n_estimators=100, random_state=random_state)
    xgb.fit(X, y)
    results['xgboost'] = _rank_features(X.columns, xgb.feature_importances_, n_features)

    # Random Forest
    forest = RandomForestRegressor(n_estimators=100, random_state=random_state)
    forest.fit(X, y)
    results['random_forest'] = _rank_features(X.columns, forest.feature_importances_, n_features)

    # Logistic Regression is a classifier: fitting it on a continuous target
    # raises, so it only runs when the target holds discrete class labels.
    target_kind = type_of_target(y)
    if target_kind in ('binary', 'multiclass'):
        lr = LogisticRegression(penalty='l1', solver='liblinear', random_state=random_state)
        lr.fit(X_scaled, y)
        # coef_ has one row per class (one row for binary targets); averaging
        # the absolute values uses every class instead of only the first.
        lr_importance = np.abs(lr.coef_).mean(axis=0)
        results['logistic_regression'] = _rank_features(X.columns, lr_importance, n_features)
    else:
        warnings.warn(
            f"Skipping logistic regression: target '{target_col}' is of type "
            f"'{target_kind}', not a discrete class label.",
            stacklevel=2,
        )

    # Elastic Net
    en = ElasticNet(alpha=1, l1_ratio=0.5)
    en.fit(X_scaled, y)
    results['elastic_net'] = _rank_features(X.columns, np.abs(en.coef_), n_features)

    # Correlations are computed on the original (unscaled) feature values.
    correlations = {
        method: _correlation_table(X, y, features)
        for method, features in results.items()
    }

    return {'selected_features': results, 'correlations': correlations}

In [None]:
# Run every selector against the 'target' column, then print the selected
# features followed by the per-method correlation tables.
# NOTE(review): `data` is not defined in the visible cells — confirm it is
# loaded by an earlier cell so the notebook survives Restart & Run All.
result = feature_selection_and_correlation(data, target_col='target')
print(result['selected_features'], result['correlations'], sep='\n')