In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from importance_feature_selector import ImportanceFeatureSelector

# https://erdogant.github.io/distfit/pages/html/Parametric.html

In [2]:

X, y = make_classification(n_samples=10000)

X = pd.DataFrame(X, columns=[f'f_{i}' for i in range(X.shape[1])])

fs = ImportanceFeatureSelector(
    estimator=RandomForestClassifier(random_state=123),
    norm=True,
    rand=True, 
    exp=True, 
    choice=True, 
    binom=True,
    cv=True
)

X_selected = fs.fit_transform(X, y)


print("\nAll Features")
print(X.columns.to_list())
estimator = RandomForestClassifier(random_state=1234)
cv = cross_validate(estimator,X, y, scoring=['accuracy','recall','precision'], cv=3)
for k,v in cv.items():
    print(k, np.mean(v))


print("\nReduced Features")
print(X_selected.columns.to_list())
estimator = RandomForestClassifier(random_state=1234)
cv = cross_validate(estimator,X_selected, y, scoring=['accuracy','recall','precision'], cv=3)
for k,v in cv.items():
    print(k, np.mean(v))


All Features
['f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8', 'f_9', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19']
fit_time 2.073451598485311
score_time 0.051587581634521484
test_accuracy 0.9249996385361392
test_recall 0.9297990341907609
test_precision 0.9209569241291509

Reduced Features
['f_1', 'f_5', 'f_6', 'f_13', 'f_14', 'f_15', 'f_19']
fit_time 1.1111986637115479
score_time 0.05002236366271973
test_accuracy 0.9250996185401382
test_recall 0.9225996337347177
test_precision 0.9272358315436975
