In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn; later cells read
# its .data, .feature_names and .target attributes.
data = load_breast_cancer()

In [3]:
# Assemble features and label in a single frame: one column per feature name,
# with the class label appended as 'target'. Preview the first rows.
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(target=data.target)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [4]:
# (rows, columns) of df; the column count includes the added 'target' column.
df.shape

(569, 31)

In [5]:
# Split the frame into the label vector (cast to float) and the feature matrix
# (every column except 'target').
y = df['target'].astype(float)
X = df.drop(columns='target')

In [6]:
# Show the column labels of df (the feature names followed by 'target').
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [7]:
# Baseline classifier. random_state is fixed because a decision tree permutes
# the features at every split, so an unseeded tree can score differently from
# run to run even on identical folds.
model = DecisionTreeClassifier(random_state=2023)

In [8]:
# Baseline on all features: stratified 5-fold CV repeated 5 times, scored by
# recall. Prints the mean and standard deviation, then the raw per-fold scores.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=2023)
scores = cross_val_score(estimator=model, X=X, y=y, scoring="recall", cv=cv, n_jobs=-1)
print(f"Mean recall: {scores.mean()} {scores.std()}")
print(scores)

Mean recall: 0.937793427230047 0.030014684604785298
[0.95774648 0.87323944 0.97222222 0.94444444 0.92957746 0.91549296
 0.88732394 0.94444444 0.94444444 0.94366197 0.95774648 0.91549296
 0.95833333 0.94444444 0.91549296 0.95774648 0.94366197 0.97222222
 0.88888889 0.95774648 0.90140845 0.98591549 0.90277778 0.94444444
 0.98591549]


In [9]:
# Number of CV scores, followed by the best and the worst recall among them.
len(scores), max(scores), min(scores)

(25, 0.9859154929577465, 0.8732394366197183)

## Version with subset of features

In [10]:
from itertools import product
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Number of candidate feature columns, plus the running best (subset, score)
# that the subset search updates. The last line displays the column count.
n_cols = len(X.columns)
best_subset = None
best_score = 0.0
n_cols

30

In [None]:
# Exhaustive feature-subset search: score every non-empty combination of
# columns with a decision tree and keep the subset with the best mean recall.
# NOTE: this enumerates 2**n_cols - 1 subsets (about 1.07e9 for n_cols = 30),
# so a complete run is not practical at this width.

# Loop-invariant setup, built once. The fixed random_state values give every
# subset the same folds and a deterministic tree, so scores are comparable;
# cross_val_score clones the estimator for each fit.
model = DecisionTreeClassifier(random_state=2023)
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=2023)

for subset in product([True, False], repeat=n_cols):
    # Positions of the columns switched on in this mask.
    ix = [i for i, keep in enumerate(subset) if keep]

    # The all-False mask selects no columns; there is nothing to fit.
    if not ix:
        continue

    X_new = X.iloc[:, ix]

    # Evaluate the selected columns only (X_new, not the full X); otherwise
    # every subset would receive the same full-feature evaluation.
    scores = cross_val_score(model, X_new, y, scoring="recall", cv=cv, n_jobs=-1)

    result = mean(scores)
    print(f"Subset: {ix} Result: {result}")

    # `>=` lets a later subset replace an earlier one when scores tie.
    if best_score is None or result >= best_score:
        best_subset, best_score = ix, result

print('Achou!')
print(f"Best Subset: {best_subset} Best Score: {best_score}")

In [12]:
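# Illustration only (kept commented out): list(product([True, False], repeat = n_cols))
# The output below has 3 flags per mask (8 masks), i.e. it corresponds to repeat = 3.
# With n_cols = 30 the list would hold 2**30 tuples, so do not materialize it.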

[(True, True, True),
 (True, True, False),
 (True, False, True),
 (True, False, False),
 (False, True, True),
 (False, True, False),
 (False, False, True),
 (False, False, False)]