## FILTER METHODS

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

### 1. Correlation Coefficient Technique

In [2]:
iris = sns.load_dataset('iris')

In [3]:
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
label_encoder = LabelEncoder()
iris['species'] = label_encoder.fit_transform(iris['species'])


In [6]:
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [7]:
correlation_matrix = iris.corr()

In [8]:
correlation_matrix

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
sepal_length,1.0,-0.11757,0.871754,0.817941,0.782561
sepal_width,-0.11757,1.0,-0.42844,-0.366126,-0.426658
petal_length,0.871754,-0.42844,1.0,0.962865,0.949035
petal_width,0.817941,-0.366126,0.962865,1.0,0.956547
species,0.782561,-0.426658,0.949035,0.956547,1.0


In [9]:
## Correlation with Target variable
correlation_with_target = correlation_matrix['species'].sort_values(ascending=False)

In [10]:
correlation_with_target

species         1.000000
petal_width     0.956547
petal_length    0.949035
sepal_length    0.782561
sepal_width    -0.426658
Name: species, dtype: float64

In [11]:
selected_features = correlation_with_target[abs(correlation_with_target) > 0.5].index

In [12]:
selected_features

Index(['species', 'petal_width', 'petal_length', 'sepal_length'], dtype='object')

In [13]:
new_df = iris[selected_features]

In [14]:
new_df

Unnamed: 0,species,petal_width,petal_length,sepal_length
0,0,0.2,1.4,5.1
1,0,0.2,1.4,4.9
2,0,0.2,1.3,4.7
3,0,0.2,1.5,4.6
4,0,0.2,1.4,5.0
...,...,...,...,...
145,2,2.3,5.2,6.7
146,2,1.9,5.0,6.3
147,2,2.0,5.2,6.5
148,2,2.3,5.4,6.2


### 2. Chi-Square Test Technique

In [15]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import KBinsDiscretizer

In [16]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [17]:
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [18]:
### Converting to Categorical as Ch-Square only works on Categorical features
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
df_discretized = pd.DataFrame(discretizer.fit_transform(df.iloc[:, :-1]), columns=df.columns[:-1])

In [19]:
df_discretized

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,5.0,0.0,5.0,3.0,5.0,7.0,7.0,7.0,6.0,6.0,...,6.0,1.0,6.0,4.0,6.0,6.0,5.0,9.0,5.0,4.0
1,6.0,2.0,6.0,5.0,2.0,1.0,2.0,3.0,3.0,1.0,...,6.0,3.0,5.0,4.0,3.0,1.0,1.0,6.0,2.0,2.0
2,6.0,3.0,5.0,4.0,5.0,4.0,4.0,6.0,5.0,2.0,...,5.0,3.0,5.0,3.0,4.0,3.0,3.0,8.0,4.0,2.0
3,2.0,3.0,2.0,1.0,8.0,8.0,5.0,5.0,7.0,9.0,...,2.0,3.0,2.0,0.0,9.0,8.0,5.0,8.0,9.0,7.0
4,6.0,1.0,6.0,4.0,4.0,3.0,4.0,5.0,3.0,1.0,...,5.0,1.0,5.0,3.0,4.0,1.0,3.0,5.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,6.0,4.0,6.0,5.0,5.0,2.0,5.0,6.0,3.0,1.0,...,6.0,3.0,5.0,4.0,4.0,1.0,3.0,7.0,0.0,1.0
565,6.0,6.0,6.0,4.0,4.0,2.0,3.0,4.0,3.0,1.0,...,5.0,6.0,5.0,3.0,3.0,1.0,2.0,5.0,1.0,0.0
566,4.0,6.0,4.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,...,3.0,5.0,3.0,2.0,2.0,2.0,2.0,4.0,1.0,1.0
567,6.0,6.0,6.0,4.0,5.0,7.0,8.0,7.0,6.0,4.0,...,6.0,7.0,6.0,4.0,6.0,8.0,7.0,9.0,4.0,4.0


In [20]:
df_discretized['target'] = df['target']

In [21]:
df_discretized

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,5.0,0.0,5.0,3.0,5.0,7.0,7.0,7.0,6.0,6.0,...,1.0,6.0,4.0,6.0,6.0,5.0,9.0,5.0,4.0,0
1,6.0,2.0,6.0,5.0,2.0,1.0,2.0,3.0,3.0,1.0,...,3.0,5.0,4.0,3.0,1.0,1.0,6.0,2.0,2.0,0
2,6.0,3.0,5.0,4.0,5.0,4.0,4.0,6.0,5.0,2.0,...,3.0,5.0,3.0,4.0,3.0,3.0,8.0,4.0,2.0,0
3,2.0,3.0,2.0,1.0,8.0,8.0,5.0,5.0,7.0,9.0,...,3.0,2.0,0.0,9.0,8.0,5.0,8.0,9.0,7.0,0
4,6.0,1.0,6.0,4.0,4.0,3.0,4.0,5.0,3.0,1.0,...,1.0,5.0,3.0,4.0,1.0,3.0,5.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,6.0,4.0,6.0,5.0,5.0,2.0,5.0,6.0,3.0,1.0,...,3.0,5.0,4.0,4.0,1.0,3.0,7.0,0.0,1.0,0
565,6.0,6.0,6.0,4.0,4.0,2.0,3.0,4.0,3.0,1.0,...,6.0,5.0,3.0,3.0,1.0,2.0,5.0,1.0,0.0,0
566,4.0,6.0,4.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,...,5.0,3.0,2.0,2.0,2.0,2.0,4.0,1.0,1.0,0
567,6.0,6.0,6.0,4.0,5.0,7.0,8.0,7.0,6.0,4.0,...,7.0,6.0,4.0,6.0,8.0,7.0,9.0,4.0,4.0,0


In [22]:
df_discretized["target"].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [23]:
X = df_discretized.drop('target', axis=1)
y = df_discretized['target']

In [24]:
# Select the top 10 features using Chi-Square test
chi2_selector = SelectKBest(chi2, k=10)
X_kbest = chi2_selector.fit_transform(X, y)

In [25]:
selected_features = X.columns[chi2_selector.get_support()]

In [26]:
selected_features

Index(['mean perimeter', 'mean area', 'mean concavity', 'mean concave points',
       'radius error', 'worst radius', 'worst perimeter', 'worst area',
       'worst concavity', 'worst concave points'],
      dtype='object')

In [27]:
new_df = df[selected_features]
new_df["target"] = df["target"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["target"] = df["target"]


In [28]:
new_df

Unnamed: 0,mean perimeter,mean area,mean concavity,mean concave points,radius error,worst radius,worst perimeter,worst area,worst concavity,worst concave points,target
0,122.80,1001.0,0.30010,0.14710,1.0950,25.380,184.60,2019.0,0.7119,0.2654,0
1,132.90,1326.0,0.08690,0.07017,0.5435,24.990,158.80,1956.0,0.2416,0.1860,0
2,130.00,1203.0,0.19740,0.12790,0.7456,23.570,152.50,1709.0,0.4504,0.2430,0
3,77.58,386.1,0.24140,0.10520,0.4956,14.910,98.87,567.7,0.6869,0.2575,0
4,135.10,1297.0,0.19800,0.10430,0.7572,22.540,152.20,1575.0,0.4000,0.1625,0
...,...,...,...,...,...,...,...,...,...,...,...
564,142.00,1479.0,0.24390,0.13890,1.1760,25.450,166.10,2027.0,0.4107,0.2216,0
565,131.20,1261.0,0.14400,0.09791,0.7655,23.690,155.00,1731.0,0.3215,0.1628,0
566,108.30,858.1,0.09251,0.05302,0.4564,18.980,126.70,1124.0,0.3403,0.1418,0
567,140.10,1265.0,0.35140,0.15200,0.7260,25.740,184.60,1821.0,0.9387,0.2650,0


In [29]:
new_df["target"].value_counts()

target
1    357
0    212
Name: count, dtype: int64

### 3. ANOVA TECHNIQUE

In [30]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

In [31]:
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [32]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [33]:
X = df.drop('target', axis=1)
y = df['target']

In [34]:
anova_selector = SelectKBest(f_classif, k=2)  # Select top 2 features
X_kbest = anova_selector.fit_transform(X, y)

In [35]:
selected_features = X.columns[anova_selector.get_support()]

In [36]:
selected_features

Index(['petal length (cm)', 'petal width (cm)'], dtype='object')

In [37]:
new_df = df[selected_features]
new_df["target"] = df["target"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["target"] = df["target"]


In [38]:
new_df

Unnamed: 0,petal length (cm),petal width (cm),target
0,1.4,0.2,0
1,1.4,0.2,0
2,1.3,0.2,0
3,1.5,0.2,0
4,1.4,0.2,0
...,...,...,...
145,5.2,2.3,2
146,5.0,1.9,2
147,5.2,2.0,2
148,5.4,2.3,2


### 4. Mutual Information Technique

In [39]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import mutual_info_classif, SelectKBest

In [40]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [41]:
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [43]:
X = df.drop('target', axis=1)
y = df['target']

In [45]:
X.shape

(569, 30)

In [46]:
mi_selector = SelectKBest(mutual_info_classif, k=10)  # Select top 10 features
X_kbest = mi_selector.fit_transform(X, y)

In [47]:
selected_features = X.columns[mi_selector.get_support()]

In [48]:
selected_features

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [49]:
new_df = df[selected_features]
new_df["target"] = df["target"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["target"] = df["target"]


In [50]:
new_df

Unnamed: 0,mean radius,mean perimeter,mean area,mean concavity,mean concave points,area error,worst radius,worst perimeter,worst area,worst concave points,target
0,17.99,122.80,1001.0,0.30010,0.14710,153.40,25.380,184.60,2019.0,0.2654,0
1,20.57,132.90,1326.0,0.08690,0.07017,74.08,24.990,158.80,1956.0,0.1860,0
2,19.69,130.00,1203.0,0.19740,0.12790,94.03,23.570,152.50,1709.0,0.2430,0
3,11.42,77.58,386.1,0.24140,0.10520,27.23,14.910,98.87,567.7,0.2575,0
4,20.29,135.10,1297.0,0.19800,0.10430,94.44,22.540,152.20,1575.0,0.1625,0
...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,142.00,1479.0,0.24390,0.13890,158.70,25.450,166.10,2027.0,0.2216,0
565,20.13,131.20,1261.0,0.14400,0.09791,99.04,23.690,155.00,1731.0,0.1628,0
566,16.60,108.30,858.1,0.09251,0.05302,48.55,18.980,126.70,1124.0,0.1418,0
567,20.60,140.10,1265.0,0.35140,0.15200,86.22,25.740,184.60,1821.0,0.2650,0


### 5. Variance Threshold Technique

In [3]:
from sklearn.datasets import load_iris

In [4]:
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

In [5]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [6]:
variances = df.var()

In [7]:
variances

sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
target               0.671141
dtype: float64

In [8]:
from sklearn.feature_selection import VarianceThreshold

In [9]:
X = df.drop('target', axis=1)
y = df['target']

In [10]:
selector = VarianceThreshold(threshold=0.5)
X_transformed = selector.fit_transform(X)

In [11]:
X_transformed

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2],
       [5.4, 1.7, 0.4],
       [4.6, 1.4, 0.3],
       [5. , 1.5, 0.2],
       [4.4, 1.4, 0.2],
       [4.9, 1.5, 0.1],
       [5.4, 1.5, 0.2],
       [4.8, 1.6, 0.2],
       [4.8, 1.4, 0.1],
       [4.3, 1.1, 0.1],
       [5.8, 1.2, 0.2],
       [5.7, 1.5, 0.4],
       [5.4, 1.3, 0.4],
       [5.1, 1.4, 0.3],
       [5.7, 1.7, 0.3],
       [5.1, 1.5, 0.3],
       [5.4, 1.7, 0.2],
       [5.1, 1.5, 0.4],
       [4.6, 1. , 0.2],
       [5.1, 1.7, 0.5],
       [4.8, 1.9, 0.2],
       [5. , 1.6, 0.2],
       [5. , 1.6, 0.4],
       [5.2, 1.5, 0.2],
       [5.2, 1.4, 0.2],
       [4.7, 1.6, 0.2],
       [4.8, 1.6, 0.2],
       [5.4, 1.5, 0.4],
       [5.2, 1.5, 0.1],
       [5.5, 1.4, 0.2],
       [4.9, 1.5, 0.2],
       [5. , 1.2, 0.2],
       [5.5, 1.3, 0.2],
       [4.9, 1.4, 0.1],
       [4.4, 1.3, 0.2],
       [5.1, 1.5, 0.2],
       [5. , 1.3, 0.3],
       [4.5, 1.3

In [12]:
selected_features = X.columns[selector.get_support()]

In [13]:
selected_features

Index(['sepal length (cm)', 'petal length (cm)', 'petal width (cm)'], dtype='object')

In [14]:
new_df = df[selected_features]
new_df["target"] = df["target"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["target"] = df["target"]


In [15]:
new_df

Unnamed: 0,sepal length (cm),petal length (cm),petal width (cm),target
0,5.1,1.4,0.2,0
1,4.9,1.4,0.2,0
2,4.7,1.3,0.2,0
3,4.6,1.5,0.2,0
4,5.0,1.4,0.2,0
...,...,...,...,...
145,6.7,5.2,2.3,2
146,6.3,5.0,1.9,2
147,6.5,5.2,2.0,2
148,6.2,5.4,2.3,2


## WRAPPER METHODS

### 1.Forward Selection

In [16]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [17]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [19]:
X = df.drop('target', axis=1)
y = df['target']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
selected_features = []
best_accuracy = 0.0

In [22]:
len(selected_features)

0

In [23]:
X.shape[1]

30

In [24]:
X.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [25]:
while len(selected_features) < X.shape[1]:  # Continue until all features are selected
    best_feature = None
    feature_accuracy = 0.0
    
    # Iterate through each feature
    for feature in X.columns:
        if feature not in selected_features:
            # Add the feature to the selected features
            features_to_use = selected_features + [feature]
            
            # Train a classifier (Logistic Regression in this case)
            clf = LogisticRegression(max_iter=10000, random_state=42)
            clf.fit(X_train[features_to_use], y_train)
            
            # Predict on the test set
            y_pred = clf.predict(X_test[features_to_use])
            
            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            
            # Check if this feature improves accuracy
            if accuracy > feature_accuracy:
                feature_accuracy = accuracy
                best_feature = feature
    
    # Add the best feature to the selected features
    selected_features.append(best_feature)
    
    # Update the best accuracy achieved so far
    if feature_accuracy > best_accuracy:
        best_accuracy = feature_accuracy
    
    # Print progress
    print(f"Selected features: {selected_features}")
    print(f"Best accuracy: {best_accuracy:.4f}\n")

Selected features: ['worst perimeter']
Best accuracy: 0.9649

Selected features: ['worst perimeter', 'mean radius']
Best accuracy: 0.9737

Selected features: ['worst perimeter', 'mean radius', 'area error']
Best accuracy: 0.9912

Selected features: ['worst perimeter', 'mean radius', 'area error', 'worst concavity']
Best accuracy: 1.0000

Selected features: ['worst perimeter', 'mean radius', 'area error', 'worst concavity', 'mean perimeter']
Best accuracy: 1.0000

Selected features: ['worst perimeter', 'mean radius', 'area error', 'worst concavity', 'mean perimeter', 'mean area']
Best accuracy: 1.0000

Selected features: ['worst perimeter', 'mean radius', 'area error', 'worst concavity', 'mean perimeter', 'mean area', 'mean smoothness']
Best accuracy: 1.0000

Selected features: ['worst perimeter', 'mean radius', 'area error', 'worst concavity', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness']
Best accuracy: 1.0000

Selected features: ['worst perimeter', 'mean radius

In [26]:
new_df = df[['worst perimeter', 'mean radius', 'area error', 'worst concavity']]
new_df["target"] = df["target"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["target"] = df["target"]


In [27]:
new_df

Unnamed: 0,worst perimeter,mean radius,area error,worst concavity,target
0,184.60,17.99,153.40,0.7119,0
1,158.80,20.57,74.08,0.2416,0
2,152.50,19.69,94.03,0.4504,0
3,98.87,11.42,27.23,0.6869,0
4,152.20,20.29,94.44,0.4000,0
...,...,...,...,...,...
564,166.10,21.56,158.70,0.4107,0
565,155.00,20.13,99.04,0.3215,0
566,126.70,16.60,48.55,0.3403,0
567,184.60,20.60,86.22,0.9387,0


### 2. Recursive Feature Elimination (RFE)

In [28]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [29]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [30]:
X = df.drop('target', axis=1)
y = df['target']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
clf = LogisticRegression(max_iter=10000, random_state=42)

In [33]:
### Select top 10 features
rfe = RFE(clf, n_features_to_select=10)

In [34]:
rfe.fit(X_train, y_train)

In [50]:
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [51]:
feature_names = np.array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'])

In [52]:
feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [53]:
selected_features = feature_names[rfe.support_]

In [54]:
selected_features

array(['mean radius', 'mean compactness', 'mean concavity',
       'texture error', 'worst radius', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry'], dtype='<U23')

In [55]:
y_pred = rfe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [56]:
accuracy

0.9736842105263158

In [57]:
new_df = df[selected_features]
new_df["target"] = df["target"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["target"] = df["target"]


In [58]:
new_df

Unnamed: 0,mean radius,mean compactness,mean concavity,texture error,worst radius,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,target
0,17.99,0.27760,0.30010,0.9053,25.380,0.16220,0.66560,0.7119,0.2654,0.4601,0
1,20.57,0.07864,0.08690,0.7339,24.990,0.12380,0.18660,0.2416,0.1860,0.2750,0
2,19.69,0.15990,0.19740,0.7869,23.570,0.14440,0.42450,0.4504,0.2430,0.3613,0
3,11.42,0.28390,0.24140,1.1560,14.910,0.20980,0.86630,0.6869,0.2575,0.6638,0
4,20.29,0.13280,0.19800,0.7813,22.540,0.13740,0.20500,0.4000,0.1625,0.2364,0
...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,0.11590,0.24390,1.2560,25.450,0.14100,0.21130,0.4107,0.2216,0.2060,0
565,20.13,0.10340,0.14400,2.4630,23.690,0.11660,0.19220,0.3215,0.1628,0.2572,0
566,16.60,0.10230,0.09251,1.0750,18.980,0.11390,0.30940,0.3403,0.1418,0.2218,0
567,20.60,0.27700,0.35140,1.5950,25.740,0.16500,0.86810,0.9387,0.2650,0.4087,0


## HYBRID METHOD

### BORUTA

In [None]:
### Miron and Witold

In [None]:
!pip install boruta

In [59]:
from sklearn.datasets import load_breast_cancer

In [60]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [61]:
X = df.drop('target', axis=1).values
Y = df['target'].values

feature_names = df.drop('target', axis=1).columns

In [63]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

In [64]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

boruta_selector = BorutaPy(rf_classifier, n_estimators='auto', verbose=2, random_state=42)

In [65]:
try:
    boruta_selector.fit(X, Y)
except Exception as e:
    print(f"Error occurred during Boruta fitting: {e}")

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	30
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	13 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	14 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	15 / 100
Confirmed: 	20
Tentative: 	10
Rejected: 	0
Iteration: 	16 / 100
Confirmed: 	21
Tentative: 	9
Reject

In [66]:
selected_features = np.array(feature_names)[boruta_selector.support_]

In [67]:
selected_features

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'radius error', 'perimeter error',
       'area error', 'concavity error', 'concave points error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry',
       'worst fractal dimension'], dtype=object)

In [68]:
len(selected_features)

23

In [69]:
new_df = df[selected_features]
new_df["target"] = df["target"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["target"] = df["target"]


In [70]:
new_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,radius error,perimeter error,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,1.0950,8.589,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.5435,3.398,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.7456,4.585,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.4956,3.445,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.7572,5.438,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,1.1760,7.673,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.7655,5.203,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.4564,3.425,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.7260,5.772,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0
