<a href="https://colab.research.google.com/github/wolfzxcv/ml-examples/blob/master/SBS_feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score

In [18]:
# Load scikit-learn's bundled breast cancer dataset and unpack the pieces used below:
# the feature matrix, the class labels, and the column names.
dataset = load_breast_cancer()
X, y = dataset.data, dataset.target
feature_names = dataset.feature_names

In [19]:
# Optional cell: writes the dataset to a CSV file so it can be inspected by hand.
# Nothing else in the notebook depends on it, so it can be skipped or deleted.
import pandas as pd

# One column per feature, with the target appended as the last column
df = pd.DataFrame(data=X, columns=feature_names).assign(target=y)

# Write the table without the row index (comma is the default separator)
df.to_csv('breast_cancer.csv', index=False)

In [20]:
# Hold out 20% of the samples for testing; the split is stratified on y and seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

# Estimator handed to the feature selector and reused for both fits below
clf = SVC()

# Sequential feature selection, run in backward mode.
# - direction: 'forward' (the default) adds features, 'backward' removes them.
# - n_features_to_select: a specific number (e.g. 4) or 'auto'.
# fit() returns the selector itself, so it is built and fitted in one expression.
sbs = SequentialFeatureSelector(
    clf,
    n_features_to_select='auto',
    direction='backward',
).fit(X_train, y_train)

# Names of the features the selector kept
selected_feature_names = feature_names[sbs.get_support()]

# Reduce the training and testing data to the selected features only
X_train_selected, X_test_selected = (sbs.transform(split) for split in (X_train, X_test))

# Fit on the reduced training data, predict on the reduced testing data, and score
y_pred_selected = clf.fit(X_train_selected, y_train).predict(X_test_selected)
accuracy_selected = accuracy_score(y_test, y_pred_selected)

print('Number of selected features:', len(selected_feature_names))
print('Selected features:', ', '.join(selected_feature_names))
print("Subset accuracy:", accuracy_selected)

# Baseline: refit the same estimator on all features for comparison.
# Note: after this line clf is the model trained on all features.
y_pred_all_features = clf.fit(X_train, y_train).predict(X_test)
accuracy_all_features = accuracy_score(y_test, y_pred_all_features)
print("All Features Accuracy:", accuracy_all_features)

Number of selected features: 15
Selected features: smoothness error, compactness error, concavity error, concave points error, symmetry error, fractal dimension error, worst radius, worst texture, worst perimeter, worst smoothness, worst compactness, worst concavity, worst concave points, worst symmetry, worst fractal dimension
Subset accuracy: 0.9473684210526315
All Features Accuracy: 0.9122807017543859
