In [None]:
# Use sequential feature selection to select the best features for the cancer dataset using k-nearest neighbors
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.base import clone
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the breast cancer dataset bundled with scikit-learn and pull out
# the feature matrix and the class labels
data = datasets.load_breast_cancer()
X, y = data.data, data.target

# Hold out 40% of the samples as a test set; the fixed random_state makes
# the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline: a 5-nearest-neighbours classifier fitted on the standardized
# training data
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Predict on both splits and report the train/test accuracy side by side
y_pred_train = knn.predict(X_train_scaled)
y_pred = knn.predict(X_test_scaled)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)
print(
    "The accuracy of the k-NN classifier on the train/test set is"
    f" {train_accuracy:.2f}/{test_accuracy:.2f}"
)

print("\nSequential feature selection...")

# The feature names do not change between iterations, so build the array once
feature_names = np.asarray(data.feature_names)

# Run forward sequential feature selection for every subset size from 1 to 9
# and report the k-NN accuracy obtained with each selected subset
for num_features in range(1, 10):
    sfs = SequentialFeatureSelector(knn, n_features_to_select=num_features, direction="forward", n_jobs=4)
    sfs = sfs.fit(X_train_scaled, y_train)

    # Print the selected features
    feature_mask = sfs.get_support()
    selected_features = feature_names[feature_mask]
    print(f"-- The {num_features} selected features are:", selected_features)

    # Transform the data to include only the selected features
    X_train_sfs = sfs.transform(X_train_scaled)
    X_test_sfs = sfs.transform(X_test_scaled)

    # Train a fresh copy of the classifier on the selected features.
    # Fitting `knn` itself here would overwrite the baseline model, leaving it
    # fitted on a reduced feature subset that no longer matches
    # X_train_scaled / X_test_scaled once the loop has finished.
    knn_sfs = clone(knn)
    knn_sfs.fit(X_train_sfs, y_train)
    y_pred = knn_sfs.predict(X_test_sfs)
    y_pred_train = knn_sfs.predict(X_train_sfs)
    print(
        "  --> The accuracy of the k-NN classifier on the train/test set is"
        f" {accuracy_score(y_train, y_pred_train):.2f}/{accuracy_score(y_test, y_pred):.2f}"
    )
