In [4]:
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel

Read in the dataframe. Before runnning SBS, apply MinMaxScaler and StandardScaler because we want the data to be almost normally distributed when fitting into the KNN model.

In [6]:
header_list = ['SpMax_L', 'J_Dz(e)', 'nHM', 'F01[N-N]', 'F04[C-N]','NssssC', 'nCb-', 'C%', 'nCp', 'nO', 'F03[C-N]',
               'SdssC', 'HyWi_B(m)', 'LOC', 'SM6_L', 'F03[C-O]', 'Me', 'Mi', 'nN-N', 'nArNO2', 'nCRX3', 'SpPosA_B(p)', 
              'nCIR', 'B01[C-Br]', 'B03[C-Cl]', 'N-073', 'SpMax_A', 'Psi_i_1d', 'B04[C-Br]', 'SdO' , 'TI2_L', 'nCrt',
               'C-026', 'F02[C-N]', 'nHDon', 'SpMax_B(m)', 'Psi_i_A', 'nN', 'SM6_B(m)', 'nArCOOR', 'nX', 'TARGET']
data = pd.read_csv('BioDegData.csv', names = header_list)

columns = data.columns
data.head()

X, y = data.iloc[:, 0:-1].values, data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Normalizing the features to [0,1] using MinMaxScaler
norm = MinMaxScaler()
X_train_norm = norm.fit_transform(X_train)
X_test_norm = norm.transform(X_test)

# Standardizing the features using StandardScaler
std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=4)

Sequential backward selection on all features using KNearestNeighbor and mlxtend feature selection tool SequentialFeatureSelector. Set forward to false in order to select features backwards.

In [7]:
sbs = SFS(knn, 
          k_features=12, 
          forward=False, 
          floating=False, 
          scoring='accuracy',
          cv=4,
          n_jobs=-1)
sbs = sbs.fit(X, y)

print('\nSequential Backward Selection (k=12):')
print(sbs.k_feature_idx_)
print('CV Score:')
print(sbs.k_score_)
sbs = sbs.fit(X, y)


STOPPING EARLY DUE TO KEYBOARD INTERRUPT...


Sequential Backward Selection (k=3):
(0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 31, 33, 35, 37, 38, 39, 40)
CV Score:
0.8663440488535545



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

Print out subsets of each feature with CV score & accuracy.

In [4]:
sbs.subsets_

{41: {'feature_idx': (0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40),
  'cv_scores': array([0.87878788, 0.82954545, 0.76425856, 0.83269962]),
  'avg_score': 0.8263228770595691,
  'feature_names': ('0',
   '1',
   '2',
   '3',
   '4',
   '5',
   '6',
   '7',
   '8',
   '9',
   '10',
   '11',
   '12',
   '13',
   '14',
   '15',
   '16',
   '17',
   '18',
   '19',
   '20',
   '21',
   '22',
   '23',
   '24',
   '25',
   '26',
   '27',
   '28',
   '29',
   '30',
   '31',
   '32',
   '33',
   '34',
   '35',
   '36',
   '37',
   '38',
   '39',
   '40')},
 40: {'feature_idx': (0,
   1,
   2,
   3,
   4,
   5,
   6,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30

Obtain the best features by using the API call on the SBS. 

In [5]:
sbs.k_feature_idx_

(2, 14, 37)

(http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/)