Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

#CNN tools
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
tf.keras.backend.set_floatx('float64')
#random_state_seed = 0
#tf.random.set_seed(random_state_seed)
#np.random.seed(random_state_seed)

#tools and metrics
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score


Simple data preprocessing without feature selection

In [2]:
df = pd.read_csv('breast-cancer.csv')
                 
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int) 

In [4]:
X_cols = df.columns.to_list()
X_cols.remove('id')
X_cols.remove('diagnosis')
X = df[X_cols]
y = df['diagnosis']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=10)

In [6]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test)

Creating models

In [7]:
def train_model(model, X_train, y_train, X_test,y_test, model_name):

    model.fit(X_train, y_train)  

    y_pred = model.predict(X_test)

    accuracy            = accuracy_score(y_test, y_pred)
    f1                  = f1_score(y_test, y_pred)
    precision           = precision_score(y_test, y_pred)
    recall              = recall_score(y_test, y_pred)

    result_df = pd.DataFrame([[accuracy, f1, precision, recall]], columns=['accuracy', 'f1_score', 'precision', 'recall'])
    result_df.index = [model_name]
    return result_df

In [8]:
tree = DecisionTreeClassifier(criterion="entropy")

results = train_model(tree, X_train, y_train, X_test, y_test, 'Decision_Tree')

In [9]:
forest_100 = RandomForestClassifier(n_estimators=100,criterion="gini")

results = results.append(train_model(forest_100, X_train, y_train, X_test, y_test, 'Random_Forest')) 

In [10]:
KNN_5 = KNeighborsClassifier(n_neighbors=5)

results = results.append(train_model(KNN_5, X_train, y_train, X_test, y_test, 'KNN_5')) 

In [11]:
KNN_10 = KNeighborsClassifier(n_neighbors=10)

results = results.append(train_model(KNN_10, X_train, y_train, X_test, y_test, 'KNN_10')) 

In [12]:
bayes = GaussianNB()

results = results.append(train_model(bayes, X_train, y_train, X_test, y_test, 'Naive_Bayes')) 

In [13]:
svm = SGDClassifier(loss= "hinge")

results = results.append(train_model(svm, X_train, y_train, X_test, y_test, 'SVM')) 

In [14]:
simple_ann = Sequential()
simple_ann.add(Dense(units=10,activation='relu',))
simple_ann.add(Dense(units=1,activation='sigmoid'))
simple_ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
simple_ann.fit(X_train, y_train, batch_size=32, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1deac1d9e50>

In [15]:
probability_pred = simple_ann.predict(X_test)
y_pred = (probability_pred > 0.4)
accuracy =          accuracy_score(y_test, y_pred)
f1 =                f1_score(y_test, y_pred)
precision =         precision_score(y_test, y_pred)
recall =            recall_score(y_test, y_pred)
simple_ann_result = pd.DataFrame([[accuracy, f1, precision, recall]], columns=['accuracy', 'f1_score', 'precision', 'recall'])
simple_ann_result.index = ['Simple_Neural_Network']
results = results.append(simple_ann_result)

In [16]:
complex_ann = Sequential()
complex_ann.add(Dense(units=32,activation='relu'))
complex_ann.add(Dense(units=8,activation='relu'))
complex_ann.add(Dense(units=1,activation='sigmoid'))
complex_ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
complex_ann.fit(X_train, y_train, batch_size=32, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1dead67e970>

In [17]:
probability_pred = complex_ann.predict(X_test)
y_pred = (probability_pred > 0.4)
accuracy =          accuracy_score(y_test, y_pred)
f1 =                f1_score(y_test, y_pred)
precision =         precision_score(y_test, y_pred)
recall =            recall_score(y_test, y_pred)
complex_ann_result = pd.DataFrame([[accuracy, f1, precision, recall]], columns=['accuracy', 'f1_score', 'precision', 'recall'])
complex_ann_result.index = ['Complex_Neural_Network']
results = results.append(complex_ann_result)

In [18]:
results.style.background_gradient(cmap = sns.color_palette("flare", as_cmap=True))

Unnamed: 0,accuracy,f1_score,precision,recall
Decision_Tree,0.94152,0.918033,0.888889,0.949153
Random_Forest,0.982456,0.975207,0.951613,1.0
KNN_5,0.982456,0.974359,0.982759,0.966102
KNN_10,0.964912,0.947368,0.981818,0.915254
Naive_Bayes,0.947368,0.92562,0.903226,0.949153
SVM,0.935673,0.910569,0.875,0.949153
Simple_Neural_Network,0.982456,0.97479,0.966667,0.983051
Complex_Neural_Network,0.982456,0.97479,0.966667,0.983051


Second attempt with more data preprocessing, with feature selection based on correlation

In [19]:
df = pd.read_csv('breast-cancer.csv')

In [20]:
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int) 
df = df.drop('id',axis=1)

In [21]:
correlations = df.corr()['diagnosis']
correlations = correlations.drop('diagnosis',axis=0).rename('correlation with diagnosis')
abs_correlations = abs(correlations.rename('abs correlation with diagnosis'))
all_correlations = pd.concat([correlations,abs_correlations],axis=1)
pd.set_option("display.max_rows", 40)
all_correlations.sort_values(by='abs correlation with diagnosis', ascending=False)


Unnamed: 0,correlation with diagnosis,abs correlation with diagnosis
concave points_worst,0.793566,0.793566
perimeter_worst,0.782914,0.782914
concave points_mean,0.776614,0.776614
radius_worst,0.776454,0.776454
perimeter_mean,0.742636,0.742636
area_worst,0.733825,0.733825
radius_mean,0.730029,0.730029
area_mean,0.708984,0.708984
concavity_mean,0.69636,0.69636
concavity_worst,0.65961,0.65961


Choosing most relevent features

In [22]:
relevant_features = abs_correlations[abs_correlations>0.1] 
relevant_names = [index for index, _ in relevant_features.iteritems()]

In [23]:
X = df[relevant_names]
y = df['diagnosis']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=10)


In [25]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
tree = DecisionTreeClassifier(criterion="entropy")

results2 = train_model(tree, X_train, y_train, X_test, y_test, 'Decision_Tree')

In [27]:
forest_100 = RandomForestClassifier(n_estimators=100,criterion="gini")

results2 = results2.append(train_model(forest_100, X_train, y_train, X_test, y_test, 'Random_Forest')) 

In [28]:
KNN_5 = KNeighborsClassifier(n_neighbors=5)

results2 = results2.append(train_model(KNN_5, X_train, y_train, X_test, y_test, 'KNN_5')) 

In [29]:
KNN_10 = KNeighborsClassifier(n_neighbors=10)

results2 = results2.append(train_model(KNN_10, X_train, y_train, X_test, y_test, 'KNN_10')) 

In [30]:
bayes = GaussianNB()

results2 = results2.append(train_model(bayes, X_train, y_train, X_test, y_test, 'Naive_Bayes')) 

In [31]:
svm = SGDClassifier(loss= "hinge")

results2 = results2.append(train_model(svm, X_train, y_train, X_test, y_test, 'SVM')) 

In [32]:
simple_ann = Sequential()
simple_ann.add(Dense(units=10,activation='relu',))
simple_ann.add(Dense(units=1,activation='sigmoid'))
simple_ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
simple_ann.fit(X_train, y_train, batch_size=32, epochs=200)

probability_pred = simple_ann.predict(X_test)
y_pred = (probability_pred > 0.4)
accuracy =          accuracy_score(y_test, y_pred)
f1 =                f1_score(y_test, y_pred)
precision =         precision_score(y_test, y_pred)
recall =            recall_score(y_test, y_pred)
simple_ann_result = pd.DataFrame([[accuracy, f1, precision, recall]], columns=['accuracy', 'f1_score', 'precision', 'recall'])
simple_ann_result.index = ['Simple_Neural_Network']
results2 = results2.append(simple_ann_result)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [33]:
complex_ann = Sequential()
complex_ann.add(Dense(units=32,activation='relu'))
complex_ann.add(Dense(units=8,activation='relu'))
complex_ann.add(Dense(units=1,activation='sigmoid'))
complex_ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
complex_ann.fit(X_train, y_train, batch_size=32, epochs=200)

probability_pred = complex_ann.predict(X_test)
y_pred = (probability_pred > 0.4)
accuracy =          accuracy_score(y_test, y_pred)
f1 =                f1_score(y_test, y_pred)
precision =         precision_score(y_test, y_pred)
recall =            recall_score(y_test, y_pred)
complex_ann_result = pd.DataFrame([[accuracy, f1, precision, recall]], columns=['accuracy', 'f1_score', 'precision', 'recall'])
complex_ann_result.index = ['Complex_Neural_Network']
results2 = results2.append(complex_ann_result)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [34]:
results2.style.background_gradient(cmap = sns.color_palette("flare", as_cmap=True))

Unnamed: 0,accuracy,f1_score,precision,recall
Decision_Tree,0.947368,0.92562,0.903226,0.949153
Random_Forest,0.982456,0.975207,0.951613,1.0
KNN_5,0.988304,0.983333,0.967213,1.0
KNN_10,0.964912,0.947368,0.981818,0.915254
Naive_Bayes,0.94152,0.916667,0.901639,0.932203
SVM,0.953216,0.935484,0.892308,0.983051
Simple_Neural_Network,0.976608,0.966667,0.95082,0.983051
Complex_Neural_Network,0.982456,0.97479,0.966667,0.983051


Use **Sequential Feature Selection** for choose best features for KNN model 

In [35]:
df = pd.read_csv('breast-cancer.csv')
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int) 
df = df.drop('id',axis=1)
X_cols = df.columns.to_list()
X_cols.remove('diagnosis')
X = df[X_cols]
y = df['diagnosis']
scaler = StandardScaler() 
scaler.fit(X) 
X.iloc[:,:] = scaler.fit_transform(X.iloc[:,:])

sfs = SFS(KNeighborsClassifier(n_neighbors=5),
           k_features=30,
           forward=True,
           floating=False,
           scoring = 'recall',
           cv = 0)

sfs.fit(X, y)

df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
1,"(7,)",[0.8867924528301887],0.886792,"(concave points_mean,)"
2,"(7, 23)",[0.9292452830188679],0.929245,"(concave points_mean, area_worst)"
3,"(6, 7, 23)",[0.9669811320754716],0.966981,"(concavity_mean, concave points_mean, area_worst)"
4,"(6, 7, 20, 23)",[0.9716981132075472],0.971698,"(concavity_mean, concave points_mean, radius_w..."
5,"(6, 7, 17, 20, 23)",[0.9669811320754716],0.966981,"(concavity_mean, concave points_mean, concave ..."
6,"(6, 7, 17, 20, 23, 27)",[0.9622641509433962],0.962264,"(concavity_mean, concave points_mean, concave ..."
7,"(6, 7, 8, 17, 20, 23, 27)",[0.9622641509433962],0.962264,"(concavity_mean, concave points_mean, symmetry..."
8,"(6, 7, 8, 15, 17, 20, 23, 27)",[0.9622641509433962],0.962264,"(concavity_mean, concave points_mean, symmetry..."
9,"(6, 7, 8, 12, 15, 17, 20, 23, 27)",[0.9575471698113207],0.957547,"(concavity_mean, concave points_mean, symmetry..."
10,"(6, 7, 8, 12, 14, 15, 17, 20, 23, 27)",[0.9622641509433962],0.962264,"(concavity_mean, concave points_mean, symmetry..."


In [36]:
relevant_names = list(df_SFS_results['feature_names'][1])
X_rel = df[relevant_names]
y_rel = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X_rel, y_rel, test_size = 0.3,random_state=10)
scaler = StandardScaler() 
scaler.fit(X_train) 

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test) 
KNN = KNeighborsClassifier(n_neighbors=5)
results3 = train_model(KNN, X_train, y_train, X_test, y_test, '1')


for i in range(2,31):
    relevant_names = list(df_SFS_results['feature_names'][i])
    X_rel = df[relevant_names]
    y_rel = df['diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(X_rel, y_rel, test_size = 0.3,random_state=30)
    scaler = StandardScaler() 
    scaler.fit(X_train) 

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test) 
    KNN = KNeighborsClassifier(n_neighbors=5)
    results3 = results3.append(train_model(KNN, X_train, y_train, X_test, y_test, str(i)))

results3.style.background_gradient(cmap = sns.color_palette("flare", as_cmap=True))

Unnamed: 0,accuracy,f1_score,precision,recall
1,0.935673,0.905983,0.913793,0.898305
2,0.94152,0.919355,0.919355,0.919355
3,0.923977,0.896,0.888889,0.903226
4,0.935673,0.912,0.904762,0.919355
5,0.953216,0.935484,0.935484,0.935484
6,0.94152,0.916667,0.948276,0.887097
7,0.97076,0.957983,1.0,0.919355
8,0.97076,0.957983,1.0,0.919355
9,0.97076,0.957983,1.0,0.919355
10,0.964912,0.95,0.982759,0.919355


Show order how features were selected

In [37]:
rank = [df_SFS_results['feature_names'][1][0]]
for i in range(2,31):
    rank.append((list(set(df_SFS_results['feature_names'][i]) - set(df_SFS_results['feature_names'][i-1]))[0]))

pd.DataFrame(rank,columns=['feature'],index=[i for i in range(1,31)])

Unnamed: 0,feature
1,concave points_mean
2,area_worst
3,concavity_mean
4,radius_worst
5,concave points_se
6,concave points_worst
7,symmetry_mean
8,compactness_se
9,perimeter_se
10,smoothness_se


Use **Sequential Feature Eimination** for choose best features for KNN model 

In [38]:
df = pd.read_csv('breast-cancer.csv')
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int) 
df = df.drop('id',axis=1)
X_cols = df.columns.to_list()
X_cols.remove('diagnosis')
X = df[X_cols]
y = df['diagnosis']
scaler = StandardScaler() 
scaler.fit(X) 
X.iloc[:,:] = scaler.fit_transform(X.iloc[:,:])

sfs = SFS(KNeighborsClassifier(n_neighbors=5),
           k_features=1,
           forward=False,
           floating=False,
           scoring = 'recall',
           cv = 0)

sfs.fit(X, y)

df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
30,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0.9528301886792453],0.95283,"(radius_mean, texture_mean, perimeter_mean, ar..."
29,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0.9575471698113207],0.957547,"(radius_mean, texture_mean, perimeter_mean, ar..."
28,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...",[0.9575471698113207],0.957547,"(radius_mean, texture_mean, perimeter_mean, ar..."
27,"(0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15...",[0.9575471698113207],0.957547,"(radius_mean, texture_mean, perimeter_mean, ar..."
26,"(0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 14, 15, 1...",[0.9575471698113207],0.957547,"(radius_mean, texture_mean, perimeter_mean, ar..."
25,"(0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 14, 15, 1...",[0.9528301886792453],0.95283,"(radius_mean, texture_mean, perimeter_mean, ar..."
24,"(0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 14, 15, 1...",[0.9575471698113207],0.957547,"(radius_mean, texture_mean, perimeter_mean, ar..."
23,"(0, 1, 2, 3, 4, 6, 8, 9, 10, 12, 14, 15, 16, 1...",[0.9669811320754716],0.966981,"(radius_mean, texture_mean, perimeter_mean, ar..."
22,"(0, 1, 2, 3, 4, 6, 8, 9, 10, 12, 14, 15, 16, 1...",[0.9716981132075472],0.971698,"(radius_mean, texture_mean, perimeter_mean, ar..."
21,"(0, 1, 2, 3, 4, 6, 8, 9, 10, 12, 14, 15, 16, 1...",[0.9716981132075472],0.971698,"(radius_mean, texture_mean, perimeter_mean, ar..."


In [39]:
relevant_names = list(df_SFS_results['feature_names'][30])
X_rel = df[relevant_names]
y_rel = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X_rel, y_rel, test_size = 0.3,random_state=10)
scaler = StandardScaler() 
scaler.fit(X_train) 

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test) 
KNN = KNeighborsClassifier(n_neighbors=5)
results4 = train_model(KNN, X_train, y_train, X_test, y_test, '30')


for i in range(29,0,-1):
    relevant_names = list(df_SFS_results['feature_names'][i])
    X_rel = df[relevant_names]
    y_rel = df['diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(X_rel, y_rel, test_size = 0.3,random_state=10)
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    KNN = KNeighborsClassifier(n_neighbors=5)
    results4 = results4.append(train_model(KNN, X_train, y_train, X_test, y_test, str(i)))

results4.style.background_gradient(cmap = sns.color_palette("flare", as_cmap=True))

Unnamed: 0,accuracy,f1_score,precision,recall
30,0.982456,0.974359,0.982759,0.966102
29,0.982456,0.974359,0.982759,0.966102
28,0.982456,0.974359,0.982759,0.966102
27,0.982456,0.974359,0.982759,0.966102
26,0.976608,0.965517,0.982456,0.949153
25,0.976608,0.965517,0.982456,0.949153
24,0.97076,0.956522,0.982143,0.932203
23,0.97076,0.957265,0.965517,0.949153
22,0.964912,0.948276,0.964912,0.932203
21,0.964912,0.948276,0.964912,0.932203


Show order how features were rejected

In [40]:
rank = [df_SFS_results['feature_names'][1][0]]
for i in range(2,31):
    rank.append((list(set(df_SFS_results['feature_names'][i]) - set(df_SFS_results['feature_names'][i-1]))[0]))

pd.DataFrame(rank,columns=['feature'],index=[i for i in range(1,31)]).sort_index(ascending=False)

Unnamed: 0,feature
30,fractal_dimension_se
29,area_se
28,concave points_mean
27,compactness_mean
26,symmetry_worst
25,concave points_worst
24,texture_se
23,concave points_se
22,perimeter_worst
21,fractal_dimension_worst
