In [None]:
import numpy as np

from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset bundled with scikit-learn, then unpack
# the feature matrix (X) and the class labels (y) in one step.
data = load_breast_cancer()
X, y = data.data, data.target

In [None]:
from sklearn.model_selection import ShuffleSplit
# Single shuffled 80/20 hold-out split; the fixed seed makes it reproducible.
ss = ShuffleSplit(n_splits=1, train_size=0.8, test_size=0.2, random_state=0)
train_index, test_index = next(ss.split(X, y))

# Materialise each partition from the index arrays returned by the splitter.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [None]:
# Display the feature names shipped with the dataset.
data.feature_names

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Univariate feature selector: score each feature with chi2 and keep the 20 best.
skb = SelectKBest(chi2, k=20)

In [None]:
# Fit the selector on the training split only, so the test split plays no part.
skb.fit(X_train, y_train)

In [None]:
# Reduce the training matrix to the columns chosen by the fitted selector.
X_train_new = skb.transform(X_train)

In [None]:
# Compare the shape after selection with the shape before it.
X_train_new.shape, X_train.shape

In [None]:
# Boolean mask over the input features: True where the feature was kept.
skb.get_support()

In [None]:
# Names of the features kept by the selector.
data.feature_names[skb.get_support()]

In [None]:
# Names of the features discarded by the selector (inverted mask).
data.feature_names[~skb.get_support()]

In [None]:
from sklearn import linear_model
# Classifier reused by the following cells, built with the library's default settings.
clf = linear_model.LogisticRegression()

In [None]:
from sklearn.model_selection import StratifiedKFold

# Model selection: for every candidate k, estimate by 10-fold stratified
# cross-validation how well "chi2 SelectKBest(k) -> LogisticRegression"
# performs. Only the training split is used; the test split stays untouched.
k_range = np.arange(1, X_train.shape[1] + 1)  # every possible number of features
scores = []  # mean validation accuracy per k
std = []     # standard deviation of the validation accuracy per k

# The folds do not depend on k (fixed random_state), so build the splitter once.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

for k in k_range:

    score = []
    for fit_index, val_index in cv.split(X_train, y_train):

        # split() yields positions within X_train / y_train, so those are the
        # arrays to index. Indexing the full X / y would pick the wrong rows
        # and leak held-out test samples into the search.
        X_train2, X_val = X_train[fit_index], X_train[val_index]
        y_train2, y_val = y_train[fit_index], y_train[val_index]

        # Fit the selector inside the fold so the validation rows never
        # influence which features are chosen.
        skb = SelectKBest(chi2, k=k)
        skb.fit(X_train2, y_train2)

        X_new_train2 = skb.transform(X_train2)
        X_new_val = skb.transform(X_val)

        clf.fit(X_new_train2, y_train2)
        score.append(clf.score(X_new_val, y_val))

    scores.append(np.mean(score))
    std.append(np.std(score))

scores = np.array(scores)
std = np.array(std)

In [None]:
# Mean cross-validation score for each k in k_range.
scores

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 

# Mean CV accuracy against the number of selected features; the second call
# overlays the same curve with error bars of one standard deviation.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.errorbar(k_range, scores, yerr=std)
ax.set_ylabel("accuracy")

In [None]:
# Same results shown as error rate (1 - accuracy). The error bars are
# one-sided: zero length below each bar, one standard deviation above.
error_rate = 1 - scores
one_sided_std = [np.zeros_like(std), std]  # [lower, upper] bar lengths
fig, ax = plt.subplots()
ax.bar(k_range, error_rate, yerr=one_sided_std)
ax.set_ylabel("error rate")

In [None]:
# Pick the k with the highest mean CV accuracy (the smallest such k on ties).
best_k = k_range[scores.argmax()]
best_k

In [None]:
# Rebuild the selector with the number of features stored in best_k.
skb = SelectKBest(chi2, k=best_k)

In [None]:
# Refit the selector on the whole training split.
skb.fit(X_train, y_train)

In [None]:
# Apply the fitted selector to both partitions; the test data is only
# transformed, never used for fitting.
X_train_best, X_test_best = (skb.transform(part) for part in (X_train, X_test))

In [None]:
# Train the classifier on the selected features of the training split.
clf.fit(X_train_best, y_train)

In [None]:
# Score on the held-out test split, using the selected features.
clf.score(X_test_best, y_test)

In [None]:
# Baseline: retrain on all features (the trailing ';' suppresses the cell output).
clf.fit(X_train, y_train);

In [None]:
# Test score of the all-features baseline, for comparison with the selected-feature model.
clf.score(X_test, y_test)