In [1]:
import matplotlib.pyplot as plt
import numpy as np

import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load scikit-learn's bundled breast-cancer dataset; the cells below read its
# `data`, `target`, and `target_names` attributes.
data = load_breast_cancer()

In [3]:
# Human-readable class names as a plain Python list, echoed for inspection.
# presumably position i names the class encoded as label i in `data.target` — confirm against the scikit-learn docs
class_names = list(data.target_names)
class_names

['malignant', 'benign']

In [4]:
# Feature matrix and label vector taken from the loaded dataset.
X, Y = data.data, data.target

# Report the dimensions of both arrays as a quick sanity check.
for caption, array in (("X", X), ("Y", Y)):
    print(caption, array.shape)

X (569, 30)
Y (569,)


Split the dataset into a training set (80%) and a test set (20%) using the hold-out method: a single random split rather than k-fold cross-validation.

In [5]:
# Hold-out split settings: fraction of samples reserved for testing and the
# seed that makes the shuffle reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 7720

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Print the shape of every resulting array, train split first.
for caption, array in (
    ("[train] X", X_train),
    ("[train] Y", Y_train),
    ("[test] X", X_test),
    ("[test] Y", Y_test),
):
    print(caption, array.shape)

[train] X (455, 30)
[train] Y (455,)
[test] X (114, 30)
[test] Y (114,)


In [6]:
# k-nearest-neighbours classifier configured with 5 neighbours.
# The repr shown after fitting prints no arguments, which suggests 5 is also the library default.
knn = KNeighborsClassifier(n_neighbors=5)

In [7]:
# Fit on the training split only; X_test / Y_test stay untouched until evaluation.
# NOTE(review): the features are passed in unscaled. k-NN is distance-based, so
# standardising the features may change the result — worth checking.
knn.fit(X_train, Y_train)

KNeighborsClassifier()

In [8]:
# Predicted class labels for the held-out test samples (all 30 features).
Y_pred = knn.predict(X_test)

In [9]:
# Test-set accuracy: the mean of the boolean "prediction matches truth" mask
# equals the fraction of correctly classified samples.
np.mean(Y_pred == Y_test)

0.9122807017543859

In [10]:
# Confusion matrix with the true labels passed first and the predictions second.
# Per the scikit-learn docs, rows are true classes and columns are predicted classes.
confusion_matrix(Y_test, Y_pred)

array([[39,  6],
       [ 4, 65]], dtype=int64)

In [11]:
# Univariate feature selection: keep the N_SELECTED_FEATURES columns with the
# highest chi-squared score against the class labels.
# `SelectKBest` and `chi2` are imported in the notebook's first (imports) cell.
# NOTE(review): chi2 expects non-negative feature values (see the scikit-learn docs).
N_SELECTED_FEATURES = 2

kbest = SelectKBest(chi2, k=N_SELECTED_FEATURES)

# Fit the selector on the training split only, then apply the same column
# choice to the test split, so no test data influences which features are kept.
X_train_new = kbest.fit_transform(X_train, Y_train)
X_test_new = kbest.transform(X_test)

In [12]:
# Train a fresh 5-neighbour classifier on the two selected features.
# NOTE(review): this rebinds `knn`, discarding the 30-feature model fitted earlier;
# re-running the earlier predict cell after this one would use this 2-feature model.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_new, Y_train)
knn

KNeighborsClassifier()

In [13]:
# Sanity check: both splits should now carry only the selected columns.
tuple(split.shape for split in (X_train_new, X_test_new))

((455, 2), (114, 2))

In [14]:
# Predicted labels for the test split using only the two selected features.
# NOTE(review): this echoes all 114 predictions and does not store them; the
# accuracy cell that follows calls predict again.
knn.predict(X_test_new)

array([1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0])

In [15]:
# Accuracy of the 2-feature model on the test split: fraction of predictions
# that match the true labels.
np.mean(knn.predict(X_test_new) == Y_test)

0.9210526315789473