In [11]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
%matplotlib inline

## k-nearest neighbors (KNN)
- Use the breast cancer dataset bundled with scikit-learn: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer


In [7]:
# Load the breast cancer dataset as a Bunch whose members are pandas objects
# (as_frame=True), then pull out the feature frame and the target series.
cancer = load_breast_cancer(as_frame=True)
cancer_x, cancer_y = cancer.data, cancer.target

In [8]:
# Summarize the feature frame: column names, non-null counts and dtypes.
cancer_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [12]:
# Find the best hyper-parameter for n_neighbors

# Split BEFORE scaling so that the held-out test rows never influence the
# scaler's mean/std. Fitting the scaler on the full dataset would leak
# test-set statistics into training and model selection.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    cancer_x, cancer_y, test_size=0.2, random_state=3
)

# Standardize: learn mean/std on the training split only, then apply that
# same transform to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-split statistics; kept so the
# name `cancer_x_norm` remains available to any other cell that uses it.
cancer_x_norm = scaler.transform(cancer_x)

# Candidate neighbour counts: np.arange(1, 50) yields the 49 values 1..49
parameters = {'n_neighbors': np.arange(1, 50)}

# Instantiate the knn
knn = KNeighborsClassifier()

# Create the grid search over the hyper-parameter grid (5-fold CV).
# NOTE(review): the CV folds still share one scaler fitted on the whole
# training split; wrapping scaler + KNN in a sklearn Pipeline would remove
# that remaining (smaller) leak.
knn_cv = GridSearchCV(knn, param_grid=parameters, cv=5)

# Fit on the training split only
knn_cv.fit(X_train, Y_train)

# Best n_neighbors found by the search
print(knn_cv.best_params_)

# Mean cross-validated score of the best n_neighbors
print(knn_cv.best_score_)

{'n_neighbors': 4}
0.9736263736263737


In [18]:
# Read the selected n_neighbors from the grid search instead of hard-coding
# it, so this cell stays in sync whenever the search is re-run.
best_k = knn_cv.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)

# Train the model on the training split
knn.fit(X_train, Y_train)

# Predict labels for the held-out test split
y_predict = knn.predict(X_test)

# Mean accuracy on each split (KNeighborsClassifier.score returns the mean
# accuracy on the given data and labels)
print("Accuracy on the test set is %.2f"%knn.score(X_test, Y_test))
print("Accuracy on the train set is %.2f \n"%knn.score(X_train, Y_train))

# Per-class precision / recall / f1: true test labels vs. predicted labels
report = classification_report(Y_test, y_predict)

print(report)

Accuracy on the test set is 0.95
Accuracy on the train set is 0.99 

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        40
           1       0.96      0.96      0.96        74

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114

