In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import pandas as pd
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/knn-algorithm-dataset/KNNAlgorithmDataset.csv


### Initial Data Exploration and Cleaning Dataset

In [2]:
df = pd.read_csv('/kaggle/input/knn-algorithm-dataset/KNNAlgorithmDataset.csv')
print(df.info())
df = df.drop('Unnamed: 32',axis=1)
print('describe output: \n', df.describe())
print('-------------------------------------')
print('columns: ', list(df.columns))
print('-------------------------------------')
print('unique diagnoses: ', df.diagnosis.unique())
print('diagnosis counts', df.diagnosis.value_counts())

df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0})

print("Number of null values: ", df['diagnosis'].isnull().sum())
print("ratio of null values: ", df.diagnosis.isnull().sum()/df.diagnosis.count())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

### Separating Training and Test, Predicting And Errors

In [3]:


X = df.drop('diagnosis',axis=1)
y = df['diagnosis']
print("does y have null values? ", y.isnull().sum())

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


does y have null values?  0
Accuracy:  0.9385964912280702

Confusion Matrix:
 [[71  1]
 [ 6 36]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.95        72
           1       0.97      0.86      0.91        42

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



Initially, with k = 3, and using all features, an accuracy of 93.8% is achieved. **Six** people would inaccurately be told they don't have cancer. So we want to minimize the recall for label '1'

### Optimizing the k value

Experiment with several k values to see how it affects recall

In [4]:
#fill up k values
k_values = []
for i in range(1,25,2):
    k_values.append(i)
best_recall = 0
best_k = 0
accuracy_for_best_recall = 0
for k in k_values:
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    
    y_pred = knn.predict(X_test)

    recall_m = recall_score(y_test, y_pred, pos_label=1)
    accuracy = accuracy_score(y_test, y_pred)
    print('K: ', k, '      Recall: ', recall_m, '           Accuracy: ', accuracy)

    
    if recall_m > best_recall:
        best_recall = recall_m
        best_k = k
        accuracy_for_best_recall = accuracy
        
print('Best k value: ', best_k)
print('Best recall value: ', best_recall)
print('Accuracy corresponding to best recall: ', accuracy_for_best_recall)
        
    

K:  1       Recall:  0.8809523809523809            Accuracy:  0.9298245614035088
K:  3       Recall:  0.8571428571428571            Accuracy:  0.9385964912280702
K:  5       Recall:  0.9047619047619048            Accuracy:  0.956140350877193
K:  7       Recall:  0.9047619047619048            Accuracy:  0.956140350877193
K:  9       Recall:  0.8809523809523809            Accuracy:  0.9473684210526315
K:  11       Recall:  0.8809523809523809            Accuracy:  0.9473684210526315
K:  13       Recall:  0.8809523809523809            Accuracy:  0.9473684210526315
K:  15       Recall:  0.8809523809523809            Accuracy:  0.9473684210526315
K:  17       Recall:  0.8571428571428571            Accuracy:  0.9385964912280702
K:  19       Recall:  0.8571428571428571            Accuracy:  0.9385964912280702
K:  21       Recall:  0.8571428571428571            Accuracy:  0.9385964912280702
K:  23       Recall:  0.8571428571428571            Accuracy:  0.9385964912280702
Best k value:  5
Best r

So it looks like k = 5, and k = 7 give the best recall, and incidentally the best accuracy