Import necessary libraries

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

Import dataset

In [None]:
# Load the breast cancer dataset from the CSV file into a DataFrame
cancer = pd.read_csv(filepath_or_buffer='breastcancer.csv')

In [None]:
#View dataset
#Show only the first 25 rows instead of the whole frame: by default pandas
#truncates the display of a long DataFrame, which can hide the row of interest.
#Notice that in row 23, under the "bare_nucleoli" column, there is a "?" in the data
cancer.head(25)

In [None]:
#Count how often each value occurs in every feature column, so that any column
#containing "?" stands out. Value/column pairs that never occur are shown as 0.
#The id column (first) and the target column (last) are excluded.
#pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
cancer.iloc[:, 1:-1].apply(pd.Series.value_counts).fillna(0)

In [None]:
#Replace the question marks in the data set with zeroes
#(reassign instead of mutating in place so the cell can be re-run safely)
cancer = cancer.replace('?', 0)
#After the replacement "bare_nucleoli" would otherwise mix strings and the
#integer 0; convert it so the column holds plain numbers
cancer['bare_nucleoli'] = pd.to_numeric(cancer['bare_nucleoli'])

In [None]:
#Tally the values left in "bare_nucleoli" to confirm that no "?" remains
bare_nucleoli_counts = cancer['bare_nucleoli'].value_counts()
bare_nucleoli_counts

Dividing dataset into independent and dependent variables

In [None]:
# Independent variables: every column except the first and the last
feature_columns = cancer.iloc[:, 1:-1]
# Dependent variable: the last column
target_column = cancer.iloc[:, -1]

X_cancer, y_cancer = feature_columns.values, target_column.values

Splitting dataset into train and test sets

In [None]:
# Fixed random_state makes the shuffled train/test partition repeatable
split_parts = train_test_split(X_cancer, y_cancer, random_state=0)
X_train, X_test, y_train, y_test = split_parts

Training the algorithm

In [None]:
# Build a 5-nearest-neighbours classifier, then fit it on the training split
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train);

Testing the algorithm

In [None]:
# Predict labels for the test split and line them up against the true labels
y_pred = knn.predict(X_test)
compare = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))

# Show the first five actual/predicted pairs
compare.head(5)

In [None]:
# For a classifier, score() returns the mean accuracy on the given data and
# labels (it is not an R-squared value), so the output is labelled accordingly.
print('Accuracy (training): {:.3f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy (test): {:.3f}'
     .format(knn.score(X_test, y_test)))

### How sensitive is k-NN classification accuracy to the choice of the 'k' parameter?

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

k_range = range(1,20) 
scores = [] 

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k) 
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test)) #adds calculated score to the list

plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20]);

### How sensitive is k-NN classification accuracy to the train/test split proportion?

In [None]:
import numpy as np

# Number of random splits averaged for each training proportion
N_REPEATS = 1000

# Training-set proportions to evaluate, as fractions of the full dataset
t = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]

knn = KNeighborsClassifier(n_neighbors = 5)

# Seeded generator: every split is still different, but the whole experiment
# (and therefore the plot) is reproducible from a fresh kernel
rng = np.random.RandomState(0)

plt.figure()

for s in t:

    scores = []
    for i in range(N_REPEATS): #exactly N_REPEATS random splits per proportion
        # Split-local names keep the X_train/X_test/y_train/y_test split created
        # earlier in the notebook intact. train_size = s is passed directly to
        # avoid the floating-point error of computing 1 - s.
        X_tr, X_te, y_tr, y_te = train_test_split(X_cancer, y_cancer, train_size = s, random_state = rng)
        knn.fit(X_tr, y_tr)
        scores.append(knn.score(X_te, y_te)) #adds calculated score to the list
    plt.plot(s, np.mean(scores), 'bo') #plots the mean score over all splits for this proportion

# The x values are fractions (0.2 .. 0.8), not percentages
plt.xlabel('Training set proportion')
plt.ylabel('accuracy');