KNN Algorithm
Initial Implementation in separate blocks

In [1]:
import pandas as pd
import numpy as np

from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

dataset = pd.read_csv("../data/Cancer_Data.csv")

if 'Unnamed: 32' in dataset.columns:
    dataset.drop('Unnamed: 32', axis=1, inplace=True)
dataset['diagnosis'].replace(['B', 'M'],[0, 1], inplace=True) # B = 0, M = 1 

In [2]:
#x = dataset.drop(['diagnosis','id'],axis=1)
x = dataset.iloc[:, 2:12]
y = dataset['diagnosis']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
x

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883
...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016


In [3]:
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [4]:
k = sqrt(len(y_test)) #this is 13 but putting the variable gives errors

classifier = KNeighborsClassifier(n_neighbors=13, p=2, metric='euclidean')


In [5]:
classifier.fit(x_train, y_train)

In [6]:
y_pred = classifier.predict(x_test)

In [7]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.9370629370629371
0.92
0.9019607843137255
0.9108910891089109


Which attributes are better?
All, Mean, SE or Worst?

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score


def run_predictions(p1, p2):
    recall_score_list = []
    for i in range(0, 100):  
        dataset = pd.read_csv("../data/Cancer_Data.csv")

        if 'Unnamed: 32' in dataset.columns:
            dataset.drop('Unnamed: 32', axis=1, inplace=True)
        dataset['diagnosis'].replace(['B', 'M'],[0, 1], inplace=True) # B = 0, M = 1 

        x = dataset.iloc[:, p1:p2]
        y = dataset['diagnosis']

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

        sc_x = StandardScaler()
        x_train = sc_x.fit_transform(x_train)
        x_test = sc_x.transform(x_test)

        #default k = 5
        #default metric = minkowski
        #default weights = uniform
        #defualt algorithm = auto
        classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', weights='uniform', algorithm='auto')

        classifier.fit(x_train, y_train)

        y_pred = classifier.predict(x_test)

        recall_score_list.append(recall_score(y_test, y_pred))

    print("Avg Recall Score:   ", round(100 * (sum(recall_score_list) / len(recall_score_list)), 2), "%")

print("KNN with default values is running 100 times for each of these cases")
print("============================================")
print("Using all attributes")
run_predictions(2, 32)
print("============================================")
print("Using mean attributes")
run_predictions(2, 12)
print("============================================")
print("Using standard error attributes")
run_predictions(12, 22)
print("============================================")
print("Using worst attributes")
run_predictions(22, 32)
print("============================================")

#from sklearn.model_selection import GridSearchCV

#grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
#               'weights' : ['uniform','distance'],
#               'metric' : ['minkowski','euclidean','manhattan']}



KNN with default values is running 100 times for each of these cases
Using all attributes
Avg Recall Score:    91.78 %
Using mean attributes
Avg Recall Score:    92.28 %
Using standard error attributes
Avg Recall Score:    75.72 %
Using worst attributes
Avg Recall Score:    94.03 %


We conclude the ideal attributes to base our research on are the worst attributes, since they give us the highest average recall score, using k-nn with the default hyperparameters.

What are the best hyperparameters to use?
- k number of neighbours: small odd number
- m metric: minkowski, euclidean or manhattan
- w weights: uniform or distance

In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

recall_score_list = []
print("Avg Recall Score:")
for k in [1, 3, 5, 7, 9, 11, 13, 15, 17]:
    for m in ['minkowski','euclidean','manhattan']:
        for w in ['uniform','distance']:
            for a in ['auto', 'ball_tree', 'kd_tree', 'brute']:
                for i in range(1, 100):  
                    dataset = pd.read_csv("../data/Cancer_Data.csv")

                    if 'Unnamed: 32' in dataset.columns:
                        dataset.drop('Unnamed: 32', axis=1, inplace=True)
                    dataset['diagnosis'].replace(['B', 'M'],[0, 1], inplace=True) # B = 0, M = 1 

                    x = dataset.iloc[:, 22:32]
                    y = dataset['diagnosis']

                    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

                    sc_x = StandardScaler()
                    x_train = sc_x.fit_transform(x_train)
                    x_test = sc_x.transform(x_test)

                    classifier = KNeighborsClassifier(n_neighbors=k, metric=m, weights=w, algorithm=a)

                    classifier.fit(x_train, y_train)

                    y_pred = classifier.predict(x_test)

                    recall_score_list.append(recall_score(y_test, y_pred))
                    
                print("k =",k,"| m =", m,"| w =",w,"| a =",a, ":", round(100 * (sum(recall_score_list) / len(recall_score_list)), 2), "%")




Avg Recall Score:
k = 1 | m = minkowski | w = uniform | a = auto : 94.11 %
k = 1 | m = minkowski | w = uniform | a = ball_tree : 94.48 %
k = 1 | m = minkowski | w = uniform | a = kd_tree : 94.25 %
k = 1 | m = minkowski | w = uniform | a = brute : 94.01 %
k = 1 | m = minkowski | w = distance | a = auto : 94.15 %
k = 1 | m = minkowski | w = distance | a = ball_tree : 94.12 %
k = 1 | m = minkowski | w = distance | a = kd_tree : 94.14 %
k = 1 | m = minkowski | w = distance | a = brute : 94.03 %
k = 1 | m = euclidean | w = uniform | a = auto : 94.04 %
k = 1 | m = euclidean | w = uniform | a = ball_tree : 94.07 %
k = 1 | m = euclidean | w = uniform | a = kd_tree : 94.08 %
k = 1 | m = euclidean | w = uniform | a = brute : 94.13 %
k = 1 | m = euclidean | w = distance | a = auto : 94.09 %
k = 1 | m = euclidean | w = distance | a = ball_tree : 94.08 %
k = 1 | m = euclidean | w = distance | a = kd_tree : 94.11 %
k = 1 | m = euclidean | w = distance | a = brute : 94.06 %
k = 1 | m = manhattan | w 

As we can see, even with so many changes, the average recall score can entirely vary in a margin of less than 1%.

Using the configuration that got us the best results(k = 1 | m = minkowski | w = uniform | a = auto), we can analyse its other scores.

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

accuracy_score_list = []
precision_score_list = []
recall_score_list = []
f1_score_list = [] 
print("Avg Recall Score:")
for i in range(1, 100):  
    dataset = pd.read_csv("../data/Cancer_Data.csv")
    if 'Unnamed: 32' in dataset.columns:
        dataset.drop('Unnamed: 32', axis=1, inplace=True)
    dataset['diagnosis'].replace(['B', 'M'],[0, 1], inplace=True) # B = 0, M = 1 
    x = dataset.iloc[:, 22:32]
    y = dataset['diagnosis']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    sc_x = StandardScaler()
    x_train = sc_x.fit_transform(x_train)
    x_test = sc_x.transform(x_test)
    classifier = KNeighborsClassifier(n_neighbors=1, metric='minkowski', weights='uniform', algorithm='auto')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    accuracy_score_list.append(accuracy_score(y_test, y_pred))
    precision_score_list.append(precision_score(y_test, y_pred))
    recall_score_list.append(recall_score(y_test, y_pred))
    f1_score_list.append(f1_score(y_test, y_pred))

print("Avg Accuracy Score: ", round(100 * (sum(accuracy_score_list) / len(accuracy_score_list)), 2), "%")
print("Avg Precision Score:", round(100 * (sum(precision_score_list) / len(precision_score_list)), 2), "%")
print("Avg Recall Score:   ", round(100 * (sum(recall_score_list) / len(recall_score_list)), 2), "%")
print("Avg F1 Score:       ", round(100 * (sum(f1_score_list) / len(f1_score_list)), 2), "%")

Avg Recall Score:
Avg Accuracy Score:  95.9 %
Avg Precision Score: 94.81 %
Avg Recall Score:    94.13 %
Avg F1 Score:        94.4 %
