In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score,precision_score,recall_score,classification_report,confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
cancer_df = pd.read_csv("Prostate_Cancer.csv")

In [3]:
cancer_df.shape

(100, 10)

In [4]:
cancer_df.describe()

Unnamed: 0,id,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,16.85,18.23,96.78,702.88,0.10273,0.1267,0.19317,0.06469
std,29.011492,4.879094,5.192954,23.676089,319.710895,0.014642,0.061144,0.030785,0.008151
min,1.0,9.0,11.0,52.0,202.0,0.07,0.038,0.135,0.053
25%,25.75,12.0,14.0,82.5,476.75,0.0935,0.0805,0.172,0.059
50%,50.5,17.0,17.5,94.0,644.0,0.102,0.1185,0.19,0.063
75%,75.25,21.0,22.25,114.25,917.0,0.112,0.157,0.209,0.069
max,100.0,25.0,27.0,172.0,1878.0,0.143,0.345,0.304,0.097


In [5]:
cancer_df.head(5)

Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,1,M,23,12,151,954,0.143,0.278,0.242,0.079
1,2,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,3,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,4,M,14,16,78,386,0.07,0.284,0.26,0.097
4,5,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [6]:
# Removing the id column as it has unique values and doesnot determine the cancer.
cancer_df = cancer_df.iloc[:,1:]

In [7]:
cancer_df.head(5)

Unnamed: 0,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,M,23,12,151,954,0.143,0.278,0.242,0.079
1,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,M,14,16,78,386,0.07,0.284,0.26,0.097
4,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [8]:
cancer_df.isnull().sum()

diagnosis_result     0
radius               0
texture              0
perimeter            0
area                 0
smoothness           0
compactness          0
symmetry             0
fractal_dimension    0
dtype: int64

In [9]:
cancer_df['diagnosis_result'].value_counts()

M    62
B    38
Name: diagnosis_result, dtype: int64

In [10]:
cancer_df['diagnosis_result'].replace(('M', 'B'), (1, 0), inplace=True)

In [11]:
cancer_df['diagnosis_result'][:5]

0    1
1    0
2    1
3    1
4    1
Name: diagnosis_result, dtype: int64

In [12]:
def normalizefun(col):
    return ((col - min(col)) / (max(col) - min(col)))

In [13]:
cancer_df.iloc[:,1:].apply(normalizefun)

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,0.8750,0.0625,0.825000,0.448687,1.000000,0.781759,0.633136,0.590909
1,0.0000,0.1250,0.675000,0.670644,1.000000,0.133550,0.272189,0.090909
2,0.7500,1.0000,0.650000,0.597255,0.753425,0.397394,0.426036,0.159091
3,0.3125,0.3125,0.216667,0.109785,0.000000,0.801303,0.739645,1.000000
4,0.0000,0.5000,0.691667,0.653341,0.972603,0.309446,0.272189,0.136364
5,1.0000,0.8750,0.258333,0.164081,0.794521,0.429967,0.437870,0.522727
6,0.4375,0.9375,0.566667,0.500000,0.342466,0.231270,0.260355,0.090909
7,0.3750,0.4375,0.316667,0.224344,0.671233,0.413681,0.502959,0.500000
8,0.6250,0.8125,0.300000,0.189737,0.780822,0.504886,0.591716,0.477273
9,1.0000,0.0000,0.266667,0.163484,0.671233,0.657980,0.402367,0.659091


In [14]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
X = cancer_df.iloc[:,1:]
Y = cancer_df.iloc[:,0]


In [15]:
Y.head(5)

0    1
1    0
2    1
3    1
4    1
Name: diagnosis_result, dtype: int64

In [16]:
X.head(5)

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,23,12,151,954,0.143,0.278,0.242,0.079
1,9,13,133,1326,0.143,0.079,0.181,0.057
2,21,27,130,1203,0.125,0.16,0.207,0.06
3,14,16,78,386,0.07,0.284,0.26,0.097
4,9,19,135,1297,0.141,0.133,0.181,0.059


In [17]:
validation_size = 0.34
X_train, X_Validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=0)


In [18]:
X_train.shape

(66, 8)

In [19]:
X_Validation.shape

(34, 8)

In [20]:
Y_train.shape

(66,)

In [21]:
Y_validation.shape

(34,)

In [22]:
knn_model = KNeighborsClassifier(n_neighbors = 10, weights='uniform' )
knn_model.fit(X_train,Y_train)
pred = knn_model.predict(X_Validation)

In [23]:
pred.shape

(34,)

In [24]:
a = accuracy_score(Y_validation, pred)
print("Accuracy of the model is ", + a)

Accuracy of the model is  0.8529411764705882


In [25]:
 cm=confusion_matrix(Y_validation, pred)

In [26]:
cm

array([[ 9,  1],
       [ 4, 20]], dtype=int64)

In [27]:
knn_model = KNeighborsClassifier(n_neighbors = 9, weights='uniform' )
knn_model.fit(X_train,Y_train)
pred = knn_model.predict(X_Validation)

In [28]:
a = accuracy_score(Y_validation, pred)
print("Accuracy of the model is ", + a)

Accuracy of the model is  0.7941176470588235


In [29]:
 cm=confusion_matrix(Y_validation, pred)

In [30]:
cm

array([[ 6,  4],
       [ 3, 21]], dtype=int64)

In [31]:
knn_model = KNeighborsClassifier(n_neighbors = 12, weights='uniform' )
knn_model.fit(X_train,Y_train)
pred = knn_model.predict(X_Validation)
a = accuracy_score(Y_validation, pred)
print("Accuracy of the model is ", + a)

Accuracy of the model is  0.8235294117647058


In [32]:
cm=confusion_matrix(Y_validation, pred)
cm

array([[ 9,  1],
       [ 5, 19]], dtype=int64)

In [33]:
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]


# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

In [34]:
print(TPR,TNR,PPV,NPV,FPR,FNR,FDR,ACC)

0.7916666666666666 0.9 0.95 0.6428571428571429 0.1 0.20833333333333334 0.05 0.8235294117647058


In [35]:
print(classification_report(Y_validation, pred))
k_list = list(range(1,50))
cv_scores = []

# perform 10-fold cross validation
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = model_selection.cross_val_score(knn, X_train, Y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
print(pred)

              precision    recall  f1-score   support

           0       0.64      0.90      0.75        10
           1       0.95      0.79      0.86        24

   micro avg       0.82      0.82      0.82        34
   macro avg       0.80      0.85      0.81        34
weighted avg       0.86      0.82      0.83        34

[1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 0 1 0 0 1 0 1 1 1 0 0 0 1 0]


In [36]:
MSE = [1 - x for x in cv_scores]
best_k = k_list[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d." % best_k)

The optimal number of neighbors is 37.


In [37]:
knn_model = KNeighborsClassifier(n_neighbors = 37, weights='uniform' )
knn_model.fit(X_train,Y_train)
pred = knn_model.predict(X_Validation)
a = accuracy_score(Y_validation, pred)
print("Accuracy of the model is ", + a)

Accuracy of the model is  0.8235294117647058


In [38]:
# The accuracy of the model can be still imporved by selecting set of features or by using algorithms by decision trees and Random forests
