# Question 2: KNN
# [CM 7] Weighted KNN, Different NN Algorithms, accuracy, AUC and f-score

Importing all necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
import matplotlib.pyplot as plt

# For Iris Data
<b> Now that we have found the best value of k, we will try different weights and distance metrics to improve performance. For the Iris dataset, k=5.</b>

In [2]:
# Read the cleaned Iris CSV into a DataFrame and preview five randomly sampled rows.
df_iris = pd.read_csv(filepath_or_buffer='cleaned_data_iris.csv')
df_iris.sample(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
71,7.795561,2.643068,6.768611,2.424502,Iris-virginica
78,6.633922,2.86558,5.464554,2.119895,Iris-virginica
101,5.564197,2.771731,3.483588,1.074754,Iris-versicolor
94,5.743375,2.987622,4.092458,1.460286,Iris-versicolor
51,4.511538,2.242837,1.25385,0.165851,Iris-setosa


## Weighted KNN
<b> Uniform weight is selected by default. An accuracy of 96.2% was achieved.</b>

### 1. For weights='distance'

In [3]:
# Features are every column except the last one; the target is the last column.
x, y = df_iris.iloc[:, :-1], df_iris.iloc[:, -1]

In [4]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of a distance-weighted KNN classifier on the Iris data.
# n_neighbors is not passed, so the library default is used.
kf = KFold(n_splits=5)
accuracy_kfold_iris = []
for train, test in kf.split(x):
    # kf.split yields positional row indices, so features AND labels are sliced
    # with .iloc; label-based .loc only agrees when the index is exactly 0..n-1.
    x1_train, x1_test = x.iloc[train], x.iloc[test]
    y1_train, y1_test = y.iloc[train], y.iloc[test]
    kfold_knn = KNeighborsClassifier(weights='distance')
    kfold_knn.fit(x1_train, y1_train)
    # accuracy_score takes (y_true, y_pred) in that order.
    accuracy_kfold_iris.append(accuracy_score(y1_test, kfold_knn.predict(x1_test)))
# Mean accuracy over the five folds, reported as a percentage.
average_accuracy = np.mean(accuracy_kfold_iris)
print("Accuracy of Iris Dataset when weights='distance':", np.round(average_accuracy*100,2))

Accuracy of Iris Dataset when weights='distance': 97.14


<b> Thus, the accuracy of the model is improved by using weights='distance'.</b>

### 2. For metric = ['euclidean','manhattan','chebyshev']

In [5]:
# Compare distance metrics for the distance-weighted KNN using 5-fold cross-validation.
# metric_score[i] is the mean fold accuracy for metric[i].
kf = KFold(n_splits=5)
metric_score = []
metric = ['euclidean','manhattan','chebyshev']
for m in metric:
    accuracy_kfold_iris = []
    for train, test in kf.split(x):
        # kf.split yields positional row indices, so features AND labels are sliced
        # with .iloc; label-based .loc only agrees when the index is exactly 0..n-1.
        x1_train, x1_test = x.iloc[train], x.iloc[test]
        y1_train, y1_test = y.iloc[train], y.iloc[test]
        kfold_knn = KNeighborsClassifier(weights='distance', metric=m)
        kfold_knn.fit(x1_train, y1_train)
        # accuracy_score takes (y_true, y_pred) in that order.
        accuracy_kfold_iris.append(accuracy_score(y1_test, kfold_knn.predict(x1_test)))
    metric_score.append(np.mean(accuracy_kfold_iris))
print("The Accuracies for Iris Data metric 'euclidean','manhattan','chebyshev' are:", metric_score)

The Accuracies for Iris Data metric 'euclidean','manhattan','chebyshev' are: [0.9714285714285713, 0.9619047619047618, 0.9714285714285713]


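<b> The euclidean and chebyshev metrics tie for the highest accuracy of 97.14%; manhattan is slightly lower at 96.19%. The euclidean metric is used in the following steps.</b>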

### 3. For Algorithm = ['ball_tree', 'kd_tree', 'brute']

In [6]:
import time

# Compare neighbor-search algorithms: for each one, time a full 5-fold
# fit/predict cycle and report the mean fold accuracy.
kf = KFold(n_splits=5)
Algorithm = ['ball_tree', 'kd_tree', 'brute']
for a in Algorithm:
    accuracy_kfold_iris = []
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    for train, test in kf.split(x):
        # kf.split yields positional row indices, so features AND labels are sliced
        # with .iloc; label-based .loc only agrees when the index is exactly 0..n-1.
        x1_train, x1_test = x.iloc[train], x.iloc[test]
        y1_train, y1_test = y.iloc[train], y.iloc[test]
        kfold_knn = KNeighborsClassifier(weights='distance', metric='euclidean', algorithm=a)
        kfold_knn.fit(x1_train, y1_train)
        # accuracy_score takes (y_true, y_pred) in that order.
        accuracy_kfold_iris.append(accuracy_score(y1_test, kfold_knn.predict(x1_test)))
    stop_time = time.perf_counter()
    print(a,' ', 'execution time :', (stop_time-start_time), ' ', "accuracy:", np.mean(accuracy_kfold_iris))
   

ball_tree   execution time : 0.019993305206298828   accuracy: 0.9714285714285713
kd_tree   execution time : 0.016994714736938477   accuracy: 0.9714285714285713
brute   execution time : 0.01500558853149414   accuracy: 0.9714285714285713


<b> All three algorithms produce the same accuracy, but the measured execution time is lowest for brute.</b>


### Computing Accuracy, auc and F-score for the best parameters
<b> From the above experiments, the best-fit parameters are k = 5, weights = 'distance', metric = 'euclidean' and algorithm = 'brute'. These parameters are used for the final classification below. </b>

In [7]:
# Final Iris classifier, configured with the parameter values selected above.
knn_iris = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='euclidean',
    algorithm='brute',
)

In [8]:
accuracy_score = cross_val_score(knn_iris, x, y, cv=5)

In [9]:
y_pred = cross_val_predict(knn_iris, x, y, cv=5)

### Accuracy:

In [10]:
print('Accuracy of Iris Dataset is:', np.round( np.mean(accuracy_score)*100,2),'%')

Accuracy of Iris Dataset is: 97.14 %


### F-score:

In [11]:
from sklearn.metrics import f1_score

# Micro-averaged F1 between the true labels and the cross-validated predictions.
fsc = f1_score(y_true=y, y_pred=y_pred, average='micro')
print('F-score of Iris Dataset is:', fsc)

F-score of Iris Dataset is: 0.9714285714285714


### AUC:

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score

# One-hot encode the true and predicted labels so roc_auc_score can treat the
# multi-class problem as one binary indicator column per class.
encode = OneHotEncoder()
# Fit the encoder on the true labels only, then reuse the same fitted
# categories for the predictions. Re-fitting on y_pred would produce fewer or
# misaligned columns whenever some class is never predicted.
# .toarray() returns a plain ndarray; .todense() returns np.matrix, which
# recent scikit-learn versions reject.
y_encode = encode.fit_transform(np.array(y).reshape(-1,1)).toarray()
pred_encode = encode.transform(np.asarray(y_pred).reshape(-1,1)).toarray()
# NOTE(review): the scores passed here are hard 0/1 predictions rather than
# class probabilities — confirm this is the intended AUC definition.
auc_iris = roc_auc_score(y_encode,pred_encode)
print("AUC for Iris Dataset is:", auc_iris)


AUC for Iris Dataset is: 0.9785714285714286


# For Heart Data
<b> Now that we have found the best value of k, we will try different weights and distance metrics to improve performance. For the heart dataset, k=14.</b>

In [13]:
# Read the cleaned heart-disease CSV into a DataFrame.
df_heart = pd.read_csv(filepath_or_buffer='heart_disease_cleaned.csv')

## Weighted KNN

### 1. For weights=['uniform','distance']

In [14]:
# Rebind x / y to the heart data: all columns but the last are features,
# and the last column is the target.
x, y = df_heart.iloc[:, :-1], df_heart.iloc[:, -1]

In [15]:
# Mean 5-fold cross-validated accuracy of a 14-neighbor KNN for each weighting scheme.
weights = ['uniform','distance']
acc_weights = [
    cross_val_score(KNeighborsClassifier(n_neighbors=14, weights=scheme), x, y, cv=5).mean()
    for scheme in weights
]
print("Accuracy of Heart Disease Dataset when weights=['uniform','distance']:", acc_weights)

Accuracy of Heart Disease Dataset when weights=['uniform','distance']: [0.634469696969697, 0.628030303030303]


<b> Thus higher accuracy is achieved by using weights='uniform'. </b>

### 2. metric = ['euclidean','manhattan','chebyshev']

In [16]:
# Mean 5-fold cross-validated accuracy of a uniform-weighted, 14-neighbor KNN
# for each candidate distance metric.
metric = ['euclidean','manhattan','chebyshev']
acc_metric = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=14, weights='uniform', metric=distance_name),
        x, y, cv=5,
    ).mean()
    for distance_name in metric
]
print("The Accuracies for Heart Data metric 'euclidean','manhattan','chebyshev' are:", acc_metric)

The Accuracies for Heart Data metric 'euclidean','manhattan','chebyshev' are: [0.634469696969697, 0.6587121212121213, 0.5856060606060607]


<b> Higher accuracy is achieved by using manhattan distance. </b>

### 3. Algorithm = ['ball_tree', 'kd_tree', 'brute']

In [17]:
import time

# Compare neighbor-search algorithms on the heart data: time one 5-fold
# cross-validation per algorithm and record its mean accuracy.
alg_heart = ['ball_tree', 'kd_tree', 'brute']
alg_score = []  # mean CV accuracy per algorithm, in alg_heart order
alg_time = []   # elapsed seconds per algorithm, in alg_heart order
for a in alg_heart:
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    knn_heart = KNeighborsClassifier(n_neighbors=14, weights='uniform', metric='manhattan', algorithm=a)
    score_heart = cross_val_score(knn_heart,x,y,cv=5)
    stop_time = time.perf_counter()
    # Populate the result lists that were previously declared but left empty.
    alg_time.append(stop_time-start_time)
    alg_score.append(np.mean(score_heart))
    print(a,' ','execution time :',alg_time[-1],' ',"accuracy:",alg_score[-1])

ball_tree   execution time : 0.02507162094116211   accuracy: 0.6587121212121213
kd_tree   execution time : 0.022931337356567383   accuracy: 0.6587121212121213
brute   execution time : 0.01999521255493164   accuracy: 0.6587121212121213


<b> The accuracy remains the same, but the measured execution time is lowest for 'brute'. </b>

## Computing Accuracy, auc and F-score for the best parameters
<b> From the above experiments, the best-fit parameters are k = 14, weights = 'uniform', metric = 'manhattan' and algorithm = 'brute'. These parameters are used for the final classification below. </b>

In [18]:
# Final heart-disease classifier, configured with the parameter values selected above.
knn_heart = KNeighborsClassifier(
    n_neighbors=14,
    weights='uniform',
    metric='manhattan',
    algorithm='brute',
)

In [19]:
# Per-fold accuracies from 5-fold cross-validation of the final heart model.
accuracy_heart = cross_val_score(estimator=knn_heart, X=x, y=y, cv=5)

In [20]:
# Out-of-fold predictions for every row, produced by 5-fold cross-validation.
y_pred = cross_val_predict(estimator=knn_heart, X=x, y=y, cv=5)

### Accuracy:

In [21]:
# Report the mean fold accuracy as a percentage rounded to two decimals.
mean_accuracy_pct = np.round(np.mean(accuracy_heart)*100, 2)
print("Accuracy for Heart Disease Dataset using best fit parameters is:", mean_accuracy_pct, '%')

Accuracy for Heart Disease Dataset using best fit parameters is: 65.87 %


### F-Score:

In [22]:
from sklearn.metrics import f1_score

# Micro-averaged F1 between the true labels and the cross-validated predictions.
fsc = f1_score(y_true=y, y_pred=y_pred, average='micro')
print('F-score for Heart Disease Dataset is:', fsc)

F-score for Heart Disease Dataset is: 0.6585365853658537


### AUC:

In [23]:
# ROC AUC of the cross-validated heart-disease predictions against the true labels.
# NOTE(review): y_pred comes from cross_val_predict with its default method, so
# the scores are hard class labels rather than probabilities — confirm this is
# the intended AUC definition.
auc_heart = roc_auc_score(y_true=y, y_score=y_pred, average='weighted')
print('AUC for Heart Disease Dataset is:', auc_heart)

AUC for Heart Disease Dataset is: 0.6602870813397128
