In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Import the usual libraries
import matplotlib.pyplot as plt # plotting utilities 
%matplotlib inline
import numpy as np 
import pandas as pd  # To read in the dataset we will use the Panda's library
# header=None tells pandas the file has no header row, so the column names are supplied here.
df = pd.read_csv(
    'iris.csv',
    header=None,
    names=[
        "sepal length[cm]",
        "sepal width[cm]",
        "petal length[cm]",
        "petal width",
        "label",
    ],
)

# Preview the first five rows to confirm the file was parsed as expected.
df.head()

In [None]:
# Encode the species names as integer class ids.
# The guard keeps this cell idempotent: once the column is numeric, mapping it
# again would look the integers up in a dict keyed by species name and turn
# every label into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'Iris-setosa': 0,
                                   'Iris-versicolor': 1,
                                   'Iris-virginica': 2})

In [None]:
# Feature columns fed to the classifier (everything except the label).
names = ["sepal length[cm]","sepal width[cm]","petal length[cm]", "petal width"]

# No test_size is given, so scikit-learn's default split ratio applies;
# random_state=0 makes the shuffle reproducible.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df[names],df['label'], random_state=0)

# The k-NN helpers below index plain arrays, so drop the pandas wrappers.
X_train=df_X_train.to_numpy()
X_test=df_X_test.to_numpy()
y_train=df_y_train.to_numpy()
y_test=df_y_test.to_numpy()

# Report the size of each partition.
print("The number of training examples: ", X_train.shape[0])
print("The number of test examples: ", X_test.shape[0])

In [None]:
def distance(A,B):
    """Return the Euclidean distance between two equally shaped arrays.

    The element-wise differences are squared, summed over every element and
    the square root of that total is returned.
    """
    squared_gaps = (A - B) ** 2
    total = squared_gaps.sum()
    return np.sqrt(total)

def get_closest_label(D,k):
    """Return the majority label among the k nearest neighbours.

    Parameters
    ----------
    D : sequence of (distance, label) pairs
        Candidate neighbours. Only the first ``k`` entries are consulted, so
        the caller is expected to pass them ordered nearest first.
    k : int
        Number of neighbours allowed to vote; must be at least 1.

    Returns
    -------
    The label that occurs most often among the first ``k`` entries of ``D``.
    When several labels share the highest count, the one that appears
    earliest in ``D`` (i.e. the closest one for a sorted ``D``) wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``D`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    k_neighbors = D[:k]
    if len(k_neighbors) == 0:
        raise ValueError("D must contain at least one neighbour")

    # Tally votes per label; any hashable label works, not only 0/1/2.
    counts = {}
    for neighbor in k_neighbors:
        label = neighbor[1]
        counts[label] = counts.get(label, 0) + 1

    # Walk the neighbours in their given order so that a tie is resolved in
    # favour of the earliest (nearest) candidate instead of an arbitrary class.
    best_count = max(counts.values())
    for neighbor in k_neighbors:
        if counts[neighbor[1]] == best_count:
            return neighbor[1]
    
    

In [None]:
def predict(X_train,X_test,y_train,k):
    """Classify every row of ``X_test`` with a brute-force k-NN vote.

    For each test row the Euclidean ``distance`` to every training row is
    computed, the (distance, label) pairs are sorted nearest first, and
    ``get_closest_label`` picks the label from the first ``k`` of them.

    Parameters
    ----------
    X_train : 2-D array, one training sample per row.
    X_test : 2-D array, one sample to classify per row.
    y_train : sequence indexable by row number; ``y_train[j]`` is the label
        of ``X_train[j]``.
    k : int, number of neighbours handed to ``get_closest_label``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array with one predicted label per test row (empty, but
        still 1-D, when ``X_test`` has no rows).
    """
    # Unpacking the shapes doubles as a check that both inputs are 2-D.
    train_rows, _ = X_train.shape
    test_rows, _ = X_test.shape

    # Collect into a list: np.append copies the whole array on every call,
    # which made the original accumulation quadratic in the number of rows.
    predicted_labels = []
    for i in range(test_rows):
        distance_array = [
            (distance(X_test[i], X_train[j]), y_train[j])
            for j in range(train_rows)
        ]
        # Stable sort on distance only, so equidistant neighbours keep
        # their training-set order.
        distance_array.sort(key=lambda pair: pair[0])
        predicted_labels.append(get_closest_label(distance_array, k))

    # float64 matches what the np.append-based accumulation used to return.
    return np.asarray(predicted_labels, dtype=np.float64)
    
    
def accuracy(actual_y,predicted_y):
    """Return the percentage of positions where both label sequences agree.

    The positions compared are ``0 .. np.size(actual_y) - 1``; the result is
    the number of matches divided by that size, multiplied by 100.
    """
    total = np.size(actual_y)
    hits = sum(1 for idx in range(total) if actual_y[idx] == predicted_y[idx])
    return (hits / total) * 100
    
    

In [None]:
# Score the plain k-NN classifier on the held-out split using k = 2.
result = predict(X_train, X_test, y_train, k=2)
print("accuracy for basic knn is:", accuracy(y_test, result), "%")

# Extension: inverse-distance weighted voting

In [None]:
def weighted_voting(D,k):
    """Return normalised inverse-distance weights for the first k neighbours.

    Parameters
    ----------
    D : sequence of (distance, label) pairs; only the distance of the first
        ``k`` entries is read.
    k : int, number of neighbours to weight.

    Returns
    -------
    numpy.ndarray
        float64 array of length ``k``. Without exact matches, entry ``i`` is
        ``1 / D[i][0]`` divided by the sum of all ``k`` inverse distances, so
        the weights add up to 1.

    Notes
    -----
    A neighbour at distance 0 is an exact match and must dominate the vote.
    When one or more of the ``k`` neighbours are at distance 0 they share the
    whole weight equally and every other neighbour gets 0. This replaces the
    earlier behaviour of zeroing such neighbours, which discarded the best
    evidence and produced NaN weights when all ``k`` distances were 0.
    """
    distances = np.array([D[i][0] for i in range(k)], dtype=np.float64)

    exact_match = distances == 0
    if exact_match.any():
        # Avoid 1/0 entirely: exact matches split the vote among themselves.
        weights = exact_match.astype(np.float64)
    else:
        weights = 1.0 / distances

    return weights / weights.sum()

In [None]:
def votes(D,w,k):
    """Return the class with the largest total weight among k neighbours.

    Parameters
    ----------
    D : sequence of (distance, label) pairs; only the label of the first
        ``k`` entries is read. Labels must be non-negative integer class ids.
    w : sequence of at least ``k`` weights; ``w[i]`` belongs to ``D[i]``.
    k : int, number of neighbours that vote.

    Returns
    -------
    Integer index (as returned by ``np.argmax``) of the class whose summed
    weight is largest; the lowest class id wins an exact tie.

    Notes
    -----
    The tally always has at least three slots, as before, and grows
    automatically when a neighbour carries a larger class id. Totals are
    accumulated in float64 so that nearly equal weights are not rounded into
    an artificial tie.
    """
    k_neighbor = D[:k]
    labels = [int(k_neighbor[i][1]) for i in range(k)]

    # Keep the historical minimum of three classes, but make room for any
    # larger class id that actually occurs among the neighbours.
    n_classes = max([3] + [label + 1 for label in labels])
    totals = np.zeros(n_classes, dtype=np.float64)
    for i, label in enumerate(labels):
        totals[label] += w[i]
    return np.argmax(totals)

In [None]:
def predict_extension(X_train,X_test,y_train,k):
    """Classify every row of ``X_test`` with distance-weighted k-NN.

    For each test row the Euclidean ``distance`` to every training row is
    computed and the (distance, label) pairs are sorted nearest first.
    ``weighted_voting`` then turns the first ``k`` distances into weights and
    ``votes`` returns the class with the largest weighted total.

    Parameters
    ----------
    X_train : 2-D array, one training sample per row.
    X_test : 2-D array, one sample to classify per row.
    y_train : sequence indexable by row number; ``y_train[j]`` is the label
        of ``X_train[j]``.
    k : int, number of neighbours that take part in the weighted vote.

    Returns
    -------
    numpy.ndarray
        1-D float64 array with one predicted label per test row (empty, but
        still 1-D, when ``X_test`` has no rows).
    """
    # Unpacking the shapes doubles as a check that both inputs are 2-D.
    test_rows, _ = X_test.shape
    train_rows, _ = X_train.shape

    # Collect into a list: np.append copies the whole array on every call,
    # which made the original accumulation quadratic in the number of rows.
    predicted_labels = []
    for i in range(test_rows):
        distance_array = [
            (distance(X_test[i], X_train[j]), y_train[j])
            for j in range(train_rows)
        ]
        # Stable sort on distance only, so equidistant neighbours keep
        # their training-set order.
        distance_array.sort(key=lambda pair: pair[0])
        weights = weighted_voting(distance_array, k)
        predicted_labels.append(votes(distance_array, weights, k))

    # float64 matches what the np.append-based accumulation used to return.
    return np.asarray(predicted_labels, dtype=np.float64)


# Score the inverse-distance weighted classifier on the same split, again with k = 2.
result = predict_extension(X_train, X_test, y_train, k=2)
print("accuracy for knn with weighted inverse is:", accuracy(y_test, result), "%")


# Dataset 2 - Breast cancer (scikit-learn `load_breast_cancer`)

In [None]:
# Pull the feature matrix and target vector out of the scikit-learn bunch.
breast_cancer_data = load_breast_cancer()
X_new, y_new = breast_cancer_data.data, breast_cancer_data.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
    X_new,
    y_new,
    test_size=0.3,
    random_state=31,
)

# Plain k-NN with k = 2.
result = predict(X_train_new, X_test_new, Y_train_new, k=2)
print("accuracy for basic knn is:", accuracy(Y_test_new, result), "%")

# Inverse-distance weighted k-NN with the same k.
result = predict_extension(X_train_new, X_test_new, Y_train_new, k=2)
print("accuracy for Extension KNN is:", accuracy(Y_test_new, result), "%")

