In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import LocalOutlierFactor

In [2]:
class KNN():
    """Brute-force k-nearest-neighbours classifier built on NumPy.

    Supported values for ``met``:

    * ``'man'`` - Manhattan (L1) distance
    * ``'cos'`` - cosine distance (1 - cosine similarity)
    * ``'mah'`` - Mahalanobis distance, using the covariance of the training data
    * anything else (default ``'euc'``) - Euclidean (L2) distance

    Labels may be passed either as a 1-D array or as a 2-D array whose first
    column holds the class label.
    """

    def __init__(self,k,met="euc"):
        """Store the neighbour count ``k`` and the distance metric key ``met``."""
        self.k = k
        self.metric = met

    def fit(self,X,y):
        """Memorise the training features ``X`` and labels ``y`` (no training step)."""
        self.X = X
        self.y = y

    @staticmethod
    def _labels(y):
        """Return the label vector of ``y``: ``y`` itself if 1-D, else its first column."""
        y = np.asarray(y)
        return y if y.ndim == 1 else y[:, 0]

    def _inverse_covariance(self):
        """Return the inverse covariance of the training features.

        The inverse is loop-invariant across query points, so it is cached and
        only recomputed when ``self.X`` has been rebound to a different array.
        """
        cache = getattr(self, "_mah_cache", None)
        if cache is None or cache[0] is not self.X:
            # atleast_2d keeps the single-feature case (0-d covariance) invertible.
            inv_cov = np.linalg.inv(np.atleast_2d(np.cov(self.X.T)))
            cache = (self.X, inv_cov)
            self._mah_cache = cache
        return cache[1]

    def distance(self,new_point):
        """Return the distance from ``new_point`` to every training row.

        Returns:
            1-D array with one distance per row of ``self.X``, computed with
            the metric selected at construction time.
        """
        if self.metric == 'man':
            return np.sum(np.abs(self.X - new_point), axis=1)
        if self.metric == 'cos':
            return 1 - np.dot(self.X, new_point) / (np.linalg.norm(self.X, axis=1) * np.linalg.norm(new_point))
        if self.metric == 'mah':
            inv_cov = self._inverse_covariance()
            diff = self.X - new_point
            return np.sqrt(np.sum(np.dot(diff, inv_cov) * diff, axis=1))
        return np.linalg.norm(self.X - new_point, axis=1)  # Default: Euclidean

    def decision(self,np_distance,weighted=False):
        """Pick a class label from the ``k`` nearest training points.

        Args:
            np_distance: distances from the query point to every training row,
                in the same order as the stored labels.
            weighted: if True, each neighbour votes with weight ``1 / distance``
                and the class with the largest total weight wins; otherwise a
                plain majority vote is used.

        Returns:
            The winning class label. Ties go to the smallest label, because
            ``np.unique`` returns sorted classes and ``np.argmax`` returns the
            first maximum.
        """
        np_distance = np.asarray(np_distance)
        # Sort once and reuse the indices for both the labels and the distances.
        k_neighbours_idx = np.argsort(np_distance)[:self.k]
        np_votes = self._labels(self.y)[k_neighbours_idx]
        if weighted:
            classes, inverse = np.unique(np_votes, return_inverse=True)
            # eps guards against division by zero for exact matches.
            weights = 1/(np_distance[k_neighbours_idx] + np.finfo(float).eps)
            # Total weight per class; a common positive scale factor would not
            # change the argmax, so the weights are not normalised.
            class_weight = np.bincount(inverse.ravel(), weights=weights)
            return classes[np.argmax(class_weight)]
        classes,class_freq = np.unique(np_votes,return_counts=True)
        return classes[np.argmax(class_freq)]

    def predict(self,new_points,weighted=False):
        """Predict a label for every row of ``new_points``.

        For each point: compute the distances to the training set, take the
        ``k`` nearest neighbours, let them vote, and store the decision.

        Args:
            new_points: iterable of query points (e.g. a 2-D array).
            weighted: forwarded to :meth:`decision`; defaults to the plain
                majority vote.

        Returns:
            1-D array with one predicted label per query point.
        """
        decisions = [
            self.decision(self.distance(new_point), weighted=weighted)
            for new_point in new_points
        ]
        # A single append onto an empty float array keeps the result dtype the
        # same as appending one label at a time, without the quadratic copies.
        return np.append(np.array([]), decisions)

    def evaluate(self,prediction,ground_truth):
        """Return the accuracy of ``prediction`` against ``ground_truth`` in percent."""
        truth = self._labels(ground_truth)
        return (np.sum(np.asarray(prediction) == truth)/truth.shape[0]) * 100


# Load the training features and labels as plain NumPy arrays.
X = pd.read_csv("./Data/Diabetes_Xtrain.csv").values
y = pd.read_csv("./Data/Diabetes_Ytrain.csv").values

# Keep only the rows LocalOutlierFactor labels 1 (inliers); the same filter is
# applied to the features and the labels so they stay aligned.
# NOTE(review): LOF is distance-based and runs on the unscaled features here -
# confirm that filtering before standardising is intended.
lof = LocalOutlierFactor()
outliers = lof.fit_predict(X)
X, y = X[outliers == 1], y[outliers == 1]

# Standardise the remaining rows; `scaler` is reused later to transform the
# test features with the statistics learned from the training data.
scaler = StandardScaler()
X = scaler.fit_transform(X)





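# Disabled experiment (kept for reference, not executed): repeatedly re-draw a
# random 90/10 train/validation split and refit a cosine-metric KNN until one
# split scores at least 96% validation accuracy, printing any score above 95%.
# NOTE(review): the split is unseeded and the loop only stops on a lucky draw,
# so the printed accuracy is optimistically biased and the loop may never end.
# k = 65
# accuracy = 0
# while accuracy < 96:
#     X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.9,shuffle=True)
#     classifier = KNN(k,"cos")
#     classifier.fit(X_train,y_train)
#     y_pred = classifier.predict(X_val)
#     accuracy = classifier.evaluate(y_pred,y_val)
#     if accuracy > 95:
#         print(accuracy)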

In [3]:
# Fit the final Euclidean KNN on the full filtered and standardised training set.
k = 45  # number of neighbours consulted for each prediction
classifier = KNN(k, met="euc")
classifier.fit(X, y)

In [4]:
# Transform the test features with the scaler fitted on the training data,
# then predict one label per test row.
X_test = scaler.transform(
    pd.read_csv("./Data/Diabetes_Xtest.csv").values
)
y_test = classifier.predict(X_test)

In [5]:
# Write the predictions as a single integer "Outcome" column, without the index.
df = pd.DataFrame(data=y_test, columns=["Outcome"], dtype=int)
df.to_csv("./Data/Diabetes_predictions.csv", index=False)