In [4]:
import numpy as np
import statistics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

KNN_Classifier Custom Class

In [5]:
class KNN_Classifier():
    """Minimal k-nearest-neighbours classifier built on plain Python loops.

    Data layout expected by every method:
      * each training row holds the feature values followed by the class
        label as its LAST element;
      * a test point holds feature values only.

    Supported ``distance_metric`` values: ``'euclidean'`` and ``'manhattan'``.
    """

    def __init__(self, distance_metric):
        """Store the name of the distance metric used for neighbour search.

        The name is validated when a distance is first computed, so the
        constructor itself never raises.
        """
        self.distance_metric = distance_metric

    def get_distance_metric(self, train_data_point, test_data_point):
        """Return the distance between one training row and one test point.

        Only the first ``len(train_data_point) - 1`` positions are compared,
        because the last element of the training row is its label.

        Raises:
            ValueError: if ``self.distance_metric`` is not a supported metric
                (previously this silently returned ``None``).
        """
        metric = self.distance_metric
        if metric not in ('euclidean', 'manhattan'):
            raise ValueError(
                "Unsupported distance_metric %r; expected 'euclidean' or 'manhattan'"
                % (metric,)
            )

        use_euclidean = metric == 'euclidean'
        dist = 0
        # Accumulate strictly left to right so results match the original
        # implementation bit for bit.
        for i in range(len(train_data_point) - 1):
            diff = train_data_point[i] - test_data_point[i]
            dist += diff ** 2 if use_euclidean else abs(diff)

        return np.sqrt(dist) if use_euclidean else dist

    def nearest_neighbors(self, X_train, test_data, k):
        """Return the ``k`` training rows closest to ``test_data``.

        Rows are returned nearest first; rows at equal distance keep their
        original relative order because the sort is stable.

        Raises:
            ValueError: if ``k`` is smaller than 1 or larger than the number
                of training rows.
        """
        n_train = len(X_train)
        if not 1 <= k <= n_train:
            raise ValueError(
                "k must be between 1 and the number of training rows (%d); got %r"
                % (n_train, k)
            )

        scored = [
            (training_data, self.get_distance_metric(training_data, test_data))
            for training_data in X_train
        ]
        scored.sort(key=lambda pair: pair[1])  # ascending distance

        return [row for row, _ in scored[:k]]

    def predict(self, X_train, test_data, k):
        """Predict the label of ``test_data`` by majority vote of its neighbours.

        The vote is taken over the last element (the label) of each of the
        ``k`` nearest training rows. On Python 3.8+ ``statistics.mode``
        resolves a tie by returning the first tied label encountered, which
        here is the one belonging to the closest neighbour.

        Raises:
            ValueError: propagated from ``nearest_neighbors`` /
                ``get_distance_metric`` for an invalid ``k`` or metric.
        """
        neighbors = self.nearest_neighbors(X_train, test_data, k)
        labels = [row[-1] for row in neighbors]
        return statistics.mode(labels)


In [6]:
# Load the dataset from CSV into a DataFrame.
# NOTE(review): absolute, environment-specific path (presumably a Colab upload
# location) — confirm or make configurable before running elsewhere.
df = pd.read_csv('/content/diabetes.csv')

In [7]:
# Plain-text dump of the frame; pandas elides the middle rows and wraps wide
# column sets across several blocks.
print(df)

     Index  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0      0.0            6      148             72             35        0  33.6   
1      1.0            1       85             66             29        0  26.6   
2      2.0            8      183             64              0        0  23.3   
3      3.0            1       89             66             23       94  28.1   
4      4.0            0      137             40             35      168  43.1   
..     ...          ...      ...            ...            ...      ...   ...   
763    NaN           10      101             76             48      180  32.9   
764    NaN            2      122             70             27        0  36.8   
765    NaN            5      121             72             23      112  26.2   
766    NaN            1      126             60              0        0  30.1   
767    NaN            1       93             70             31        0  30.4   

     DiabetesPedigreeFuncti

In [8]:
# Summarise column dtypes and non-null counts to spot sparsely populated columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Index                     10 non-null     float64
 1   Pregnancies               768 non-null    int64  
 2   Glucose                   768 non-null    int64  
 3   BloodPressure             768 non-null    int64  
 4   SkinThickness             768 non-null    int64  
 5   Insulin                   768 non-null    int64  
 6   BMI                       768 non-null    float64
 7   DiabetesPedigreeFunction  768 non-null    float64
 8   Age                       768 non-null    int64  
 9   Outcome                   768 non-null    int64  
 10  Outcome.1                 27 non-null     float64
dtypes: float64(4), int64(7)
memory usage: 66.1 KB


In [9]:
# Count the missing values in each column.
df.isna().sum()


Unnamed: 0,0
Index,758
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [10]:
# Discard the 'Index' and 'Outcome.1' columns; `columns=` already names the
# axis, so no separate `axis` argument is needed.
dataframe = df.drop(columns=['Index', 'Outcome.1'])


In [11]:
# Class balance of the target column.
df['Outcome'].value_counts()  # 0 indicates non-diabetic, 1 indicates diabetic


Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [12]:
# Target vector: the 'Outcome' column on its own.
Y = dataframe['Outcome']
# Feature matrix: every remaining column.
X = dataframe.drop(columns='Outcome')


In [13]:
# Sanity-check the dimensions of the feature matrix and the target vector.
print("X =", X.shape)
print("Y=", Y.shape)

X = (768, 8)
Y= (768,)


In [14]:
# Convert the pandas objects to NumPy arrays.
# np.asarray (rather than .to_numpy()) also accepts an ndarray, so re-running
# this cell after X and Y have already been converted no longer raises
# AttributeError.
X = np.asarray(X)
Y = np.asarray(Y)

print(X)
print(Y)


[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]
[1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0
 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1
 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1
 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0
 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0
 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0

In [15]:
# Hold out 20% of the rows for testing.
#   stratify=Y     -> both partitions keep the original ratio of Class 0 to Class 1
#   random_state=2 -> fixes the shuffle so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)


In [16]:
# Shapes of the training and test feature matrices.
print("Values of the X =", X_train.shape, X_test.shape)

Values of the X = (614, 8) (154, 8)


In [17]:
# Shapes of the training and test label vectors.
print("Values of the Y =", Y_train.shape, Y_test.shape)

Values of the Y = (614,) (154,)


In [18]:
# Append the labels as the last column of X_train: KNN_Classifier reads the
# final element of every training row as its class label.
# Slicing back to the feature columns first keeps this cell idempotent —
# re-running it does not stack a second label column onto the matrix — and
# avoids hard-coding the number of features.
n_features = X_test.shape[1]
X_train = np.column_stack((X_train[:, :n_features], Y_train))
print(X_train.shape)

# Test data is compared to the full training set to find neighbors.


(614, 9)


In [19]:
# Instantiate the custom classifier with the Euclidean metric.
classifier = KNN_Classifier(distance_metric='euclidean')

In [20]:
# Smoke-test the classifier on one held-out sample before scoring the whole
# test set.
first_sample = X_test[0]

print("Shape of X_train:", X_train.shape)
print("Shape of X_test[0]:", first_sample.shape)

prediction = classifier.predict(X_train, first_sample, k=5)

print("Test sample from X_test:", first_sample)
print("Predicted class:", prediction)


Shape of X_train: (614, 9)
Shape of X_test[0]: (8,)
Test sample from X_test: [  3.    106.     72.      0.      0.     25.8     0.207  27.   ]
Predicted class: 0.0


In [24]:
# Classify every held-out sample against the label-augmented training matrix.
y_pred = [
    classifier.predict(X_train, test_sample, k=5)
    for test_sample in X_test
]

print("Result =", y_pred)
print("Length of Y_pred =", len(y_pred))


Result = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0]
Length of Y_pred = 154


In [25]:
# Cast the collected predictions to an integer NumPy array, then score them
# against the held-out labels.
y_pred = np.asarray(y_pred, dtype=int)

accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7272727272727273


In [26]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Reference run with scikit-learn's KNN on the same deterministic split.
# The split is stored under separate *_sk names so the global X_train (which
# carries the label column needed by the custom classifier) is never
# overwritten; the original cell clobbered it and then had to re-insert the
# labels at a hard-coded column index to restore it.
X_train_sk, X_test_sk, Y_train_sk, Y_test_sk = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

k = 5

# Train Scikit-learn KNN model
sklearn_knn = KNeighborsClassifier(n_neighbors=k)
sklearn_knn.fit(X_train_sk, Y_train_sk)

print("X_train shape:", X_train_sk.shape)
print("Y_train shape:", Y_train_sk.shape)

# Make predictions using Scikit-learn KNN
sklearn_predictions = sklearn_knn.predict(X_test_sk)

# Calculate accuracy
sklearn_accuracy = accuracy_score(Y_test_sk, sklearn_predictions)
print("Scikit-learn KNN Accuracy:", sklearn_accuracy)

# Compare predictions between custom and Scikit-learn KNN.
# y_pred holds the custom classifier's integer predictions for the same test
# rows; with 0/1 labels the mean absolute difference is the share of samples
# on which the two models disagree.
differences = np.abs(y_pred - sklearn_predictions)
average_difference = np.mean(differences)

print("Average Difference Between Predictions:", average_difference)


X_train shape: (614, 8)
Y_train shape: (614,)
Scikit-learn KNN Accuracy: 0.7272727272727273
Average Difference Between Predictions: 0.0
