<a href="https://colab.research.google.com/github/Mandar-Desurkar/Machine_Learning_Lab/blob/main/Q7_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
url = 'diabetes.csv'
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
data = pd.read_csv(url, header=None, names=column_names)

# Convert columns to numeric, if any conversion issues arise, replace them with NaN
numeric_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Replace zero values with NaN in these columns
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols] = data[cols].replace(0, np.nan)

# Drop rows with missing values
data.dropna(inplace=True)

# Split the dataset into features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Implementing KNN from scratch
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))

def get_neighbors(train, train_labels, test_row, num_neighbors):
    distances = []
    for i, train_row in enumerate(train):
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_labels.iloc[i], dist))
    distances.sort(key=lambda x: x[1])
    neighbors = [distances[i][0] for i in range(num_neighbors)]
    return neighbors

def predict_classification(train, train_labels, test_row, num_neighbors):
    neighbors = get_neighbors(train, train_labels, test_row, num_neighbors)
    prediction = max(set(neighbors), key=neighbors.count)
    return prediction

def knn_predict(train, train_labels, test, num_neighbors):
    predictions = []
    for row in test:
        output = predict_classification(train, train_labels, row, num_neighbors)
        predictions.append(output)
    return predictions

# Make predictions using the KNN algorithm
y_pred = knn_predict(X_train_scaled, y_train, X_test_scaled, num_neighbors=3)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Experiment with different values of k and distance metrics
for k in range(1, 10):
    y_pred = knn_predict(X_train_scaled, y_train, X_test_scaled, num_neighbors=k)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with k={k}: {accuracy}")

Accuracy: 0.7215189873417721
Accuracy with k=1: 0.7088607594936709
Accuracy with k=2: 0.7215189873417721
Accuracy with k=3: 0.7215189873417721
Accuracy with k=4: 0.7468354430379747
Accuracy with k=5: 0.7468354430379747
Accuracy with k=6: 0.7468354430379747
Accuracy with k=7: 0.7341772151898734
Accuracy with k=8: 0.7468354430379747
Accuracy with k=9: 0.759493670886076
