In [1]:
from random import seed, randrange
from csv import reader
from math import sqrt
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.stats import ttest_rel

# Custom functions for loading and preparing the dataset
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Custom KNN implementation
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

def k_nearest_neighbors(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_classification(train, row, num_neighbors)
        predictions.append(output)
    return predictions

def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Load and prepare the dataset
filename = 'car.data'  # Adjust the filename/path as necessary
dataset = load_csv(filename)
for i in range(len(dataset[0])):
    str_column_to_int(dataset, i)

# Evaluate custom KNN
seed(1)
n_folds = 10
num_neighbors = 5
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Custom KNN Scores:', scores)
print('Mean Accuracy:', sum(scores)/float(len(scores)))

# Evaluate scikit-learn KNN
X = np.array(dataset)[:, :-1]
y = np.array(dataset)[:, -1]
le = LabelEncoder()
y_encoded = le.fit_transform(y)
model = KNeighborsClassifier(n_neighbors=num_neighbors)
sklearn_scores = cross_val_score(model, X, y_encoded, cv=n_folds, scoring='accuracy') * 100
print('Scikit-learn KNN Scores:', sklearn_scores)
print('Mean Accuracy:', np.mean(sklearn_scores))

# Paired t-test
t_stat, p_value = ttest_rel(scores, sklearn_scores)
print('Paired t-test:', 't_stat=', t_stat, 'p_value=', p_value)

alpha = 0.05
print("Value of alpha:- ",alpha)
if p_value > alpha:
    print('Fails to reject H0 (Null Hypothesis)')
else:
    print('Reject H0 (Null Hypothesis)')

Custom KNN Scores: [83.72093023255815, 85.46511627906976, 86.04651162790698, 82.55813953488372, 88.95348837209302, 90.11627906976744, 81.97674418604652, 89.53488372093024, 87.79069767441861, 89.53488372093024]
Mean Accuracy: 86.56976744186048
Scikit-learn KNN Scores: [70.52023121 55.49132948 70.52023121 64.16184971 57.80346821 65.89595376
 78.03468208 75.72254335 77.3255814  73.8372093 ]
Mean Accuracy: 68.93130797150155
Paired t-test: t_stat= 6.512313774355645 p_value= 0.00010986717206592985
Value of alpha:-  0.05
Reject H0 (Null Hypothesis)


In [2]:
from random import seed, randrange
from csv import reader
from math import sqrt
from scipy.stats import ttest_rel
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import numpy as np

# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

# Locate the most similar neighbors for regression
def get_neighbors_regression(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

# Make a prediction with neighbors for regression
def predict_regression(train, test_row, num_neighbors):
    neighbors = get_neighbors_regression(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = sum(output_values) / float(len(output_values))
    return prediction

# kNN Regression Algorithm
def k_nearest_neighbors_regression(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_regression(train, row, num_neighbors)
        predictions.append(output)
    return predictions

# Mean Squared Error metric
def mean_squared_error_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return mean_error

# Evaluate an algorithm using a cross validation split for regression
def evaluate_algorithm_regression(dataset, algorithm, n_folds, num_neighbors):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, num_neighbors)
        actual = [row[-1] for row in fold]
        mse = mean_squared_error_metric(actual, predicted)
        scores.append(mse)
    return scores

# Main execution block
if __name__ == "__main__":
    seed(1)
    filename = 'car.data'  
    dataset = load_csv(filename)
    for i in range(len(dataset[0])):
        str_column_to_int(dataset, i)
    
    # Add a synthetic numerical target for regression
    for row in dataset:
        row.append(randrange(0, 101))  # Add a random value between 0 and 100
    
    # Classification
    n_folds = 10
    num_neighbors = 5
#     scores_classification = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
#     print('Custom KNN Classification Scores:', scores_classification)
#     print('Mean Accuracy:', sum(scores_classification)/float(len(scores_classification)), '%')
    
    # Regression
    scores_regression = evaluate_algorithm_regression(dataset, k_nearest_neighbors_regression, n_folds, num_neighbors)
    print('Custom KNN Regression Mean Squared Error:', scores_regression)
    print('Mean MSE:', sum(scores_regression)/float(len(scores_regression)))
    



Custom KNN Regression Mean Squared Error: [979.5388372093026, 1090.2379069767442, 1196.5306976744184, 1032.1423255813952, 984.7004651162787, 1038.9925581395348, 1116.1762790697676, 1044.5411627906979, 926.0406976744188, 1028.1920930232561]
Mean MSE: 1043.7093023255813
