In [7]:
# Reviewing the first few lines of one of the datasets to understand its structure
# Opening and reading the first few lines of the "hayes-roth.data" file to understand the data structure

file_path = 'hayes-roth.data'

with open(file_path, 'r') as file:
    lines = file.readlines()
    for line in lines[:10]:
        print(line.strip())


92,2,1,1,2,1
10,2,1,3,2,2
83,3,1,4,1,3
61,2,4,2,2,3
107,1,1,3,4,3
113,1,1,3,2,2
80,3,1,3,2,2
125,3,4,2,4,3
36,2,2,1,1,1
105,3,2,1,1,1


In [8]:
from random import seed, randrange
from csv import reader
from math import sqrt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from scipy.stats import ttest_rel

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append([float(x) for x in row])  # Convert all to float
    return dataset

# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])-1):  # Exclude the label column
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

# Normalize dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)-1):  # Exclude the label column
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

# Make a prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

# KNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_classification(train, row, num_neighbors)
        predictions.append(output)
    return predictions

# Main execution
seed(1)
# Load and prepare data
filename = 'hayes-roth.data'
dataset = load_csv(filename)
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

# Evaluate custom KNN
n_folds = 10
num_neighbors = 5
scores_custom_knn = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Custom KNN Scores:', scores_custom_knn)
print('Custom KNN Mean Accuracy: %.3f%%' % (sum(scores_custom_knn)/float(len(scores_custom_knn))))

# Prepare dataset for Scikit-learn KNN
X = [row[:-1] for row in dataset]  # Feature columns
y = [row[-1] for row in dataset]  # Label column
model = KNeighborsClassifier(n_neighbors=num_neighbors)
cv_scores_sklearn_knn = cross_val_score(model, X, y, cv=n_folds) #10 folds cross validation using sklearn
print('Scikit-learn KNN Mean Accuracy: %.3f%%' % (cv_scores_sklearn_knn.mean() * 100))



# Paired t-test
t_statistic, p_value = ttest_rel(scores_custom_knn, cv_scores_sklearn_knn * 100)
print('Paired t-test t-statistic:', t_statistic, 'P-value:', p_value)


alpha = 0.05

print("For the value of alpha: ->", alpha)
if p_value < alpha:
    print("Reject the H0 (Null Hypothesis)")
else:
    print("Reject H0 (Null Hypothesis)")

Custom KNN Scores: [23.076923076923077, 30.76923076923077, 23.076923076923077, 23.076923076923077, 7.6923076923076925, 46.15384615384615, 53.84615384615385, 53.84615384615385, 46.15384615384615, 38.46153846153847]
Custom KNN Mean Accuracy: 34.615%
Scikit-learn KNN Mean Accuracy: 29.451%
Paired t-test t-statistic: 0.7913609329832539 P-value: 0.44908551974363264
For the value of alpha: -> 0.05
Reject H0 (Null Hypothesis)


In [9]:
#Applying Regression Predective modeling


def predict_regression(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = sum(output_values) / float(len(output_values))
    return prediction
def k_nearest_neighbors_regression(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_regression(train, row, num_neighbors)
        predictions.append(output)
    return predictions
def evaluate_algorithm(dataset, algorithm, n_folds, num_neighbors):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, num_neighbors)
        actual = [row[-1] for row in fold]
        mse = mean_squared_error(actual, predicted)
        rmse = sqrt(mse)
        scores.append(rmse)
    return scores
rmse_scores = evaluate_algorithm(dataset, k_nearest_neighbors_regression, n_folds, num_neighbors)
print('Root Mean Squared Error scores:', rmse_scores)
print('Mean Root Mean Squared Error: %.3f' % (sum(rmse_scores)/float(len(rmse_scores))))


Root Mean Squared Error scores: [0.6770978795281658, 0.6516251872168423, 0.5974303950445415, 0.7211102550927979, 0.7961445558729607, 0.7274824872330393, 0.7961445558729607, 0.6300183147521007, 0.6950373537277857, 0.6793662204867575]
Mean Root Mean Squared Error: 0.697
