In [None]:
import numpy as np
import pandas as pd
import uuid
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report, make_scorer
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

# KNN Custom implementation

In [None]:
class KNN:
    """
    Brute-force k-nearest-neighbours classifier with majority voting.

    Labels must be non-negative integers (the vote is counted with
    ``np.bincount``). When several classes receive the same number of votes,
    the smallest label wins. When several training points are equally far from
    a query point, the one that appears first in the training data is preferred.
    """

    def euclidean(self, x1, x2):
        """
        Compute the Euclidean distance between two points.

        Parameters:
        x1 : numpy.ndarray, shape (n_features,)
            The first point.
        x2 : numpy.ndarray, shape (n_features,)
            The second point.

        Returns:
        (float) :The Euclidean distance between x1 and x2.
        """
        return np.sqrt(np.sum(np.square(x1 - x2)))

    def manhattan(self, x1, x2):
        """
        Compute the Manhattan distance between two points.

        Parameters:
        x1 : numpy.ndarray, shape (n_features,)
            The first point.
        x2 : numpy.ndarray, shape (n_features,)
            The second point.

        Returns:
        (float) : The Manhattan distance between x1 and x2.
        """
        return np.sum(np.abs(x1 - x2))
    
    def __init__(self, n_neighbors=5, distance_metric='euclidean'):
        """
        Constructor for KNN class.

        Parameters:
        n_neighbors (int): Number of nearest neighbors to consider for prediction.
        distance_metric (str): Distance metric to use for computing distances between samples.
                               Possible values are 'euclidean' and 'manhattan'. Default is 'euclidean'.
        """
        self.n_neighbors = n_neighbors
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """
        Method to fit the KNN model to training data.

        The data is only memorised; all the work happens in ``predict``.

        Parameters:
        X (array-like): Training data of shape (n_samples, n_features).
        y (array-like): Non-negative integer target values of shape (n_samples,).
        """
        # Convert to plain arrays so that pandas inputs are indexed by position.
        # A Series with a non-default index would otherwise be looked up by label
        # in predict(), and a DataFrame would be iterated by column name.
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def _distances_to_training_set(self, point):
        """Return the distance from `point` to every stored training row."""
        offsets = self.X_train - point
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(np.square(offsets), axis=1))
        return np.sum(np.abs(offsets), axis=1)

    def predict(self, X):
        """
        Method to predict target values for given test data using the KNN model.

        Parameters:
        X (array-like): Test data of shape (n_samples, n_features).

        Returns:
        y_pred (numpy.ndarray): Predicted target values of shape (n_samples,), as floats.

        Raises:
        ValueError: If the configured distance metric is neither 'euclidean' nor 'manhattan'.
        """
        # Validate once up front instead of once per test point.
        if self.distance_metric not in ('euclidean', 'manhattan'):
            raise ValueError("Invalid distance metric specified. Please choose either 'euclidean' or 'manhattan'.")

        X = np.asarray(X)
        y_pred = np.zeros(X.shape[0])
        for i, test_pt in enumerate(X):
            # One vectorised pass over the training matrix per test point.
            distances = self._distances_to_training_set(test_pt)

            # Stable sort makes the choice among equidistant neighbours deterministic.
            idx = np.argsort(distances, kind='stable')[:self.n_neighbors]

            # Get labels of k-nearest neighbors
            labels = self.y_train[idx]

            # Predict label with majority vote
            y_pred[i] = np.bincount(labels).argmax()
        
        return y_pred


# Implementation Correctness - using artificial dataset

In [None]:
# Implementation correctness check: load the small artificial 2-D dataset and
# build the single query point that the following cells classify and plot.
df = pd.read_csv('implementation_correctness_dataset.csv')

feature_cols = ['Feature 1', 'Feature 2']
X = df[feature_cols]
y = df['Class/Cluster']

# One hand-picked query point, kept as a one-row frame with the same columns as X.
testdata = dict(zip(feature_cols, ([1.4], [3])))
X_test = pd.DataFrame(testdata)
test_point = X_test.values[0]

In [None]:
# Implementation correctness report - Euclidean distance.
# Classify the query point with the custom KNN, then plot it together with the
# three nearest rows found by an independent distance computation.
knn = KNN(n_neighbors=3, distance_metric='euclidean')
knn.fit(X.values, y.values)
y_pred = knn.predict(X_test.values)
print('Using Euclidean distance, the test point is classified as class:', y_pred[0])

# Positions of the three rows of X closest to the test point (order not guaranteed).
squared_offsets = (X - test_point) ** 2
euclidean_to_test = np.sqrt(squared_offsets.sum(axis=1))
idx = np.argpartition(euclidean_to_test, 3)[:3]

feature_matrix = X.values
# Whole dataset, coloured by class
plt.scatter(feature_matrix[:, 0], feature_matrix[:, 1], c=y)
# The query point
plt.scatter(test_point[0], test_point[1], marker='x', color='black', label='Test Point')
# Hollow red rings around the three closest neighbours
plt.scatter(feature_matrix[idx, 0], feature_matrix[idx, 1], marker='o', color='red', facecolor='none', label='3 Closest Neighbors')
plt.title('Scatter plot of the test point, its 3 nearest neighbors from dataset - using Euclidean dist')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()

In [None]:
# Implementation correctness report - Manhattan distance.
# Same check as the Euclidean cell, using the L1 distance for both the
# classifier and the independently computed neighbours.
knn2 = KNN(n_neighbors=3, distance_metric='manhattan')
knn2.fit(X.values, y.values)
y_pred = knn2.predict(X_test.values)
print('Using Manhattan distance, the test point is classified as class:', y_pred[0])

# Positions of the three rows of X closest to the test point (order not guaranteed).
absolute_offsets = np.abs(X - test_point)
manhattan_to_test = absolute_offsets.sum(axis=1)
idx = np.argpartition(manhattan_to_test, 3)[:3]

feature_matrix = X.values
# Whole dataset, coloured by class
plt.scatter(feature_matrix[:, 0], feature_matrix[:, 1], c=y)
# The query point
plt.scatter(test_point[0], test_point[1], marker='x', color='black', label='Test Point')
# Hollow red rings around the three closest neighbours
plt.scatter(feature_matrix[idx, 0], feature_matrix[idx, 1], marker='o', color='red', facecolor='none', label='3 Closest Neighbors')
plt.title('Scatter plot of the test point, its 3 nearest neighbors from dataset - using manhattan dist')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()

# Data Load, Preprocessing, training and testing split

In [None]:
#Data Load and Cleanup, training and testing split
# `dataset` and `df` intentionally refer to the same frame, so the clean-up
# below is visible through both names.
dataset = df = pd.read_csv("./data.csv")
# 'Unnamed: 32' and 'id' are removed so that only the label and the features remain.
df.drop(columns=['Unnamed: 32', 'id'], inplace=True)
print(df['diagnosis'].value_counts())

# Encode the label: malignant ("M") -> 1, everything else -> 0.
df["diagnosis"] = [1 if i.strip() == "M" else 0 for i in df.diagnosis]
X = df.drop('diagnosis', axis=1, inplace=False)
y = df["diagnosis"]
scaler = StandardScaler()

# Fixed seed: without it every run draws a different split and none of the
# reported metrics can be reproduced. Stratifying keeps the class ratio printed
# above the same in the training and test parts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=SPLIT_SEED, stratify=y
)

# Standardised copy of the full feature matrix, used by the cross-validation
# cells. The scaler is fitted on all rows; X_train / X_test stay unscaled.
X_scaled = scaler.fit_transform(X)

# Vanilla KNN - Original Features for both distance metrics

In [None]:
#Vanilla KNN - using methods implemented from Scratch
# Evaluate the custom KNN (k=10) on the original features, once per distance
# metric. All scores are stored as percentages for the comparison charts.
distances=['euclidean','manhattan']
knnvalues=[]
knnprecision=[]
knnrecall=[]
knnf1=[]

for i, dtype in enumerate(distances):
    knn = KNN(n_neighbors=10, distance_metric=dtype)
    knn.fit(X_train.values, y_train.values)
    # Predict once and reuse: the brute-force prediction is the expensive step.
    y_pred = knn.predict(X_test.values)
    knnvalues.append(accuracy_score(y_test, y_pred)*100)
    knnprecision.append(precision_score(y_test.values, y_pred)*100)
    knnrecall.append(recall_score(y_test.values, y_pred)*100)
    knnf1.append(f1_score(y_test.values, y_pred)*100)
    
    print("confusion matrix for method:"+dtype)
    print(confusion_matrix(y_test, y_pred))
    print("\n")
    print("classification report for method:"+dtype)
    print(classification_report(y_test, y_pred))
    print("\n")

# KNN for AutoEncoder-based feature representations, i.e. 5% of the original number of features and 20% of the original number of features

In [None]:
#AutoEncoder Based Feature Representation
#Configuration for autoencoder
# Count every feature column. `df` holds the label plus the features at this
# point ('id' was already dropped), so only 'diagnosis' has to be excluded;
# skipping two leading columns would silently lose the first real feature.
df_features = df.drop(columns='diagnosis')
n_features = df_features.shape[1]

# Symmetric encoder/decoder widths around the bottleneck.
n_encoder1 = 500
n_encoder2 = 300
n_decoder2 = 300
n_decoder1 = 500

# Bottleneck widths: 5% and 20% of the feature count, rounded up.
n_bottleneck_1 = int(math.ceil(n_features * 0.05))
n_bottleneck_2 = int(math.ceil(n_features * 0.2))
print("Bottleneck size 1: ",n_bottleneck_1)
print("Bottleneck size 2: ",n_bottleneck_2)

In [None]:
#Configuration for autoencoder 5% of features
# Train an MLP to reproduce its own input; the narrow third hidden layer is the
# compressed representation handed to KNN.
reg5 = MLPRegressor(hidden_layer_sizes = (n_encoder1, n_encoder2, n_bottleneck_1, n_decoder2, n_decoder1),  
                    solver = 'adam',
                    activation='relu',
                    learning_rate_init = 0.001, 
                    max_iter = 1000,  
                    verbose = False)
reg5.fit(X_train.values, X_train.values)

# Position of the bottleneck inside hidden_layer_sizes, and its width.
bottleneck_pos = 2
hidden_layer = reg5.hidden_layer_sizes[bottleneck_pos]


def _bottleneck_codes(autoencoder, samples, n_layers):
    """Return the relu activations of a fitted MLP after its first `n_layers` layers."""
    activations = np.asarray(samples, dtype=float)
    for weights, bias in zip(autoencoder.coefs_[:n_layers], autoencoder.intercepts_[:n_layers]):
        activations = np.maximum(activations @ weights + bias, 0.0)
    return activations


# predict() returns the reconstruction (one column per original feature), so
# slicing it never yields the compressed features. Run the encoder half
# explicitly to obtain the bottleneck activations instead.
X_train_bottleneck_5 = _bottleneck_codes(reg5, X_train.values, bottleneck_pos + 1)
X_test_bottleneck_5 = _bottleneck_codes(reg5, X_test.values, bottleneck_pos + 1)

In [None]:
#KNN for autoEncoder 5% features- using methods implemented from Scratch
# Score the custom KNN (k=10) on the 5% autoencoder representation, once per
# distance metric; scores are stored as percentages.
distances = ['euclidean', 'manhattan']
ae5accuracy, ae5precision, ae5recall, ae5f1 = [], [], [], []

for i, dtype in enumerate(distances):
    knn = KNN(n_neighbors=10, distance_metric=dtype)
    knn.fit(X_train_bottleneck_5, y_train.values)
    y_pred = knn.predict(X_test_bottleneck_5)

    # Collect the four headline scores for this metric.
    ae5accuracy.append(100 * accuracy_score(y_test, y_pred))
    truth = y_test.values
    for store, scorer in ((ae5precision, precision_score),
                          (ae5recall, recall_score),
                          (ae5f1, f1_score)):
        store.append(scorer(truth, y_pred) * 100)

    # Detailed per-class breakdown.
    for heading, report in (("confusion matrix for method:", confusion_matrix),
                            ("classification report for method:", classification_report)):
        print(heading + dtype)
        print(report(y_test, y_pred))
        print("\n")

In [None]:
#Configuration for autoencoder 20% of features
# Train an MLP to reproduce its own input; the narrow third hidden layer is the
# compressed representation handed to KNN.
reg20 = MLPRegressor(hidden_layer_sizes = (n_encoder1, n_encoder2, n_bottleneck_2, n_decoder2, n_decoder1),  
                    solver = 'adam',
                    activation='relu',
                    learning_rate_init = 0.001, 
                    max_iter = 1000,  
                    verbose = False)
reg20.fit(X_train.values, X_train.values)

# Position of the bottleneck inside hidden_layer_sizes, and its width.
bottleneck_pos = 2
hidden_layer = reg20.hidden_layer_sizes[bottleneck_pos]


def _bottleneck_codes(autoencoder, samples, n_layers):
    """Return the relu activations of a fitted MLP after its first `n_layers` layers."""
    activations = np.asarray(samples, dtype=float)
    for weights, bias in zip(autoencoder.coefs_[:n_layers], autoencoder.intercepts_[:n_layers]):
        activations = np.maximum(activations @ weights + bias, 0.0)
    return activations


# predict() returns the reconstruction (one column per original feature), so
# slicing it never yields the compressed features. Run the encoder half
# explicitly to obtain the bottleneck activations instead.
X_train_bottleneck_20 = _bottleneck_codes(reg20, X_train.values, bottleneck_pos + 1)
X_test_bottleneck_20 = _bottleneck_codes(reg20, X_test.values, bottleneck_pos + 1)

In [None]:
#KNN for autoEncoder 20% features- using methods implemented from Scratch
# Score the custom KNN (k=10) on the 20% autoencoder representation, once per
# distance metric; scores are stored as percentages.
# The metric list is defined here so the cell does not depend on a value left
# behind by an earlier cell.
distances=['euclidean','manhattan']
ae20accuracy=[]
ae20precision=[]
ae20recall=[]
ae20f1=[]

for i, dtype in enumerate(distances):
    knn = KNN(n_neighbors=10, distance_metric=dtype)
    knn.fit(X_train_bottleneck_20, y_train.values)
    y_pred = knn.predict(X_test_bottleneck_20)
    ae20accuracy.append(accuracy_score(y_test.values, y_pred)*100)
    ae20precision.append(precision_score(y_test.values, y_pred)*100)
    ae20recall.append(recall_score(y_test.values, y_pred)*100)
    ae20f1.append(f1_score(y_test.values, y_pred)*100)
    print("confusion matrix for method:"+dtype)
    print(confusion_matrix(y_test, y_pred))
    print("\n")
    print("classification report for method:"+dtype)
    print(classification_report(y_test, y_pred))
    print("\n")

# KNN for SVD low and SVD high feature representation

In [None]:
#Singular Values
# Fit a 30-component truncated SVD on the full feature matrix and plot the
# singular values against their rank to see how quickly they decay.
svd = TruncatedSVD(n_components=30)
svd.fit(X)

singular_values = svd.singular_values_
plt.plot(singular_values)
plt.title("SVD Rank vs Singular Values")
plt.xlabel("Rank, k")
plt.ylabel("Singular Values $S_k$")
plt.show()

In [None]:
# KNN (k=10) on a rank-1 truncated SVD projection, once per distance metric.
# Scores are stored as percentages for the comparison charts.
svdlvalues=[]
svdlprecision=[]
svdlrecall=[]
svdlf1=[]

distances=['euclidean','manhattan']

#SVD Low
# The projection is learned from the training part only and then applied to the test part.
svd_low = TruncatedSVD(n_components=1)
X_train_reduced_svd_low = svd_low.fit_transform(X_train, y_train)
X_test_reduced_svd_low = svd_low.transform(X_test)
print("Computing KNN From Scratch for SVD-Low")
for i, (dtype) in enumerate(distances):
    knn = KNN(n_neighbors = 10, distance_metric = dtype)
    knn.fit(X_train_reduced_svd_low, y_train.values)
    # Predict once and reuse: the brute-force prediction is the expensive step.
    y_pred = knn.predict(X_test_reduced_svd_low)
    svdlvalues.append(accuracy_score(y_test, y_pred)*100)
    svdlprecision.append(precision_score(y_test.values, y_pred)*100)
    svdlrecall.append(recall_score(y_test.values, y_pred)*100)
    svdlf1.append(f1_score(y_test.values, y_pred)*100)
    print("confusion matrix for method:"+dtype)
    print(confusion_matrix(y_test, y_pred))
    print("\n")
    print("classification report for method:"+dtype)
    print(classification_report(y_test, y_pred))
    print("\n")

In [None]:
# KNN (k=10) on a rank-3 truncated SVD projection, once per distance metric.
# Scores are stored as percentages for the comparison charts.
svdhvalues=[]
svdhprecision=[]
svdhrecall=[]
svdhf1=[]

distances=['euclidean','manhattan']

print("Computing KNN From Scratch for SVD-High")
# The projection is learned from the training part only and then applied to the test part.
svd_high = TruncatedSVD(n_components=3)
X_train_reduced_svd_high = svd_high.fit_transform(X_train, y_train)
X_test_reduced_svd_high = svd_high.transform(X_test)
for i, (dtype) in enumerate(distances):
    knn = KNN(n_neighbors = 10, distance_metric = dtype)
    knn.fit(X_train_reduced_svd_high, y_train.values)
    # Predict once and reuse: the brute-force prediction is the expensive step.
    y_pred = knn.predict(X_test_reduced_svd_high)
    svdhvalues.append(accuracy_score(y_test, y_pred)*100)
    svdhprecision.append(precision_score(y_test.values, y_pred)*100)
    svdhrecall.append(recall_score(y_test.values, y_pred)*100)
    svdhf1.append(f1_score(y_test.values, y_pred)*100)
    print("confusion matrix for method:"+dtype)
    print(confusion_matrix(y_test, y_pred))
    print("\n")
    print("classification report for method:"+dtype)
    print(classification_report(y_test, y_pred))
    print("\n")

# Comparison of various models (Combination of distance metrics, feature representations)

In [None]:
# Grouped bar charts comparing the five feature representations for each
# distance metric, one chart per score (accuracy, precision, recall, F1).
Xvals = ['Euclidean','Manhattan']
X_axis = np.arange(len(Xvals))

# Legend label of each representation, in the order the bars are drawn.
series_labels = ['No Reduction', 'SVD-Low', 'SVD-High',
                 'MLP 5% of org features', 'MLP 20% of org features']

# (y-axis label, chart title, one score list per representation)
score_panels = [
    ("Accuracy", "Accuracy score for different dimensionality reduction methods, k=10",
     [knnvalues, svdlvalues, svdhvalues, ae5accuracy, ae20accuracy]),
    ("Precision", "Precision for different dimensionality reduction methods, k=10",
     [knnprecision, svdlprecision, svdhprecision, ae5precision, ae20precision]),
    ("Recall", "Recall for different dimensionality reduction methods, k=10",
     [knnrecall, svdlrecall, svdhrecall, ae5recall, ae20recall]),
    ("F1 Score", "F1 score for different dimensionality reduction methods, k=10",
     [knnf1, svdlf1, svdhf1, ae5f1, ae20f1]),
]

# Five bars of width 0.15 fit side by side inside one group. The earlier layout
# (width 0.4, step 0.1) drew the bars on top of each other, hiding most of them.
bar_width = 0.15

for y_label, chart_title, score_lists in score_panels:
    for position, (series_label, scores) in enumerate(zip(series_labels, score_lists)):
        # Centre the group of bars on the tick of its distance metric.
        offset = (position - (len(score_lists) - 1) / 2) * bar_width
        plt.bar(X_axis + offset, scores, bar_width, label = series_label)

    # Raw numbers behind the chart, in legend order.
    for scores in score_lists:
        print(scores)

    plt.xticks(X_axis, Xvals)
    plt.xlabel("Distances")
    plt.ylabel(y_label)
    plt.title(chart_title)
    # Scores are percentages; values below 70 fall outside the visible range.
    plt.ylim([70,100])
    plt.legend()
    plt.show()

# K-Fold Cross validation of various models

In [None]:
def k_fold_cross_validation(data, labels, k, knn):
    """
    Performs k-fold cross-validation on the KNN model.

    The samples are split, in their given order (no shuffling), into k
    contiguous folds whose sizes differ by at most one, so every sample is used
    for testing exactly once. For each fold the model is refitted on the
    remaining samples and scored on the held-out fold.

    Parameters:
    data (array-like): The data to be used for cross-validation, shape (n_samples, n_features).
    labels (array-like): The labels corresponding to the data, shape (n_samples,).
    k (int): The number of folds to use; must satisfy 2 <= k <= n_samples.
    knn (object): An instance of your KNN implementation (needs fit and predict).

    Returns:
    float: The average accuracy across all folds.
    float: The standard deviation of the accuracy across all folds.
    float: The average precision across all folds.
    float: The standard deviation of the precision across all folds.
    float: The average recall across all folds.
    float: The standard deviation of the recall across all folds.
    float: The average F1 score across all folds.
    float: The standard deviation of the F1 score across all folds.

    Raises:
    ValueError: If data and labels differ in length, or k is outside [2, n_samples].
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    n_samples = len(data)

    if len(labels) != n_samples:
        raise ValueError("data and labels must contain the same number of samples.")
    if not 2 <= k <= n_samples:
        raise ValueError("k must be between 2 and the number of samples.")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # array_split spreads the remainder over the first folds. A fixed fold size of
    # n_samples // k would leave the trailing n_samples % k samples untested.
    for test_idx in np.array_split(np.arange(n_samples), k):
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[test_idx] = False

        test_data = data[test_idx]
        test_labels = labels[test_idx]

        knn.fit(data[train_mask], labels[train_mask])
        predictions = knn.predict(test_data)

        accuracies.append(accuracy_score(test_labels, predictions))
        precisions.append(precision_score(test_labels, predictions))
        recalls.append(recall_score(test_labels, predictions))
        f1_scores.append(f1_score(test_labels, predictions))

    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)
    mean_precision = np.mean(precisions)
    std_precision = np.std(precisions)
    mean_recall = np.mean(recalls)
    std_recall = np.std(recalls)
    mean_f1_score = np.mean(f1_scores)
    std_f1_score = np.std(f1_scores)

    return mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1_score, std_f1_score


## KNN original features - K-Fold Cross-validation with hyperparameter tuning

In [None]:
#Vanilla KNN
# Sweep k = 1..30 for both distance metrics on the standardised original
# features, plot mean +/- std of each score over the folds, and start the
# "best configuration so far" record that the following cells keep updating.
n_folds = 10
k_vals = range(1, 31)
dist_metrics = ['euclidean', 'manhattan']

# Best configuration seen so far, ranked by mean F1.
maxF1=0
maxRecall=0
maxPrecision=0
maxk=0
maxDmetric='nil'
maxFeatureRep='nil'
recallOfMaxE=0
precisionOfMax=0
accuracyOfMax=0
for metric in dist_metrics:
    # initialize lists to store results
    accuracy_means = []
    accuracy_stds = []
    precision_means = []
    precision_stds = []
    recall_means = []
    recall_stds = []
    f1_score_means = []
    f1_score_stds = []
    
    for k in k_vals:
        # initialize KNN model
        knn = KNN(n_neighbors=k, distance_metric=metric)
        
        # Use the configured fold count rather than a second hard-coded 10.
        (mean_accuracy, std_accuracy, mean_precision, std_precision,
         mean_recall, std_recall, mean_f1_score, std_f1_score) = k_fold_cross_validation(X_scaled, y.values, n_folds, knn)

        # store mean and standard deviation of fold results
        accuracy_means.append(mean_accuracy)
        accuracy_stds.append(std_accuracy)
        precision_means.append(mean_precision)
        precision_stds.append(std_precision)
        recall_means.append(mean_recall)
        recall_stds.append(std_recall)
        f1_score_means.append(mean_f1_score)
        f1_score_stds.append(std_f1_score)
        
    # plot results: one error-bar figure per score
    for score_label, means, stds in (('Accuracy Score', accuracy_means, accuracy_stds),
                                     ('Precision Score', precision_means, precision_stds),
                                     ('Recall Score', recall_means, recall_stds),
                                     ('F1 Score', f1_score_means, f1_score_stds)):
        plt.figure(figsize=(8,8))
        plt.errorbar(k_vals, means, yerr=stds, capsize=10)
        plt.ylabel(score_label)
        plt.xlabel('K neighbors')
        plt.ylim(0.5, 1.2)
        plt.xticks(k_vals)
        plt.title(f'K-Fold Cross Validation - {metric}')
        plt.show()
    
    # First k with the highest mean F1 for this metric.
    best_idx = int(np.argmax(f1_score_means))
    if f1_score_means[best_idx] > maxF1:
        maxF1 = f1_score_means[best_idx]
        maxk = k_vals[best_idx]
        maxDmetric = metric
        maxFeatureRep = 'Vanilla'
        recallOfMaxE = recall_means[best_idx]
        precisionOfMax = precision_means[best_idx]
        accuracyOfMax = accuracy_means[best_idx]
    

## KNN SVD Low - K-Fold Cross-validation with hyperparameter tuning

In [None]:
#KNN with SVD low
# Sweep k = 1..30 for both distance metrics on a rank-1 SVD projection, plot
# mean +/- std of each score over the folds, and update the best-so-far record.
n_folds = 10
k_vals = range(1, 31)
dist_metrics = ['euclidean', 'manhattan']
# NOTE: the projection is fitted on the full dataset before the folds are
# formed, so held-out folds also influence the projection.
svd_low = TruncatedSVD(n_components=1)
X_svd_low = svd_low.fit_transform(X,y)

for metric in dist_metrics:
    # initialize lists to store results
    accuracy_means = []
    accuracy_stds = []
    precision_means = []
    precision_stds = []
    recall_means = []
    recall_stds = []
    f1_score_means = []
    f1_score_stds = []
    
    for k in k_vals:
        # initialize KNN model
        knn = KNN(n_neighbors=k, distance_metric=metric)
        
        # Use the configured fold count rather than a second hard-coded 10.
        (mean_accuracy, std_accuracy, mean_precision, std_precision,
         mean_recall, std_recall, mean_f1_score, std_f1_score) = k_fold_cross_validation(X_svd_low, y.values, n_folds, knn)

        # store mean and standard deviation of fold results
        accuracy_means.append(mean_accuracy)
        accuracy_stds.append(std_accuracy)
        precision_means.append(mean_precision)
        precision_stds.append(std_precision)
        recall_means.append(mean_recall)
        recall_stds.append(std_recall)
        f1_score_means.append(mean_f1_score)
        f1_score_stds.append(std_f1_score)
        
    # plot results: one error-bar figure per score
    for score_label, means, stds in (('Accuracy Score', accuracy_means, accuracy_stds),
                                     ('Precision Score', precision_means, precision_stds),
                                     ('Recall Score', recall_means, recall_stds),
                                     ('F1 Score', f1_score_means, f1_score_stds)):
        plt.figure(figsize=(8,8))
        plt.errorbar(k_vals, means, yerr=stds, capsize=10)
        plt.ylabel(score_label)
        plt.xlabel('K neighbors')
        plt.ylim(0.5, 1.2)
        plt.xticks(k_vals)
        plt.title(f'K-Fold Cross Validation - {metric}')
        plt.show()
    
    # First k with the highest mean F1 for this metric.
    best_idx = int(np.argmax(f1_score_means))
    if f1_score_means[best_idx] > maxF1:
        maxF1 = f1_score_means[best_idx]
        maxk = k_vals[best_idx]
        maxDmetric = metric
        maxFeatureRep = 'SVD LOW'
        recallOfMaxE = recall_means[best_idx]
        precisionOfMax = precision_means[best_idx]
        accuracyOfMax = accuracy_means[best_idx]



## KNN SVD High - K-Fold Cross-validation with hyperparameter tuning

In [None]:
#KNN with SVD High
# Sweep k = 1..30 for both distance metrics on a rank-3 SVD projection, plot
# mean +/- std of each score over the folds, and update the best-so-far record.
n_folds = 10
k_vals = range(1, 31)
dist_metrics = ['euclidean', 'manhattan']
# NOTE: the projection is fitted on the full dataset before the folds are
# formed, so held-out folds also influence the projection.
svd_high = TruncatedSVD(n_components=3)
X_svd_high = svd_high.fit_transform(X,y)

for metric in dist_metrics:
    # initialize lists to store results
    accuracy_means = []
    accuracy_stds = []
    precision_means = []
    precision_stds = []
    recall_means = []
    recall_stds = []
    f1_score_means = []
    f1_score_stds = []
    
    for k in k_vals:
        # initialize KNN model
        knn = KNN(n_neighbors=k, distance_metric=metric)
        
        # Use the configured fold count rather than a second hard-coded 10.
        (mean_accuracy, std_accuracy, mean_precision, std_precision,
         mean_recall, std_recall, mean_f1_score, std_f1_score) = k_fold_cross_validation(X_svd_high, y.values, n_folds, knn)

        # store mean and standard deviation of fold results
        accuracy_means.append(mean_accuracy)
        accuracy_stds.append(std_accuracy)
        precision_means.append(mean_precision)
        precision_stds.append(std_precision)
        recall_means.append(mean_recall)
        recall_stds.append(std_recall)
        f1_score_means.append(mean_f1_score)
        f1_score_stds.append(std_f1_score)
        
    # plot results: one error-bar figure per score
    for score_label, means, stds in (('Accuracy Score', accuracy_means, accuracy_stds),
                                     ('Precision Score', precision_means, precision_stds),
                                     ('Recall Score', recall_means, recall_stds),
                                     ('F1 Score', f1_score_means, f1_score_stds)):
        plt.figure(figsize=(8,8))
        plt.errorbar(k_vals, means, yerr=stds, capsize=10)
        plt.ylabel(score_label)
        plt.xlabel('K neighbors')
        plt.ylim(0.5, 1.2)
        plt.xticks(k_vals)
        plt.title(f'K-Fold Cross Validation - {metric}')
        plt.show()
    
    # First k with the highest mean F1 for this metric.
    best_idx = int(np.argmax(f1_score_means))
    if f1_score_means[best_idx] > maxF1:
        maxF1 = f1_score_means[best_idx]
        maxk = k_vals[best_idx]
        maxDmetric = metric
        maxFeatureRep = 'SVD High'
        recallOfMaxE = recall_means[best_idx]
        precisionOfMax = precision_means[best_idx]
        accuracyOfMax = accuracy_means[best_idx]





## KNN AutoEncoder 5% of original features - K-Fold Cross-validation with hyperparameter tuning

In [None]:
#KNN with AutoEncoder 5% of features
# Sweep k = 1..30 for both distance metrics on the 5% autoencoder
# representation, plot mean +/- std of each score over the folds, and update
# the best-so-far record.
n_folds = 10
k_vals = range(1, 31)
dist_metrics = ['euclidean', 'manhattan']
#Configuration for autoencoder 5% of features
# NOTE: the autoencoder is fitted on the full dataset before the folds are
# formed, so held-out folds also influence the representation.
reg5 = MLPRegressor(hidden_layer_sizes = (n_encoder1, n_encoder2, n_bottleneck_1, n_decoder2, n_decoder1),  
                    solver = 'adam',
                    activation='relu',
                    learning_rate_init = 0.001, 
                    max_iter = 1000,  
                    verbose = False)
reg5.fit(X.values, X.values)

# Position of the bottleneck inside hidden_layer_sizes, and its width.
bottleneck_pos = 2
hidden_layer = reg5.hidden_layer_sizes[bottleneck_pos]


def _bottleneck_codes(autoencoder, samples, n_layers):
    """Return the relu activations of a fitted MLP after its first `n_layers` layers."""
    activations = np.asarray(samples, dtype=float)
    for weights, bias in zip(autoencoder.coefs_[:n_layers], autoencoder.intercepts_[:n_layers]):
        activations = np.maximum(activations @ weights + bias, 0.0)
    return activations


# predict() returns the reconstruction (one column per original feature), so
# slicing it never yields the compressed features. Run the encoder half
# explicitly to obtain the bottleneck activations instead.
X_bottleneck_5 = _bottleneck_codes(reg5, X.values, bottleneck_pos + 1)


for metric in dist_metrics:
    # initialize lists to store results
    accuracy_means = []
    accuracy_stds = []
    precision_means = []
    precision_stds = []
    recall_means = []
    recall_stds = []
    f1_score_means = []
    f1_score_stds = []
    
    for k in k_vals:
        # initialize KNN model
        knn = KNN(n_neighbors=k, distance_metric=metric)
        
        # Use the configured fold count rather than a second hard-coded 10.
        (mean_accuracy, std_accuracy, mean_precision, std_precision,
         mean_recall, std_recall, mean_f1_score, std_f1_score) = k_fold_cross_validation(X_bottleneck_5, y.values, n_folds, knn)

        # store mean and standard deviation of fold results
        accuracy_means.append(mean_accuracy)
        accuracy_stds.append(std_accuracy)
        precision_means.append(mean_precision)
        precision_stds.append(std_precision)
        recall_means.append(mean_recall)
        recall_stds.append(std_recall)
        f1_score_means.append(mean_f1_score)
        f1_score_stds.append(std_f1_score)
        
    # plot results: one error-bar figure per score
    for score_label, means, stds in (('Accuracy Score', accuracy_means, accuracy_stds),
                                     ('Precision Score', precision_means, precision_stds),
                                     ('Recall Score', recall_means, recall_stds),
                                     ('F1 Score', f1_score_means, f1_score_stds)):
        plt.figure(figsize=(8,8))
        plt.errorbar(k_vals, means, yerr=stds, capsize=10)
        plt.ylabel(score_label)
        plt.xlabel('K neighbors')
        plt.ylim(0.5, 1.2)
        plt.xticks(k_vals)
        plt.title(f'K-Fold Cross Validation - {metric}')
        plt.show()
    
    # First k with the highest mean F1 for this metric.
    best_idx = int(np.argmax(f1_score_means))
    if f1_score_means[best_idx] > maxF1:
        maxF1 = f1_score_means[best_idx]
        maxk = k_vals[best_idx]
        maxDmetric = metric
        maxFeatureRep = 'AE 5%'
        recallOfMaxE = recall_means[best_idx]
        precisionOfMax = precision_means[best_idx]
        accuracyOfMax = accuracy_means[best_idx]




## KNN AutoEncoder 20% of original features - K-Fold Cross-validation with hyperparameter tuning

In [None]:
#KNN with AutoEncoder 20% of features
# Sweep k = 1..30 for both distance metrics on the 20% autoencoder
# representation, plot mean +/- std of each score over the folds, and update
# the best-so-far record.
n_folds = 10
k_vals = range(1, 31)
dist_metrics = ['euclidean', 'manhattan']
#Configuration for autoencoder 20% of features
# NOTE: the autoencoder is fitted on the full dataset before the folds are
# formed, so held-out folds also influence the representation.
reg20 = MLPRegressor(hidden_layer_sizes = (n_encoder1, n_encoder2, n_bottleneck_2, n_decoder2, n_decoder1),  
                    solver = 'adam',
                    activation='relu',
                    learning_rate_init = 0.001, 
                    max_iter = 1000,  
                    verbose = False)
reg20.fit(X.values, X.values)

# Position of the bottleneck inside hidden_layer_sizes, and its width.
bottleneck_pos = 2
hidden_layer = reg20.hidden_layer_sizes[bottleneck_pos]


def _bottleneck_codes(autoencoder, samples, n_layers):
    """Return the relu activations of a fitted MLP after its first `n_layers` layers."""
    activations = np.asarray(samples, dtype=float)
    for weights, bias in zip(autoencoder.coefs_[:n_layers], autoencoder.intercepts_[:n_layers]):
        activations = np.maximum(activations @ weights + bias, 0.0)
    return activations


# predict() returns the reconstruction (one column per original feature), so
# slicing it never yields the compressed features. Run the encoder half
# explicitly to obtain the bottleneck activations instead.
X_bottleneck_20 = _bottleneck_codes(reg20, X.values, bottleneck_pos + 1)


for metric in dist_metrics:
    # initialize lists to store results
    accuracy_means = []
    accuracy_stds = []
    precision_means = []
    precision_stds = []
    recall_means = []
    recall_stds = []
    f1_score_means = []
    f1_score_stds = []
    
    for k in k_vals:
        # initialize KNN model
        knn = KNN(n_neighbors=k, distance_metric=metric)
        
        # Use the configured fold count rather than a second hard-coded 10.
        (mean_accuracy, std_accuracy, mean_precision, std_precision,
         mean_recall, std_recall, mean_f1_score, std_f1_score) = k_fold_cross_validation(X_bottleneck_20, y.values, n_folds, knn)

        # store mean and standard deviation of fold results
        accuracy_means.append(mean_accuracy)
        accuracy_stds.append(std_accuracy)
        precision_means.append(mean_precision)
        precision_stds.append(std_precision)
        recall_means.append(mean_recall)
        recall_stds.append(std_recall)
        f1_score_means.append(mean_f1_score)
        f1_score_stds.append(std_f1_score)
        
    # plot results: one error-bar figure per score
    for score_label, means, stds in (('Accuracy Score', accuracy_means, accuracy_stds),
                                     ('Precision Score', precision_means, precision_stds),
                                     ('Recall Score', recall_means, recall_stds),
                                     ('F1 Score', f1_score_means, f1_score_stds)):
        plt.figure(figsize=(8,8))
        plt.errorbar(k_vals, means, yerr=stds, capsize=10)
        plt.ylabel(score_label)
        plt.xlabel('K neighbors')
        plt.ylim(0.5, 1.2)
        plt.xticks(k_vals)
        plt.title(f'K-Fold Cross Validation - {metric}')
        plt.show()
    
    # First k with the highest mean F1 for this metric.
    best_idx = int(np.argmax(f1_score_means))
    if f1_score_means[best_idx] > maxF1:
        maxF1 = f1_score_means[best_idx]
        maxk = k_vals[best_idx]
        maxDmetric = metric
        maxFeatureRep = 'AE 20%'
        recallOfMaxE = recall_means[best_idx]
        precisionOfMax = precision_means[best_idx]
        accuracyOfMax = accuracy_means[best_idx]



In [None]:
# Report the configuration with the highest mean F1 found by the
# cross-validation sweeps above.
print("Best performance detected for :")
print("F1 Score:")
print(maxF1)
# Each label is paired with its own value; the two were previously swapped
# (recall printed under "Precision" and precision under "Recall").
print("Precision: ")
print(precisionOfMax)
print("Recall: ")
print(recallOfMaxE)
print("Accuracy: ")
print(accuracyOfMax)
print("K: ")
print(maxk)
print("Distance: ")
print(maxDmetric)
print("Feature Rep: ")
print(maxFeatureRep)