In [25]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
from itertools import product
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import torch.nn.init as init
from sklearn.metrics import accuracy_score

In [26]:
# Font sizes shared by every matplotlib figure drawn in this notebook.
SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)                                 # default text size
# Axes title and x/y labels share one size. (The axes title used to be set
# twice, SMALL then MEDIUM; only the final MEDIUM value ever took effect.)
plt.rc('axes', titlesize=MEDIUM_SIZE, labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=MEDIUM_SIZE)                          # x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)                          # y tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)                          # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                         # figure title (suptitle)

In [27]:
def preprocess_monk(file_name):
    '''
    Load a MONK's problem file and one-hot encode its six categorical attributes.

    The encoder is given the fixed value domain of every attribute instead of
    inferring it from the file, so the column layout of the result is the same
    for every file (train or test), even if a value never occurs in one of them.

    Parameters:
    - file_name (str): Path of the space-delimited, header-less dataset file.

    Returns:
    - x (torch.Tensor): float32 tensor of shape (n_samples, 17) with the
      one-hot encoded attributes x1..x6, concatenated in that order.
    - y (torch.Tensor): float32 tensor of shape (n_samples, 1) with the targets.

    Raises:
    - ValueError: raised by OneHotEncoder when an attribute holds a value
      outside its declared domain.
    '''
    feature_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']

    # Value domain of each attribute in the MONK's problems
    # (3 + 3 + 2 + 3 + 4 + 2 = 17 one-hot columns).
    feature_domains = [
        [1, 2, 3],     # x1
        [1, 2, 3],     # x2
        [1, 2],        # x3
        [1, 2, 3],     # x4
        [1, 2, 3, 4],  # x5
        [1, 2],        # x6
    ]

    # The first column ('_') is a placeholder; presumably it absorbs the empty
    # field produced by a leading space on every line -- confirm against the data.
    df = pd.read_csv(file_name, delimiter=' ', header=None,
                     names=['_', 'target'] + feature_names + ['data_number'],
                     index_col=False)

    # A single encoder over all six columns yields the per-feature one-hot
    # blocks side by side, in feature order.
    encoder = OneHotEncoder(categories=feature_domains, sparse_output=False)
    input_one_hot = encoder.fit_transform(df[feature_names].to_numpy())

    # Convert to PyTorch tensors; the target becomes a column vector.
    x = torch.tensor(input_one_hot, dtype=torch.float32)
    y = torch.tensor(df['target'].to_numpy(), dtype=torch.float32).reshape(-1, 1)

    return x, y

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold
from itertools import product

def perform_grid_search_kfold(ks, weightss, ps, k_folds, X, y):
    """
    Grid-search K-Nearest Neighbors hyperparameters with stratified K-Fold
    cross-validation, selecting the combination with the highest mean accuracy.

    Every combination of the supplied hyperparameter values is scored with
    `cross_val_score` on a shuffled `StratifiedKFold` split (fixed
    random_state=42, so the folds are identical for every combination). Ties
    are resolved in favour of the combination encountered first.

    Parameters:
    - ks (iterable): Integer values to try for 'n_neighbors' in KNN.
    - weightss (iterable): Weight options (e.g., 'uniform', 'distance') to try in KNN.
    - ps (iterable): Values to try for the 'p' parameter in KNN (power parameter for the Minkowski metric).
    - k_folds (int): The number of folds for the stratified cross-validation.
    - X (array-like): Feature dataset for training the model.
    - y (array-like): Target variable dataset for training the model.

    Returns:
    - final_model (KNeighborsClassifier object): KNN classifier refit on all of (X, y) with the best hyperparameters.
    - (best_k, best_weight, best_p) (tuple): The best combination of hyperparameters found.
    - (accuracies_mean, accuracies_sd) (tuple): Lists of mean and standard deviation of the
      fold accuracies, one entry per combination, in grid iteration order.

    Raises:
    - ValueError: if the hyperparameter grid is empty, or if no combination
      yields a comparable (non-NaN) mean accuracy.

    The function prints the accuracy for each combination of hyperparameters during the grid search and
    finally the best hyperparameters with their corresponding accuracy.
    """
    # Materialise the grid first so an empty grid fails with a clear message
    # instead of an unbound-variable error after the loop.
    grid = list(product(ks, weightss, ps))
    if not grid:
        raise ValueError(
            "Hyperparameter grid is empty: ks, weightss and ps must each contain at least one value."
        )

    accuracies_mean = []
    accuracies_sd = []
    best_params = None
    # Start below any attainable accuracy so a legitimate score of 0.0 can still win.
    best_score = float('-inf')

    # Stratified K-Fold cross-validation (class proportions preserved per fold)
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Trying different combinations of hyperparams
    for k, weights, p in grid:
        model = KNeighborsClassifier(n_neighbors=k, weights=weights, p=p)
        scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
        mean_score = scores.mean()
        accuracies_mean.append(mean_score)
        accuracies_sd.append(scores.std())
        print(f"k={k}, weights={weights}, p={p}, Accuracy: {mean_score:.4f}")

        # Strict '>' keeps the first combination on ties; NaN never compares greater.
        if mean_score > best_score:
            best_params = (k, weights, p)
            best_score = mean_score

    if best_params is None:
        raise ValueError("No hyperparameter combination produced a valid accuracy score.")

    best_k, best_weight, best_p = best_params
    print(f"The best parameters are k={best_k}, weights={best_weight}, p={best_p} with an accuracy of {best_score:.4f}")

    # Train the final model with the best parameters
    final_model = KNeighborsClassifier(n_neighbors=best_k, weights=best_weight, p=best_p)
    final_model.fit(X, y)

    return final_model, (best_k, best_weight, best_p), (accuracies_mean, accuracies_sd)


# MONK 1

In [29]:
# Load the MONK-1 train and test files as one-hot encoded tensors.
X_train, y_train = preprocess_monk('monk_data/monks-1.train')
X_test, y_test = preprocess_monk('monk_data/monks-1.test')

# Shapes, one per line: train inputs, train targets, test inputs, test targets.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

torch.Size([124, 17])
torch.Size([124, 1])
torch.Size([432, 17])
torch.Size([432, 1])


In [30]:
# Grid Search: score every (k, weights, p) combination with 3-fold
# cross-validation on the training split and keep only the winning tuple.
ks = range(1, 31)
ps = range(1, 10)
weightss = ['uniform', 'distance']
_, best_hp, _ = perform_grid_search_kfold(
    ks=ks,
    weightss=weightss,
    ps=ps,
    k_folds=3,
    X=X_train,
    y=y_train.ravel(),
)

k=1, weights=uniform, p=1, Accuracy: 0.6616
k=1, weights=uniform, p=2, Accuracy: 0.6616
k=1, weights=uniform, p=3, Accuracy: 0.6616
k=1, weights=uniform, p=4, Accuracy: 0.6616
k=1, weights=uniform, p=5, Accuracy: 0.6616
k=1, weights=uniform, p=6, Accuracy: 0.6616
k=1, weights=uniform, p=7, Accuracy: 0.6616
k=1, weights=uniform, p=8, Accuracy: 0.6616
k=1, weights=uniform, p=9, Accuracy: 0.6616
k=1, weights=distance, p=1, Accuracy: 0.6616
k=1, weights=distance, p=2, Accuracy: 0.6616
k=1, weights=distance, p=3, Accuracy: 0.6616
k=1, weights=distance, p=4, Accuracy: 0.6616
k=1, weights=distance, p=5, Accuracy: 0.6616
k=1, weights=distance, p=6, Accuracy: 0.6616
k=1, weights=distance, p=7, Accuracy: 0.6616
k=1, weights=distance, p=8, Accuracy: 0.6616
k=1, weights=distance, p=9, Accuracy: 0.6616
k=2, weights=uniform, p=1, Accuracy: 0.7176
k=2, weights=uniform, p=2, Accuracy: 0.7176
k=2, weights=uniform, p=3, Accuracy: 0.7176
k=2, weights=uniform, p=4, Accuracy: 0.7176
k=2, weights=uniform, p

In [31]:
# Refit a KNN classifier with the best (k, weights, p) found by the grid
# search, then report accuracy on the training and test splits.
model = KNeighborsClassifier(n_neighbors=best_hp[0], weights=best_hp[1], p=best_hp[2])
model.fit(X_train, y_train.ravel())
# accuracy_score takes (y_true, y_pred); targets are flattened to 1-D to match
# the shape the model was fit on.
print(f'Training Accuracy = {accuracy_score(y_train.ravel(), model.predict(X_train))}')
print(f'Test Accuracy = {accuracy_score(y_test.ravel(), model.predict(X_test))}')

Training Accuracy = 1.0
Test Accuracy = 0.8078703703703703


# MONK 2

In [32]:
# Load the MONK-2 train and test files as one-hot encoded tensors.
X_train, y_train = preprocess_monk('monk_data/monks-2.train')
X_test, y_test = preprocess_monk('monk_data/monks-2.test')

# Shapes, one per line: train inputs, train targets, test inputs, test targets.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

torch.Size([169, 17])
torch.Size([169, 1])
torch.Size([432, 17])
torch.Size([432, 1])


In [33]:
# Grid Search: score every (k, weights, p) combination with 3-fold
# cross-validation on the training split and keep only the winning tuple.
ks = range(1, 31)
ps = range(1, 10)
weightss = ['uniform', 'distance']
_, best_hp, _ = perform_grid_search_kfold(
    ks=ks,
    weightss=weightss,
    ps=ps,
    k_folds=3,
    X=X_train,
    y=y_train.ravel(),
)

k=1, weights=uniform, p=1, Accuracy: 0.6276
k=1, weights=uniform, p=2, Accuracy: 0.6276
k=1, weights=uniform, p=3, Accuracy: 0.6276
k=1, weights=uniform, p=4, Accuracy: 0.6276
k=1, weights=uniform, p=5, Accuracy: 0.6276
k=1, weights=uniform, p=6, Accuracy: 0.6276
k=1, weights=uniform, p=7, Accuracy: 0.6276
k=1, weights=uniform, p=8, Accuracy: 0.6276
k=1, weights=uniform, p=9, Accuracy: 0.6276
k=1, weights=distance, p=1, Accuracy: 0.6276
k=1, weights=distance, p=2, Accuracy: 0.6276
k=1, weights=distance, p=3, Accuracy: 0.6276
k=1, weights=distance, p=4, Accuracy: 0.6276
k=1, weights=distance, p=5, Accuracy: 0.6276
k=1, weights=distance, p=6, Accuracy: 0.6276
k=1, weights=distance, p=7, Accuracy: 0.6276
k=1, weights=distance, p=8, Accuracy: 0.6276
k=1, weights=distance, p=9, Accuracy: 0.6276
k=2, weights=uniform, p=1, Accuracy: 0.5798
k=2, weights=uniform, p=2, Accuracy: 0.5798
k=2, weights=uniform, p=3, Accuracy: 0.5798
k=2, weights=uniform, p=4, Accuracy: 0.5798
k=2, weights=uniform, p

In [34]:
# Refit a KNN classifier with the best (k, weights, p) found by the grid
# search, then report accuracy on the training and test splits.
model = KNeighborsClassifier(n_neighbors=best_hp[0], weights=best_hp[1], p=best_hp[2])
model.fit(X_train, y_train.ravel())
# accuracy_score takes (y_true, y_pred); targets are flattened to 1-D to match
# the shape the model was fit on.
print(f'Training Accuracy = {accuracy_score(y_train.ravel(), model.predict(X_train))}')
print(f'Test Accuracy = {accuracy_score(y_test.ravel(), model.predict(X_test))}')

Training Accuracy = 1.0
Test Accuracy = 0.7893518518518519


# MONK 3

In [38]:
# Load the MONK-3 train and test files as one-hot encoded tensors.
X_train, y_train = preprocess_monk('monk_data/monks-3.train')
X_test, y_test = preprocess_monk('monk_data/monks-3.test')

# Shapes, one per line: train inputs, train targets, test inputs, test targets.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

torch.Size([122, 17])
torch.Size([122, 1])
torch.Size([432, 17])
torch.Size([432, 1])


In [39]:
# Grid Search: score every (k, weights, p) combination with 3-fold
# cross-validation on the training split and keep only the winning tuple.
ks = range(1, 31)
ps = range(1, 10)
weightss = ['uniform', 'distance']
_, best_hp, _ = perform_grid_search_kfold(
    ks=ks,
    weightss=weightss,
    ps=ps,
    k_folds=3,
    X=X_train,
    y=y_train.ravel(),
)

k=1, weights=uniform, p=1, Accuracy: 0.8362
k=1, weights=uniform, p=2, Accuracy: 0.8362
k=1, weights=uniform, p=3, Accuracy: 0.8362
k=1, weights=uniform, p=4, Accuracy: 0.8362
k=1, weights=uniform, p=5, Accuracy: 0.8362
k=1, weights=uniform, p=6, Accuracy: 0.8362
k=1, weights=uniform, p=7, Accuracy: 0.8362
k=1, weights=uniform, p=8, Accuracy: 0.8362
k=1, weights=uniform, p=9, Accuracy: 0.8362
k=1, weights=distance, p=1, Accuracy: 0.8362
k=1, weights=distance, p=2, Accuracy: 0.8362
k=1, weights=distance, p=3, Accuracy: 0.8362
k=1, weights=distance, p=4, Accuracy: 0.8362
k=1, weights=distance, p=5, Accuracy: 0.8362
k=1, weights=distance, p=6, Accuracy: 0.8362
k=1, weights=distance, p=7, Accuracy: 0.8362
k=1, weights=distance, p=8, Accuracy: 0.8362
k=1, weights=distance, p=9, Accuracy: 0.8362
k=2, weights=uniform, p=1, Accuracy: 0.8199
k=2, weights=uniform, p=2, Accuracy: 0.8199
k=2, weights=uniform, p=3, Accuracy: 0.8199
k=2, weights=uniform, p=4, Accuracy: 0.8199
k=2, weights=uniform, p

In [40]:
# Refit a KNN classifier with the best (k, weights, p) found by the grid
# search, then report accuracy on the training and test splits.
model = KNeighborsClassifier(n_neighbors=best_hp[0], weights=best_hp[1], p=best_hp[2])
model.fit(X_train, y_train.ravel())
# accuracy_score takes (y_true, y_pred); targets are flattened to 1-D to match
# the shape the model was fit on.
print(f'Training Accuracy = {accuracy_score(y_train.ravel(), model.predict(X_train))}')
print(f'Test Accuracy = {accuracy_score(y_test.ravel(), model.predict(X_test))}')

Training Accuracy = 1.0
Test Accuracy = 0.9097222222222222
