In [120]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
from itertools import product
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import torch.nn.init as init

In [121]:
# Font sizes shared by every figure produced in this notebook.
SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)            # default text size
# axes.titlesize is set exactly once (MEDIUM_SIZE is the value that was
# effectively in force), together with the x/y axis label size.
plt.rc('axes', titlesize=MEDIUM_SIZE, labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=MEDIUM_SIZE)     # x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)     # y tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)     # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)    # figure-level title (suptitle)

In [122]:
def preprocess_monk(file_name, categories='auto'):
    '''
    Load a MONK dataset file and one-hot encode its six input attributes.

    Parameters:
    - file_name (str): The file name of the dataset.
    - categories ('auto' or list of array-like): Forwarded unchanged to
      ``OneHotEncoder(categories=...)``. With the default ``'auto'`` the
      values of each attribute are inferred from this file alone, so two
      files only end up with the same column layout when each of them
      contains every attribute value. Pass an explicit list (one array of
      allowed values per attribute, in x1..x6 order) to pin the layout.

    Returns:
    - x (torch.Tensor): float32 tensor with the one-hot encoded inputs,
      one row per data row; the blocks for x1..x6 are laid out left to right.
    - y (torch.Tensor): float32 tensor of targets with shape (n_rows, 1).
    '''
    feature_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']

    # Columns are space separated. '_' and 'data_number' are read but never
    # used (presumably the empty field produced by a leading space and a
    # per-row identifier -- TODO confirm against the data files).
    df = pd.read_csv(file_name, delimiter=' ', header=None,
                     names=['_', 'target', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'data_number'],
                     index_col=False)

    # OneHotEncoder encodes every column of a 2-D input independently and
    # concatenates the results in column order, so a single call yields the
    # same matrix as encoding x1..x6 one at a time and stacking them.
    encoder = OneHotEncoder(categories=categories, sparse_output=False)
    input_one_hot = encoder.fit_transform(df[feature_names].to_numpy())

    # Convert to PyTorch tensors; the target becomes a column vector.
    x = torch.tensor(input_one_hot, dtype=torch.float32)
    y = torch.tensor(df['target'].to_numpy(), dtype=torch.float32).reshape(-1, 1)

    return x, y

In [123]:
from sklearn import svm
from sklearn.metrics import accuracy_score


def perform_grid_search_kfold(kernels, Cs, degrees, gammas, k_folds, x, y):
    """
    Perform grid search with k-fold cross-validation for Support Vector Classification hyperparameters.

    Every combination of (kernel, C, degree, gamma) is scored by its mean
    accuracy over the same stratified, shuffled folds (fixed seed 42).

    Parameters:
    - kernels (list): List of kernel types to search.
    - Cs (list): List of regularization parameters to search.
    - degrees (list): List of degrees for polynomial kernels.
    - gammas (list): List of gamma values for RBF/polynomial/sigmoid kernels.
    - k_folds (int): Number of folds for cross-validation.
    - x (array-like): Input data, one row per sample. Anything accepted by
      ``np.asarray`` works (e.g. a CPU torch tensor).
    - y (array-like): 1-D target data aligned with the rows of ``x``.

    Returns:
    - list: Best hyperparameters as ``[kernel, C, degree, gamma]``; an empty
      list when the grid contains no combination. Ties keep the combination
      that was evaluated first.
    """
    # Convert once so that index-array selection below behaves the same
    # whatever array-like type the caller passed in.
    x = np.asarray(x)
    y = np.asarray(y)

    combinations = list(product(kernels, Cs, degrees, gammas))
    num_combinations = len(combinations)
    print('Total number of grid search combinations explored:', num_combinations)

    # Nothing to evaluate: return the empty result instead of failing later
    # on statistics that were never computed.
    if num_combinations == 0:
        return []

    # The splitter is seeded, so the folds do not depend on the
    # hyperparameters; build them once and reuse them for every combination.
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
    folds = list(kf.split(x, y))

    # -inf guarantees that the first combination is always recorded, even if
    # its mean accuracy is 0.
    best_accuracy = float('-inf')
    best_std = 0.0
    best_hyperparams = []

    for counter, (kernel, C, degree, gamma) in enumerate(combinations, start=1):
        print(f'{counter}/{num_combinations} Hyperparams:', kernel, C, degree, gamma)

        accuracies = []

        # Perform K-fold cross-validation
        for train_indices, val_indices in folds:
            model = svm.SVC(kernel=kernel, C=C, degree=degree, gamma=gamma)
            model.fit(x[train_indices], y[train_indices])
            predictions = model.predict(x[val_indices])
            accuracies.append(accuracy_score(y[val_indices], predictions))

        mean_accuracy = np.mean(accuracies)
        std_accuracy = np.std(accuracies)
        print(f'Current Results: kernel={kernel}; C={C}; degree={degree}; gamma={gamma} --> '
              f'accuracy = {mean_accuracy:.4f}+{std_accuracy:.4}')

        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            # Remember the spread of the winning combination so the summary
            # does not report the std of whichever combination ran last.
            best_std = std_accuracy
            best_hyperparams = [kernel, C, degree, gamma]

    print(f'Best Hyperparameters: {best_hyperparams} with Accuracy = {best_accuracy:.4f}+-{best_std:.4}')
    return best_hyperparams

# MONK 1

In [124]:
# Read and one-hot encode the MONK-1 training and test files.
X_train, y_train = preprocess_monk(file_name='monk_data/monks-1.train')
X_test, y_test = preprocess_monk(file_name='monk_data/monks-1.test')

# Show the shapes in the order: train inputs, train targets, test inputs, test targets.
for _tensor in (X_train, y_train, X_test, y_test):
    print(_tensor.shape)

torch.Size([124, 17])
torch.Size([124, 1])
torch.Size([432, 17])
torch.Size([432, 1])


# Coarse Grid Searches (to find best kernel)
We perform two grid searches: one with the RBF kernel and one with the polynomial kernel.
In the RBF grid search we include the hyperparameter 'gamma', which is related to the standard deviation of the Gaussian kernel.
For the polynomial kernel we include the degree of the polynomial as a hyperparameter.
We compare the k-fold cross-validation results to choose the kernel most suitable for our problem.

In [125]:
'''
#first coarse grid search
kernels = ['rbf']
Cs = [0.01,0.1,1,10,100,1000]
degrees=[0] #only relevant for poly kernel
gammas = ['scale',0.01,0.1,1,10] #related to sigma in rbf
#Best Hyperparameters: ['rbf', 1000, 0, 0.01] with Accuracy = 0.8962+-0.1292
'''

# Active coarse grid: polynomial kernel. The RBF grid above is disabled by
# keeping it inside a string literal.
kernels = ['poly']
Cs = [0.01, 0.1, 1, 10, 100, 1000]
degrees = np.arange(3, 30)  # polynomial degrees 3..29
gammas = ['scale']          # single value, so gamma does not enlarge the grid
# Result recorded from an earlier run:
# Best Hyperparameters: ['poly', 10, 3, 'scale'] with Accuracy = 0.9119+-0.07267

best_hyperparams = perform_grid_search_kfold(
    kernels=kernels,
    Cs=Cs,
    degrees=degrees,
    gammas=gammas,
    k_folds=3,
    x=X_train,
    y=y_train.ravel(),
)

Total number of grid search combinations explored: 162
1/162 Hyperparams: poly 0.01 3 scale
Current Results: kernel=poly; C=0.01; degree=3; gamma=scale --> accuracy = 0.5633+0.1068
2/162 Hyperparams: poly 0.01 4 scale
Current Results: kernel=poly; C=0.01; degree=4; gamma=scale --> accuracy = 0.5712+0.118
3/162 Hyperparams: poly 0.01 5 scale
Current Results: kernel=poly; C=0.01; degree=5; gamma=scale --> accuracy = 0.5792+0.1292
4/162 Hyperparams: poly 0.01 6 scale
Current Results: kernel=poly; C=0.01; degree=6; gamma=scale --> accuracy = 0.5871+0.1404
5/162 Hyperparams: poly 0.01 7 scale
Current Results: kernel=poly; C=0.01; degree=7; gamma=scale --> accuracy = 0.5792+0.1292
6/162 Hyperparams: poly 0.01 8 scale
Current Results: kernel=poly; C=0.01; degree=8; gamma=scale --> accuracy = 0.5712+0.118
7/162 Hyperparams: poly 0.01 9 scale
Current Results: kernel=poly; C=0.01; degree=9; gamma=scale --> accuracy = 0.6444+0.06699
8/162 Hyperparams: poly 0.01 10 scale
Current Results: kernel=po

# Finer Grid Search
The best kernel is polynomial; now let's study the other hyperparameters in more detail.

In [126]:
# Finer grid for the polynomial kernel: C from 500 to 4950 in steps of 50,
# degrees 3..29.
kernels = ['poly']
Cs = np.arange(500, 5000, 50)
degrees = np.arange(3, 30)
gammas = ['scale']  # single value, so gamma does not enlarge the grid
# NOTE(review): the recorded best below lists C=10, which is not part of this
# grid (C starts at 500); it looks copied from the coarse search -- confirm.
# Best Hyperparameters: ['poly', 10, 3, 'scale'] with Accuracy = 0.9119+-0.07267

best_hyperparams = perform_grid_search_kfold(
    kernels=kernels,
    Cs=Cs,
    degrees=degrees,
    gammas=gammas,
    k_folds=3,
    x=X_train,
    y=y_train.ravel(),
)

Total number of grid search combinations explored: 2430
1/2430 Hyperparams: poly 500 3 scale
Current Results: kernel=poly; C=500; degree=3; gamma=scale --> accuracy = 0.9119+0.05903
2/2430 Hyperparams: poly 500 4 scale
Current Results: kernel=poly; C=500; degree=4; gamma=scale --> accuracy = 0.8471+0.02836
3/2430 Hyperparams: poly 500 5 scale
Current Results: kernel=poly; C=500; degree=5; gamma=scale --> accuracy = 0.8389+0.02081
4/2430 Hyperparams: poly 500 6 scale
Current Results: kernel=poly; C=500; degree=6; gamma=scale --> accuracy = 0.8066+0.01782
5/2430 Hyperparams: poly 500 7 scale
Current Results: kernel=poly; C=500; degree=7; gamma=scale --> accuracy = 0.7985+0.02187
6/2430 Hyperparams: poly 500 8 scale
Current Results: kernel=poly; C=500; degree=8; gamma=scale --> accuracy = 0.7824+0.0176
7/2430 Hyperparams: poly 500 9 scale
Current Results: kernel=poly; C=500; degree=9; gamma=scale --> accuracy = 0.7824+0.0176
8/2430 Hyperparams: poly 500 10 scale
Current Results: kernel=po

In [127]:
# Final MONK-1 model: refit on the whole training set with the hyperparameters
# recorded as best in the coarse polynomial search, then score on train/test.
kernel = 'poly'
C = 10
degree = 3
gamma = 'scale'

model = svm.SVC(kernel=kernel, C=C, degree=degree, gamma=gamma)
model.fit(X_train, y_train.ravel())

# accuracy_score takes (y_true, y_pred). The (N, 1) label tensors are
# flattened to 1-D arrays so they have the same shape as predict()'s output.
train_accuracy = accuracy_score(y_train.ravel().numpy(), model.predict(X_train))
test_accuracy = accuracy_score(y_test.ravel().numpy(), model.predict(X_test))
print(f'Training Accuracy = {train_accuracy}')
print(f'Test Accuracy = {test_accuracy}')

Training Accuracy = 1.0
Test Accuracy = 1.0


# MONK 2

In [128]:
# Read and one-hot encode the MONK-2 training and test files.
X_train, y_train = preprocess_monk(file_name='monk_data/monks-2.train')
X_test, y_test = preprocess_monk(file_name='monk_data/monks-2.test')

# Show the shapes in the order: train inputs, train targets, test inputs, test targets.
for _tensor in (X_train, y_train, X_test, y_test):
    print(_tensor.shape)

torch.Size([169, 17])
torch.Size([169, 1])
torch.Size([432, 17])
torch.Size([432, 1])


# Coarse Grid Searches (to find best kernel)
We perform two grid searches: one with the RBF kernel and one with the polynomial kernel.
In the RBF grid search we include the hyperparameter 'gamma', which is related to the standard deviation of the Gaussian kernel.
For the polynomial kernel we include the degree of the polynomial as a hyperparameter.
We compare the k-fold cross-validation results to choose the kernel most suitable for our problem.

In [129]:
# Active coarse grid: RBF kernel.
kernels = ['rbf']
Cs = [0.01, 0.1, 1, 10, 100, 1000]
degrees = [0]  # placeholder; degree is only relevant for the poly kernel
gammas = ['scale', 0.01, 0.1, 1, 10]  # RBF width parameter (related to sigma)
# Result recorded from an earlier run:
# Best Hyperparameters: ['rbf', 100, 0, 0.1] with Accuracy = 0.6983+-0.005169

# The polynomial grid below is disabled by keeping it inside a string literal.
'''
kernels = ['poly']
Cs = [0.01,0.1,1,10,100,1000]
degrees = np.arange(3,30,1)
gammas = ['scale'] # related to sigma in rbf
#Best Hyperparameters: ['poly', 10, 3, 'scale'] with Accuracy = 0.6982+-0.005169
'''

best_hyperparams = perform_grid_search_kfold(
    kernels=kernels,
    Cs=Cs,
    degrees=degrees,
    gammas=gammas,
    k_folds=3,
    x=X_train,
    y=y_train.ravel(),
)

Total number of grid search combinations explored: 30
1/30 Hyperparams: rbf 0.01 0 scale
Current Results: kernel=rbf; C=0.01; degree=0; gamma=scale --> accuracy = 0.6213+0.005169
2/30 Hyperparams: rbf 0.01 0 0.01
Current Results: kernel=rbf; C=0.01; degree=0; gamma=0.01 --> accuracy = 0.6213+0.005169
3/30 Hyperparams: rbf 0.01 0 0.1
Current Results: kernel=rbf; C=0.01; degree=0; gamma=0.1 --> accuracy = 0.6213+0.005169
4/30 Hyperparams: rbf 0.01 0 1
Current Results: kernel=rbf; C=0.01; degree=0; gamma=1 --> accuracy = 0.6213+0.005169
5/30 Hyperparams: rbf 0.01 0 10
Current Results: kernel=rbf; C=0.01; degree=0; gamma=10 --> accuracy = 0.6213+0.005169
6/30 Hyperparams: rbf 0.1 0 scale
Current Results: kernel=rbf; C=0.1; degree=0; gamma=scale --> accuracy = 0.6213+0.005169
7/30 Hyperparams: rbf 0.1 0 0.01


Current Results: kernel=rbf; C=0.1; degree=0; gamma=0.01 --> accuracy = 0.6213+0.005169
8/30 Hyperparams: rbf 0.1 0 0.1
Current Results: kernel=rbf; C=0.1; degree=0; gamma=0.1 --> accuracy = 0.6213+0.005169
9/30 Hyperparams: rbf 0.1 0 1
Current Results: kernel=rbf; C=0.1; degree=0; gamma=1 --> accuracy = 0.6213+0.005169
10/30 Hyperparams: rbf 0.1 0 10
Current Results: kernel=rbf; C=0.1; degree=0; gamma=10 --> accuracy = 0.6213+0.005169
11/30 Hyperparams: rbf 1 0 scale
Current Results: kernel=rbf; C=1; degree=0; gamma=scale --> accuracy = 0.6096+0.02615
12/30 Hyperparams: rbf 1 0 0.01
Current Results: kernel=rbf; C=1; degree=0; gamma=0.01 --> accuracy = 0.6213+0.005169
13/30 Hyperparams: rbf 1 0 0.1
Current Results: kernel=rbf; C=1; degree=0; gamma=0.1 --> accuracy = 0.6094+0.01494
14/30 Hyperparams: rbf 1 0 1
Current Results: kernel=rbf; C=1; degree=0; gamma=1 --> accuracy = 0.6271+0.0172
15/30 Hyperparams: rbf 1 0 10
Current Results: kernel=rbf; C=1; degree=0; gamma=10 --> accuracy = 

# Finer Grid Search
The best kernel is RBF (narrowly ahead of the polynomial kernel in the coarse search); now let's study its other hyperparameters in more detail.

In [130]:
# Finer grid for the RBF kernel: C from 500 to 4500 in steps of 500 and a
# denser set of gamma values.
kernels = ['rbf']
Cs = np.arange(500, 5000, 500)
degrees = [0]  # placeholder; not varied in this RBF-only search
gammas = ['scale', 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]  # RBF width parameter
# Result recorded from an earlier run:
# Best Hyperparameters: ['rbf', 500, 0, 0.1] with Accuracy = 0.6983+-0.03516

best_hyperparams = perform_grid_search_kfold(
    kernels=kernels,
    Cs=Cs,
    degrees=degrees,
    gammas=gammas,
    k_folds=3,
    x=X_train,
    y=y_train.ravel(),
)

Total number of grid search combinations explored: 72
1/72 Hyperparams: rbf 500 0 scale
Current Results: kernel=rbf; C=500; degree=0; gamma=scale --> accuracy = 0.6447+0.02791
2/72 Hyperparams: rbf 500 0 0.1


Current Results: kernel=rbf; C=500; degree=0; gamma=0.1 --> accuracy = 0.6983+0.02404
3/72 Hyperparams: rbf 500 0 0.2
Current Results: kernel=rbf; C=500; degree=0; gamma=0.2 --> accuracy = 0.6685+0.01831
4/72 Hyperparams: rbf 500 0 0.3
Current Results: kernel=rbf; C=500; degree=0; gamma=0.3 --> accuracy = 0.6390+0.01022
5/72 Hyperparams: rbf 500 0 0.4
Current Results: kernel=rbf; C=500; degree=0; gamma=0.4 --> accuracy = 0.6449+0.01488
6/72 Hyperparams: rbf 500 0 0.5
Current Results: kernel=rbf; C=500; degree=0; gamma=0.5 --> accuracy = 0.6389+0.01964
7/72 Hyperparams: rbf 500 0 0.6
Current Results: kernel=rbf; C=500; degree=0; gamma=0.6 --> accuracy = 0.6448+0.03958
8/72 Hyperparams: rbf 500 0 0.7
Current Results: kernel=rbf; C=500; degree=0; gamma=0.7 --> accuracy = 0.6389+0.03516
9/72 Hyperparams: rbf 1000 0 scale
Current Results: kernel=rbf; C=1000; degree=0; gamma=scale --> accuracy = 0.6447+0.02791
10/72 Hyperparams: rbf 1000 0 0.1
Current Results: kernel=rbf; C=1000; degree=0; g

In [131]:
# Final MONK-2 model: refit on the whole training set with the hyperparameters
# recorded as best in the RBF searches, then score on train/test.
kernel = 'rbf'
C = 500
gamma = 0.1

model = svm.SVC(kernel=kernel, C=C, gamma=gamma)
model.fit(X_train, y_train.ravel())

# accuracy_score takes (y_true, y_pred). The (N, 1) label tensors are
# flattened to 1-D arrays so they have the same shape as predict()'s output.
train_accuracy = accuracy_score(y_train.ravel().numpy(), model.predict(X_train))
test_accuracy = accuracy_score(y_test.ravel().numpy(), model.predict(X_test))
print(f'Training Accuracy = {train_accuracy}')
print(f'Test Accuracy = {test_accuracy}')

Training Accuracy = 1.0
Test Accuracy = 0.8148148148148148


# MONK 3

In [132]:
# Read and one-hot encode the MONK-3 training and test files.
X_train, y_train = preprocess_monk(file_name='monk_data/monks-3.train')
X_test, y_test = preprocess_monk(file_name='monk_data/monks-3.test')

# Show the shapes in the order: train inputs, train targets, test inputs, test targets.
for _tensor in (X_train, y_train, X_test, y_test):
    print(_tensor.shape)

torch.Size([122, 17])
torch.Size([122, 1])
torch.Size([432, 17])
torch.Size([432, 1])


# Coarse Grid Searches (to find best kernel)
We perform two grid searches: one with the RBF kernel and one with the polynomial kernel.
In the RBF grid search we include the hyperparameter 'gamma', which is related to the standard deviation of the Gaussian kernel.
For the polynomial kernel we include the degree of the polynomial as a hyperparameter.
We compare the k-fold cross-validation results to choose the kernel most suitable for our problem.

In [133]:
'''
#first coarse grid search
kernels = ['rbf']
Cs = [0.01,0.1,1,10,100,1000]
degrees=[0] #only relevant for poly kernel
gammas = ['scale',0.01,0.1,1,10] #related to sigma in rbf
#Best Hyperparameters: ['rbf', 10, 0, 0.1] with Accuracy = 0.9348+-0.005749
'''

# Active coarse grid: polynomial kernel. The RBF grid above is disabled by
# keeping it inside a string literal.
kernels = ['poly']
Cs = [0.01, 0.1, 1, 10, 100, 1000]
degrees = np.arange(3, 30)  # polynomial degrees 3..29
gammas = ['scale']          # single value, so gamma does not enlarge the grid
# Result recorded from an earlier run:
# Best Hyperparameters: ['poly', 0.1, 5, 'scale'] with Accuracy = 0.9429+-0.04706

best_hyperparams = perform_grid_search_kfold(
    kernels=kernels,
    Cs=Cs,
    degrees=degrees,
    gammas=gammas,
    k_folds=3,
    x=X_train,
    y=y_train.ravel(),
)

Total number of grid search combinations explored: 162
1/162 Hyperparams: poly 0.01 3 scale
Current Results: kernel=poly; C=0.01; degree=3; gamma=scale --> accuracy = 0.5081+0.005749
2/162 Hyperparams: poly 0.01 4 scale
Current Results: kernel=poly; C=0.01; degree=4; gamma=scale --> accuracy = 0.5081+0.005749
3/162 Hyperparams: poly 0.01 5 scale
Current Results: kernel=poly; C=0.01; degree=5; gamma=scale --> accuracy = 0.5081+0.005749
4/162 Hyperparams: poly 0.01 6 scale
Current Results: kernel=poly; C=0.01; degree=6; gamma=scale --> accuracy = 0.5081+0.005749
5/162 Hyperparams: poly 0.01 7 scale
Current Results: kernel=poly; C=0.01; degree=7; gamma=scale --> accuracy = 0.5081+0.005749
6/162 Hyperparams: poly 0.01 8 scale
Current Results: kernel=poly; C=0.01; degree=8; gamma=scale --> accuracy = 0.5081+0.005749
7/162 Hyperparams: poly 0.01 9 scale
Current Results: kernel=poly; C=0.01; degree=9; gamma=scale --> accuracy = 0.5817+0.02453
8/162 Hyperparams: poly 0.01 10 scale
Current Resu

# Finer Grid Search
The best kernel is polynomial; now let's study the other hyperparameters in more detail.

In [134]:
# Finer grid for the polynomial kernel around the coarse optimum.
kernels = ['poly']
# C = 0.01, 0.02, ..., 0.19 (19 values). Built from an integer range and then
# scaled: np.arange with a fractional step accumulates rounding error and its
# length is not guaranteed, whereas k / 100 is the closest float to each value.
Cs = np.arange(1, 20) / 100
degrees = np.arange(3, 10, 1)  # polynomial degrees 3..9
gammas = ['scale']             # single value, so gamma does not enlarge the grid
# Result recorded from an earlier run:
# Best Hyperparameters: ['poly', 0.08, 5, 'scale'] with Accuracy = 0.9429+-0.01211


best_hyperparams = perform_grid_search_kfold(kernels,
                          Cs,
                          degrees,
                          gammas,
                          k_folds=3,
                          x=X_train,
                          y=y_train.ravel())

Total number of grid search combinations explored: 133
1/133 Hyperparams: poly 0.01 3 scale
Current Results: kernel=poly; C=0.01; degree=3; gamma=scale --> accuracy = 0.5081+0.005749
2/133 Hyperparams: poly 0.01 4 scale


Current Results: kernel=poly; C=0.01; degree=4; gamma=scale --> accuracy = 0.5081+0.005749
3/133 Hyperparams: poly 0.01 5 scale
Current Results: kernel=poly; C=0.01; degree=5; gamma=scale --> accuracy = 0.5081+0.005749
4/133 Hyperparams: poly 0.01 6 scale
Current Results: kernel=poly; C=0.01; degree=6; gamma=scale --> accuracy = 0.5081+0.005749
5/133 Hyperparams: poly 0.01 7 scale
Current Results: kernel=poly; C=0.01; degree=7; gamma=scale --> accuracy = 0.5081+0.005749
6/133 Hyperparams: poly 0.01 8 scale
Current Results: kernel=poly; C=0.01; degree=8; gamma=scale --> accuracy = 0.5081+0.005749
7/133 Hyperparams: poly 0.01 9 scale
Current Results: kernel=poly; C=0.01; degree=9; gamma=scale --> accuracy = 0.5817+0.02453
8/133 Hyperparams: poly 0.02 3 scale
Current Results: kernel=poly; C=0.02; degree=3; gamma=scale --> accuracy = 0.5081+0.005749
9/133 Hyperparams: poly 0.02 4 scale
Current Results: kernel=poly; C=0.02; degree=4; gamma=scale --> accuracy = 0.5081+0.005749
10/133 Hyperpa

In [135]:
# Final MONK-3 model: refit on the whole training set, then score on train/test.
# NOTE(review): C=0.11 / degree=4 differ from the best result recorded in the
# fine-search cell (C=0.08, degree=5) -- confirm which setting is intended.
kernel = 'poly'
C = 0.11
degree = 4
gamma = 'scale'

model = svm.SVC(kernel=kernel, C=C, degree=degree, gamma=gamma)
model.fit(X_train, y_train.ravel())

# accuracy_score takes (y_true, y_pred). The (N, 1) label tensors are
# flattened to 1-D arrays so they have the same shape as predict()'s output.
train_accuracy = accuracy_score(y_train.ravel().numpy(), model.predict(X_train))
test_accuracy = accuracy_score(y_test.ravel().numpy(), model.predict(X_test))
print(f'Training Accuracy = {train_accuracy}')
print(f'Test Accuracy = {test_accuracy}')

Training Accuracy = 0.9590163934426229
Test Accuracy = 0.9768518518518519
