In [2]:
import numpy as np
import matplotlib.pyplot as plt
from functools import partial
from typing import Callable, Iterable

In [3]:
data = np.loadtxt('../support/zipcombo.dat')
# data = np.loadtxt('../support/dtrain123.dat')

In [4]:
data.shape

(9298, 257)

In [5]:
def split_80_20(data: np.ndarray) -> (np.ndarray, np.ndarray):
    """
    Splits 80% train 20% test

    :param data: sequence.
    :return: train_data, test_data: np.ndarray, np.ndarray
    """

    n = data.shape[0]
    train_size = int(n*0.8)
    return data[:train_size], data[train_size:]

In [6]:
def split_X_y(data: np.ndarray) -> (np.ndarray, np.ndarray):
    """
    Splits the data into datapoints and labels, X_train matrix and y_train;
    :param data: np.ndarray
    :return: X_train, y_train: np.ndarray, np.ndarray
    """

    return data[:, 1:], data[:, 0]

In [7]:
def shuffle_split(data):
    # np.random.seed(seed)
    shuffled = np.random.permutation(data)
    data_train, data_test = split_80_20(shuffled)
    X_train, y_train = split_X_y(data_train)
    X_test, y_test = split_X_y(data_test)

    assert X_train.shape[0] == y_train.size
    assert X_test.shape[0] == y_test.size

    print("Train data set size = %d" % X_train.shape[0])
    print("Test data set size = %d" % X_test.shape[0])

    return X_train, y_train, X_test, y_test

In [8]:
def get_error_percentage(y, y_preds):
    error = 100 * get_num_mistakes(actual=y, predicted=y_preds) / y.size
    # print("in-sample = % " + str(error))
    return error

def get_num_mistakes(actual: np.ndarray, predicted: np.ndarray) -> int:
    # or calculating by checking which alpha values are different than 0? alpha is 0 when the prediction matches
    diffs = actual - predicted
    n_mistakes = 0
    for diff in diffs:
        if diff != 0:
            n_mistakes += 1
    return n_mistakes

In [14]:
X_train, y_train, X_test, y_test = shuffle_split(data)

Train data set size = 7438
Test data set size = 1860


In [16]:
class KPOneVsAllClassifier:
    def __init__(self, n_classes, d):
        self.n_classes = n_classes # could optimize to remove this var since =k.shape[0]
        self.d = d
        self.X_train = None
        self.Alpha = None
        self.K = None

    def _get_kernel_matrix(self, X_train: np.ndarray) -> np.ndarray:
        ## we use ||x-y|| = np.inner(x-y, x-y)
        ## and ||x-y||^2 = ||x||^2 + ||y||^2 - 2*x.T*y
        X_norm = np.einsum('ij,ij->i', X_train, X_train)
        return np.exp((-self.d) * (X_norm[:, None] + X_norm[None, :] - 2*np.dot(X_train, X_train.T)))

    def sign(self, x: np.ndarray) -> np.ndarray:
        return np.where(x <= 0, -1, 1)

    def _predict_single_confidence(self, t, y_train):
        # Get prediction array P
        # preds = (self.Alpha @ self.K)[:, t]
        preds = self.Alpha @ self.K[t]

        # Get Y_t array of ground truth (duplicate y_t)
        y = np.full(self.n_classes, -1)
        y[int(y_train[t])] = 1

        # penilize
        self.Alpha[:, t] -= np.heaviside(-(preds * y), 1) * self.sign(preds)
        return preds


    def fit(self, X_train, y_train, n_epochs):
        y_preds = np.array(np.zeros(X_train.shape[0]))

        self.K = self._get_kernel_matrix(X_train)
        self.Alpha = np.zeros((self.n_classes, X_train.shape[0]))
        self.X_train = X_train

        for epoch in range(n_epochs):
            # for each point, calculate confidence and make predictions
            for t in range(0, X_train.shape[0]):
                confidence = self._predict_single_confidence(t, y_train)
                # the index with the highest number(confidence) is the prediction
                # +1 because the index for confidence in perceptron[1] is 0
                y_preds[t] = np.argmax(confidence)
        return y_preds

    def predict(self, X_test: np.ndarray):
        ## for polynomial kernel
        self.K_test = np.power(self.X_train @ X_test.T, self.d)
        return np.argmax((self.Alpha @ self.K_test), axis=0)


## for in-cell debug
import time

start = time.time()

kpova = KPOneVsAllClassifier(n_classes=10, d=10)
kpova.fit(X_train, y_train, n_epochs=1)
y_insample = kpova.predict(X_train)
y_outsample = kpova.predict(X_test)

end = time.time()

print("took :{t}".format(t=end-start))

print(get_error_percentage(y_train, y_insample))
print(get_error_percentage(y_test, y_outsample))

took :5.453933238983154
7.098682441516536
8.279569892473118


### Repeat 1 and 2
- do the parameter tuning d = c
- first, do some experiments to decide the values to cross_validate over for c

#### First, we run the 1.basic results algorithm to get an intuition of how $c$ impacts the errors, so that we can find out on which $S$ to cross validate. We set n_epochs=7

In [12]:
import time

means_train = []
means_test = []
stds_train = []
stds_test = []

start=time.time()
for c in range(1, 8):
    train_errors = []
    test_errors = []
    for run in range(20):
        X_train, y_train, X_test, y_test = shuffle_split(data)

        kpova = KPOneVsAllClassifier(n_classes=10, d=c)

        #train
        kpova.fit(X_train, y_train, n_epochs=7)
        y_insample = kpova.predict(X_train)
        #test
        y_outsample = kpova.predict(X_test)

        print('\nd=' + str(c) + ' on run ' + str(run))
        train_errors.append(get_error_percentage(y_train, y_insample))
        test_errors.append(get_error_percentage(y_test, y_outsample))
    means_train.append(np.mean(train_errors))
    means_test.append(np.mean(test_errors))
    stds_train.append(np.std(train_errors))
    stds_test.append(np.std(test_errors))
end = time.time()

print('took {t}'.format(t=end-start))

Train data set size = 7438
Test data set size = 1860

d=1 on run 0
Train data set size = 7438
Test data set size = 1860

d=1 on run 1
Train data set size = 7438
Test data set size = 1860

d=1 on run 2
Train data set size = 7438
Test data set size = 1860

d=1 on run 3
Train data set size = 7438
Test data set size = 1860

d=1 on run 4
Train data set size = 7438
Test data set size = 1860

d=1 on run 5
Train data set size = 7438
Test data set size = 1860

d=1 on run 6
Train data set size = 7438
Test data set size = 1860

d=1 on run 7
Train data set size = 7438
Test data set size = 1860

d=1 on run 8
Train data set size = 7438
Test data set size = 1860

d=1 on run 9
Train data set size = 7438
Test data set size = 1860

d=1 on run 10
Train data set size = 7438
Test data set size = 1860

d=1 on run 11
Train data set size = 7438
Test data set size = 1860

d=1 on run 12
Train data set size = 7438
Test data set size = 1860

d=1 on run 13
Train data set size = 7438
Test data set size = 1860

d=1 

In [13]:
import pandas as pd

table_1a = pd.DataFrame({
    'mean train': [str(f'{x:.3f}') + u"\u00B1" + str(f'{y:.3f}') for (x, y) in zip(means_train, stds_train)],
    'mean test': [str(f'{x:.3f}') + u"\u00B1" + str(f'{y:.3f}') for (x, y) in zip(means_test, stds_test)],
})

display(table_1a)
table_1a.to_csv('table_1a.csv')
table_1a.style.to_latex('table_1a.tex')

Unnamed: 0,mean train,mean test
0,7.267±1.561,9.374±1.434
1,0.436±0.234,3.696±0.399
2,0.111±0.054,2.798±0.405
3,0.028±0.012,2.788±0.386
4,0.233±0.639,3.008±0.621
5,0.090±0.297,2.685±0.480
6,0.024±0.011,2.656±0.352


In [31]:
def cross_validation(X, y, n_folds):
    fold_size = X.shape[0] // n_folds

    split_idxs = [i * fold_size - 1 for i in range(1, 5)]
    X_folds_list = np.split(X, indices_or_sections=split_idxs)
    y_folds_list = np.split(y, indices_or_sections=split_idxs)
    assert len(X_folds_list) == len(y_folds_list) == n_folds

    errors_d = []
    for d in range(3, 8):
        local_errors = []
        for i in range(n_folds):
            # Create a training and test folds from given data
            X_train_fold = np.vstack(([X_folds_list[k] for k in range(0, n_folds) if k != i]))
            y_train_fold = np.concatenate([y_folds_list[k] for k in range(0, n_folds) if k != i], axis=0)
            X_validation_fold = X_folds_list[i]
            y_validation_fold = y_folds_list[i]

            kp = KPOneVsAllClassifier(n_classes=10, d=d)
            kp.fit(X_train_fold, y_train_fold, n_epochs=7)
            y_preds = kp.predict(X_validation_fold)

            # find the hardest points to predict
            train_diffs = y_validation_fold - y_preds
            for pos in np.where(train_diffs!=0):
                mistakes[pos] += 1

            local_errors.append(get_error_percentage(y_validation_fold, y_preds))
        errors_d.append(local_errors)

    # +4 because we consider only ds starting from 4
    return np.argmax(np.mean(errors_d, axis=1)) + 3