This is the second week's assignment, which requires me to implement the kernel nearest-mean classifier by hand.

In [None]:
# ------------------------------------------------------------------------- #
#                         Kernel Nearest Classifier                         #
# ------------------------------------------------------------------------- #

# -*- coding: utf-8 -*-
# @Time    : 2018/1/15 19:51
# @Author  : Jiahao Yang
# @Email   : yangjh39@uw.edu

# Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Kernel functions (Regular version)
def gaussianRBF(x, y, sigma2):
    """Gaussian RBF kernel between two samples.

    x, y   : pandas vectors of the same length (``.pow`` is called on
             their difference)
    sigma2 : kernel hyper-parameter dividing the squared distance

    Returns exp(-||x - y||^2 / sigma2).
    """
    difference = x - y
    squared_distance = difference.pow(2).sum()
    return np.exp(-squared_distance / sigma2)

def polynomialVovk(x, y, p):
    """Vovk's real polynomial kernel between two samples.

    x, y : vectors of the same length
    p    : kernel hyper-parameter (the exponent)

    Returns (1 - <x, y>**p) / (1 - <x, y>), a scalar.
    """
    inner = (x * y).sum()

    if inner == 1:
        # The closed form is 0/0 here; its limit as <x, y> -> 1 is p.
        return float(p)

    # The inner product is already a scalar, so no further summation is
    # needed (wrapping this in the builtin sum() raises TypeError).
    return (1 - inner ** p) / (1 - inner)

# Kernel functions (Matrix version)
def gaussianRBFm(x, y, sigma2):
    """Summed Gaussian RBF kernel between every row of x and the sample y.

    x      : n*m DataFrame of n samples with m features
    y      : one sample; it is subtracted from every row of x
    sigma2 : kernel hyper-parameter dividing the squared distances

    Returns a scalar: sum_i exp(-||x_i - y||^2 / sigma2). The per-row kernel
    values are reduced here; no n*1 vector is returned.
    """
    squared_distances = (x - y).pow(2).sum(axis=1)
    kernel_values = np.exp(-squared_distances / sigma2)

    # Reduce on the underlying ndarray: vectorised instead of a Python-level
    # loop, and NaN still propagates the way the builtin sum() would.
    return kernel_values.values.sum()

def polynomialVovkm(x, y, p):
    """Summed Vovk polynomial kernel between every row of x and the sample y.

    x : n*m block of n samples with m features
    y : one sample of length m
    p : kernel hyper-parameter (the exponent)

    Returns a scalar: sum_i (1 - <x_i, y>**p) / (1 - <x_i, y>).
    """
    # Compute the inner products once and reuse them.
    inner = np.asarray(np.dot(x, y), dtype=float)

    # Where <x_i, y> == 1 the closed form is 0/0; its limit there is p.
    singular = inner == 1

    # Put a harmless placeholder in the singular slots so the closed form
    # below never divides by zero; those slots are overwritten afterwards.
    safe_inner = np.where(singular, 0.0, inner)
    values = (1 - safe_inner ** p) / (1 - safe_inner)

    return np.where(singular, p, values).sum()

# Data preprocessing
def preprocessing(dat):
    """Standardise the features and recode the labels of dat, in place.

    dat : DataFrame whose last column is the label and whose other columns
          are features. It is modified in place; nothing is returned.

    Every feature column is centred on its mean and divided by its
    population standard deviation. In the label column, values equal to 1
    are kept and every other value becomes -1.
    """
    feature_columns = dat.columns[:-1]

    if len(feature_columns):
        features = dat.iloc[:, :-1]
        center = features.mean(axis=0)

        # Be explicit about axis and ddof: np.std(DataFrame) forwards
        # axis=None to pandas, whose meaning is deprecated and set to change.
        scale = features.std(axis=0, ddof=0)

        # A constant column has zero spread; divide by 1 so it becomes all
        # zeros rather than NaN.
        scale = scale.where(scale != 0, 1.0)

        # Replace the columns outright so integer feature columns can become
        # float instead of being written into in place.
        dat[feature_columns] = (features - center) / scale

    labels = dat.iloc[:, -1]
    dat.iloc[:, -1] = labels.where(labels == 1, -1)

def modeltraining(traindata, kernelfunc=gaussianRBFm, hyperpara=1):
    """Compute the training-only term of the kernel nearest-mean classifier.

    traindata  : frame whose last column holds the labels (1 / -1) and whose
                 other columns are the features
    kernelfunc : callable(block, sample, hyperpara); it is called once per
                 sample against the whole block of that sample's class
    hyperpara  : forwarded unchanged to kernelfunc

    For each class the kernel results are accumulated over the samples of
    that class and scaled by 1 / count**2. Returns the class-1 term minus the
    class-(-1) term, or 0 when traindata is None.
    """
    if traindata is None:
        return 0

    labels = traindata.iloc[:, -1]

    def _scaled_self_kernel(label):
        # Rows of the requested class, features only.
        rows = np.where(labels == label)[0]
        block = traindata.iloc[rows, 0:-1]
        count = len(rows)

        total = 0
        for position in range(count):
            total += kernelfunc(block, block.iloc[position, :], hyperpara)

        total *= 1 / (count ** 2)
        return total

    positive_term = _scaled_self_kernel(1)
    negative_term = _scaled_self_kernel(-1)

    return positive_term - negative_term

# Nearest-mean classifier
def nearestMeanClassifier(traindata, testdata, trainresult, kernelfunc=gaussianRBFm, hyperpara=1):
    """Label each test sample by the closer class mean in kernel feature space.

    traindata   : frame whose last column holds the labels (1 / -1) and whose
                  other columns are the features
    testdata    : frame with the same column layout; its last column is
                  dropped before the kernel is evaluated
    trainresult : class-1 self-kernel average minus class-(-1) self-kernel
                  average, i.e. ||mu1||^2 - ||mu2||^2 (the value that
                  modeltraining returns)
    kernelfunc  : callable(block, sample, hyperpara) returning the summed
                  kernel between sample and every row of block
                  (default: Gaussian kernel)
    hyperpara   : forwarded unchanged to kernelfunc (default 1)

    Returns a numpy array of 1 / -1 predictions, one per test row, or an
    empty list when testdata is None.
    """
    predictlabel = []

    if testdata is None:
        return predictlabel

    labels = traindata.iloc[:, -1]
    class1data = traindata.iloc[np.where(labels == 1)[0], 0:-1]
    class2data = traindata.iloc[np.where(labels == -1)[0], 0:-1]

    n = len(class1data)
    m = len(class2data)

    for i in range(len(testdata)):
        onetestdata = testdata.iloc[i, 0:-1]

        # (2 / n) * sum_i k(x, x1_i)  and  (2 / m) * sum_j k(x, x2_j)
        testandtrain1 = kernelfunc(class1data, onetestdata, hyperpara)
        testandtrain1 *= 2 / n

        testandtrain2 = kernelfunc(class2data, onetestdata, hyperpara)
        testandtrain2 *= 2 / m

        # Squared distance to the mean of class c is
        #   k(x, x) - (2 / n_c) * sum k(x, x_c) + ||mu_c||^2,
        # so x is closer to class 1 exactly when
        #   testandtrain1 - testandtrain2 - (||mu1||^2 - ||mu2||^2) > 0.
        # trainresult must therefore be subtracted, not added.
        score = testandtrain1 - testandtrain2 - trainresult

        predictlabel.append(1 if score > 0 else -1)

    return np.array(predictlabel)


if __name__ == '__main__':
    # Load data and split data into train set and test set
    data = pd.read_table("spamdata.txt", sep=' ', header=None)
    indicator = pd.read_table("spamtraintest.txt", sep=' ', header=None)

    # np.where on the indicator frame returns (rows, columns); [0] keeps the
    # row positions. Copy the selections because preprocessing() modifies its
    # argument in place.
    train = data.loc[np.where(indicator == 0)[0].tolist(), :].copy()
    test = data.loc[np.where(indicator == 1)[0].tolist(), :].copy()

    preprocessing(train)
    preprocessing(test)

    # One hyper-parameter grid per kernel; the two grids are walked in lockstep.
    hyperparameterg = np.arange(1, 140, 7).tolist()
    hyperparameterp = np.arange(1, 21, 1).tolist()

    errorgaussian, errorpoly = [], []

    # True test labels, read once instead of on every iteration.
    testlabels = test.iloc[:, -1].values

    for i, (sigma2, degree) in enumerate(zip(hyperparameterg, hyperparameterp)):

        trainresultgaussian = modeltraining(train, gaussianRBFm, sigma2)
        trainresultpoly = modeltraining(train, polynomialVovkm, degree)

        predictlabelgaussian = nearestMeanClassifier(train, test, trainresultgaussian, gaussianRBFm, sigma2)
        predictlabelpoly = nearestMeanClassifier(train, test, trainresultpoly, polynomialVovkm, degree)

        # Misclassification rate = fraction of test rows predicted wrongly.
        errorgaussian.append(float(np.mean(predictlabelgaussian != testlabels)))
        errorpoly.append(float(np.mean(predictlabelpoly != testlabels)))

        print("Gaussian Error is : " + str(errorgaussian[-1]))
        print("Poly Error is : " + str(errorpoly[-1]))
        print(str(len(hyperparameterg) - i - 1) + " iteration left")

    plt.figure(1)
    plt.plot(hyperparameterg, errorgaussian, 'ro-')
    plt.xlabel('hyper-parameter')
    plt.ylabel('misclassification error')
    plt.title('Misclassfication error using Gaussian kernel')

    # Use a separate figure for the polynomial kernel; otherwise this curve
    # is drawn over the Gaussian one and replaces its title and labels.
    plt.figure(2)
    plt.plot(hyperparameterp, errorpoly, 'ro-')
    plt.xlabel('hyper-parameter')
    plt.ylabel('misclassification error')
    plt.title("Misclassfication error using Vivk's polynomial kernel")

    # Without this nothing is displayed when the file is run as a script.
    plt.show()

![title](exercise3.png)