In [1]:
# prepare data
from sklearn.preprocessing import scale

import pandas as pd

data = pd.read_csv("train.csv")
X = scale(data.iloc[:, 1:])
Y = data.iloc[:, 0]

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


def knn_accuracy(k, n_components):
    """
    Helper method, creates the knn classifier, trains it using our test data and return its accuracy
    """
    pca = PCA(n_components=n_components)
    pca.fit(X)
    X_ = pca.transform(X)

    # split data into train data and test data
    x_train, x_test, y_train, y_test = train_test_split(X_, Y, random_state=0)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)  # train model
    y_pred = knn.predict(x_test)  # predict
    return accuracy_score(y_test, y_pred)

In [8]:
import itertools

k_range = range(3, 9, 2)  # each odd number between 1 and 49
n_components = [1, 10, 20, 50, 70, 100, 200]
pairs = list(itertools.product(k_range, n_components))
accuracies = [knn_accuracy(k, n_components=n) for k, n in pairs]
acc_df = pd.DataFrame(pairs, columns=["K neighbors", "PCA N Components"])
acc_df["Accuracy"] = accuracies
acc_df.to_csv("knn.csv")

In [4]:
from sklearn import svm

def svm_accuracy(kernel, n_components):
    """
    Helper method, creates the svm classifier, trains it using our test data and return its accuracy
    """
    pca = PCA(n_components=n_components)
    pca.fit(X)
    X_ = pca.transform(X)
    
    x_train, x_test, y_train, y_test = train_test_split(X_, Y, random_state=0)

    clf = svm.SVC(kernel=kernel)
    clf.fit(x_train, y_train)  # train model
    y_pred = clf.predict(x_test)  # predict
    return accuracy_score(y_test, y_pred)


kernels = ["linear", "poly", "rbf", "sigmoid"]
n_components = [10,20,50,100]
pairs = list(itertools.product(kernels, n_components))
accuracies = [svm_accuracy(k, n_components=n) for k, n in pairs]
acc_df = pd.DataFrame(pairs, columns=["Kernel", "PCA N Components"])
acc_df["Accuracy"] = accuracies
acc_df

[kernel=linear, n=10, acc=0.8445714285714285]
[kernel=linear, n=20, acc=0.9001904761904762]
[kernel=linear, n=50, acc=0.9277142857142857]
[kernel=linear, n=100, acc=0.937047619047619]
[kernel=poly, n=10, acc=0.9053333333333333]
[kernel=poly, n=20, acc=0.9459047619047619]
[kernel=poly, n=50, acc=0.9595238095238096]
[kernel=poly, n=100, acc=0.9575238095238096]
[kernel=rbf, n=10, acc=0.9235238095238095]
[kernel=rbf, n=20, acc=0.9584761904761905]
[kernel=rbf, n=50, acc=0.9652380952380952]
[kernel=rbf, n=100, acc=0.9658095238095238]
[kernel=sigmoid, n=10, acc=0.6293333333333333]
[kernel=sigmoid, n=20, acc=0.7166666666666667]
[kernel=sigmoid, n=50, acc=0.7983809523809524]
[kernel=sigmoid, n=100, acc=0.8438095238095238]


Unnamed: 0,Kernel,PCA N Components,Accuracy
0,linear,10,0.844571
1,linear,20,0.90019
2,linear,50,0.927714
3,linear,100,0.937048
4,poly,10,0.905333
5,poly,20,0.945905
6,poly,50,0.959524
7,poly,100,0.957524
8,rbf,10,0.923524
9,rbf,20,0.958476
