In [6]:
import numpy as np

# Read both data files as float arrays. The builtin ``float`` is used because
# the ``np.float`` alias (which was just another name for it) has been removed
# from NumPy, so ``dtype=np.float`` fails on current releases.
training_data = np.genfromtxt('features.train', dtype=float)
test_data = np.genfromtxt('features.test', dtype=float)

# Column 0 of each row is taken as the label, columns 1 and 2 as the features.
train_features = training_data[:, 1:3]
train_labels = training_data[:, 0]
test_features = test_data[:, 1:3]
test_labels = test_data[:, 0]

def n_vs_all(labels, n):
    """Relabel ``labels`` for an "n versus everything else" binary problem.

    Parameters
    ----------
    labels : array-like
        Class labels. Any sequence is accepted; it is converted with
        ``np.asarray`` so that the comparison below is always element-wise
        (a plain list compared with ``==`` would collapse to a single bool).
    n : scalar
        The label that becomes the positive class.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``labels`` holding 1
        where ``labels == n`` and -1 everywhere else. The input is not
        modified.
    """
    labels = np.asarray(labels)
    is_target = labels == n
    relabelled = labels.copy()
    relabelled[is_target] = 1
    relabelled[~is_target] = -1
    return relabelled

def filter_m_vs_n(features, labels, m, n):
    """Build an "m versus n" binary data set.

    Rows whose label equals ``m`` come first and get target +1.0; rows whose
    label equals ``n`` follow and get target -1.0. All other rows are dropped.

    Returns
    -------
    (samples, targets)
        ``samples`` is the vertical stack of the selected feature rows and
        ``targets`` the matching float vector of +1.0 / -1.0 values.
    """
    positive_idx = np.where(labels == m)
    negative_idx = np.where(labels == n)

    targets = np.hstack((
        np.full(len(positive_idx[0]), 1.0),
        np.full(len(negative_idx[0]), -1.0),
    ))
    samples = np.vstack((features[positive_idx], features[negative_idx]))
    return samples, targets

In [7]:
from sklearn import svm

def question2():
    """Print the in-sample error of each even-digit-versus-all classifier.

    For every digit in 0, 2, 4, 6, 8 the training labels are relabelled with
    ``n_vs_all`` and a degree-2 polynomial-kernel SVC (C=0.01, gamma=1,
    coef0=1) is fitted on the full training set. Each output line is the
    digit followed by ``1.0 - score`` on that same training set.
    """
    svc_params = dict(C=0.01, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
    for digit in range(0, 9, 2):
        targets = n_vs_all(train_labels, digit)
        model = svm.SVC(**svc_params)
        model.fit(train_features, targets)
        e_in = 1.0 - model.score(train_features, targets)
        print(digit, e_in)

question2()

0 0.105883966534
2 0.100260595254
4 0.0894253188863
6 0.0910711836511
8 0.0743382252092


In [3]:
from sklearn import svm

def question3():
    """Print the in-sample error of each odd-digit-versus-all classifier.

    Same procedure as for the even digits, applied to 1, 3, 5, 7, 9: relabel
    with ``n_vs_all``, fit a degree-2 polynomial SVC with C=0.01, and print
    the digit next to its training error.
    """
    for digit in (1, 3, 5, 7, 9):
        one_vs_rest = n_vs_all(train_labels, digit)
        fitted = svm.SVC(
            C=0.01, kernel="poly", degree=2, gamma=1.0, coef0=1.0
        ).fit(train_features, one_vs_rest)
        print(digit, 1.0 - fitted.score(train_features, one_vs_rest))

question3()

1 0.0144013166918
3 0.0902482512687
5 0.0762584007681
7 0.0884652311068
9 0.0883280757098


In [4]:
from sklearn import svm

def question4():
    """Return the absolute difference in support-vector counts between the
    0-versus-all and 1-versus-all classifiers.

    Both classifiers are degree-2 polynomial SVCs with C=0.01 fitted on the
    full training set; the count is the length of ``support_`` after fitting.
    """
    sv_counts = []
    for digit in (0, 1):
        targets = n_vs_all(train_labels, digit)
        model = svm.SVC(C=0.01, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
        model.fit(train_features, targets)
        sv_counts.append(len(model.support_))
    count0, count1 = sv_counts
    return abs(count0 - count1)

question4()

1793

In [8]:
from sklearn import svm

def question5():
    """Report support-vector count, E_in and E_out of the 1-versus-5
    classifier for several values of C.

    The train and test sets are reduced to rows labelled 1 (target +1) and
    5 (target -1) via ``filter_m_vs_n``. A degree-2 polynomial SVC is fitted
    for each C in 0.001, 0.01, 0.1, 1 and a small report is printed per C.
    """
    train_x, train_y = filter_m_vs_n(train_features, train_labels, 1, 5)
    test_x, test_y = filter_m_vs_n(test_features, test_labels, 1, 5)

    for penalty in (0.001, 0.01, 0.1, 1):
        model = svm.SVC(C=penalty, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
        model.fit(train_x, train_y)

        n_support = len(model.support_)
        e_in = 1 - model.score(train_x, train_y)
        e_out = 1 - model.score(test_x, test_y)

        print("C:", penalty)
        print("# of sv:", n_support)
        print("E_in:", e_in)
        print("E_out:", e_out)
        print("---")

question5()

C: 0.001
# of sv: 76
E_in: 0.00448430493274
E_out: 0.0165094339623
---
C: 0.01
# of sv: 34
E_in: 0.00448430493274
E_out: 0.0188679245283
---
C: 0.1
# of sv: 24
E_in: 0.00448430493274
E_out: 0.0188679245283
---
C: 1
# of sv: 24
E_in: 0.00320307495195
E_out: 0.0188679245283
---


In [8]:
from sklearn import svm

def question6():
    """Compare polynomial degrees 2 and 5 on the 1-versus-5 problem.

    For every C in 0.0001, 0.001, 0.01, 1 (outer) and every degree Q in 2, 5
    (inner) a polynomial SVC is fitted on the filtered training set and its
    support-vector count, E_in and E_out are printed.
    """
    train_x, train_y = filter_m_vs_n(train_features, train_labels, 1, 5)
    test_x, test_y = filter_m_vs_n(test_features, test_labels, 1, 5)

    # Flattened (C, Q) grid; C varies slowest so the print order is unchanged.
    grid = [(c, q) for c in (0.0001, 0.001, 0.01, 1) for q in (2, 5)]
    for c, q in grid:
        model = svm.SVC(C=c, kernel="poly", degree=q, gamma=1.0, coef0=1.0)
        model.fit(train_x, train_y)
        print("C:", c, "Q:", q)
        print("# of sv:", len(model.support_))
        print("E_in:", 1 - model.score(train_x, train_y))
        print("E_out:", 1 - model.score(test_x, test_y))
        print("---")

question6()

C: 0.0001 Q: 2
# of sv: 236
E_in: 0.00896860986547
E_out: 0.0165094339623
---
C: 0.0001 Q: 5
# of sv: 26
E_in: 0.00448430493274
E_out: 0.0188679245283
---
C: 0.001 Q: 2
# of sv: 76
E_in: 0.00448430493274
E_out: 0.0165094339623
---
C: 0.001 Q: 5
# of sv: 25
E_in: 0.00448430493274
E_out: 0.0212264150943
---
C: 0.01 Q: 2
# of sv: 34
E_in: 0.00448430493274
E_out: 0.0188679245283
---
C: 0.01 Q: 5
# of sv: 23
E_in: 0.00384368994234
E_out: 0.0212264150943
---
C: 1 Q: 2
# of sv: 24
E_in: 0.00320307495195
E_out: 0.0188679245283
---
C: 1 Q: 5
# of sv: 21
E_in: 0.00320307495195
E_out: 0.0212264150943
---


In [25]:
# for scikit-learn <= 0.17.1 (sklearn.cross_validation was deprecated in 0.18 and removed in 0.20)
from sklearn.cross_validation import KFold
from sklearn import svm

def question7():
    """Count how often each C wins 10-fold cross validation over 100 runs.

    Uses the ``KFold`` imported in this cell from ``sklearn.cross_validation``,
    whose constructor receives the number of samples and the number of folds
    positionally. In each run one shuffled fold object is built and reused
    for every candidate C; the C with the fewest total validation mistakes
    wins (``np.argmin`` picks the first index on ties). The printed dict maps
    the index of the winning C in the candidate list to its number of wins.
    """
    features, labels = filter_m_vs_n(train_features, train_labels, 1, 5)
    candidates = (0.0001, 0.001, 0.01, 0.1, 1)
    wins = {}
    for _ in range(100):
        folds = KFold(len(features), 10, shuffle=True)
        totals = []
        for penalty in candidates:
            mistakes = 0
            for train_idx, valid_idx in folds:
                model = svm.SVC(C=penalty, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
                model.fit(features[train_idx], labels[train_idx])
                predicted = model.predict(features[valid_idx])
                mistakes += np.sum(predicted != labels[valid_idx])
            totals.append(mistakes)
        best = np.argmin(totals)
        wins[best] = wins.get(best, 0) + 1
    print(wins)

question7()

{1: 48, 2: 27, 3: 10, 4: 15}


In [24]:
# for scikit-learn >= 0.18 (uses the sklearn.model_selection API)
from sklearn.model_selection import KFold
from sklearn import svm

def question7_2():
    """Count how often each C wins 10-fold cross validation over 100 runs,
    using ``sklearn.model_selection.KFold``.

    Each run materialises one shuffled 10-fold split as a list so that every
    candidate C is evaluated on exactly the same folds. The printed dict maps
    the index of the winning C (fewest validation mistakes, first index on
    ties) to the number of runs it won.
    """
    features, labels = filter_m_vs_n(train_features, train_labels, 1, 5)

    def fold_mistakes(penalty, train_idx, valid_idx):
        # Fit on the training part of the fold, count errors on the held-out part.
        model = svm.SVC(C=penalty, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
        model.fit(features[train_idx], labels[train_idx])
        return np.sum(model.predict(features[valid_idx]) != labels[valid_idx])

    wins = {}
    for _ in range(100):
        folds = list(KFold(10, shuffle=True).split(features))
        totals = [
            sum(fold_mistakes(penalty, tr, va) for tr, va in folds)
            for penalty in [0.0001, 0.001, 0.01, 0.1, 1]
        ]
        best = np.argmin(totals)
        wins[best] = wins.get(best, 0) + 1
    print(wins)

question7_2()

{1: 61, 2: 19, 3: 13, 4: 7}


In [26]:
# for scikit-learn >= 0.18 (uses the sklearn.model_selection API)
from numpy.random import randint
from sklearn.model_selection import KFold
from sklearn import svm

def question7_3():
    """Count how often each C wins 10-fold cross validation over 100 runs,
    re-creating the same folds for every C through a fixed ``random_state``.

    Each run draws one integer seed in ``[0, 2**32)`` and passes it to
    ``KFold`` so that every call to ``split`` inside the run yields the same
    shuffled folds. The printed dict maps the index of the winning C (fewest
    validation mistakes, first index on ties) to the number of runs it won.
    """
    features, labels = filter_m_vs_n(train_features, train_labels, 1, 5)
    dist = {}
    for _ in range(100):
        # Ask for a 64-bit draw explicitly: with the platform default integer
        # (32-bit on some platforms / NumPy versions) an upper bound of 2**32
        # is rejected as out of range. int() keeps the seed a plain Python int.
        rs = int(randint(2**32, dtype=np.int64))
        kfold = KFold(10, shuffle=True, random_state=rs)
        err_cnts = []
        for c in [0.0001, 0.001, 0.01, 0.1, 1]:
            err_cnt = 0
            for train, valid in kfold.split(features):
                clf = svm.SVC(C=c, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
                clf.fit(features[train], labels[train])
                err_cnt += np.sum(clf.predict(features[valid]) != labels[valid])
            err_cnts.append(err_cnt)
        winner = np.argmin(err_cnts)
        dist[winner] = dist.get(winner, 0) + 1
    print(dist)

question7_3()

{1: 58, 2: 21, 3: 11, 4: 10}


In [14]:
from numpy.random import randint
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn import svm

def question7_4():
    """Count how often each C gives the lowest mean validation error over
    100 runs, using ``cross_val_score`` with a seeded ``ShuffleSplit``.

    Note that this variant uses ``ShuffleSplit`` with its default settings
    (apart from ``random_state``) rather than ``KFold``. Each run draws one
    integer seed in ``[0, 2**32)`` so the same splitter configuration is used
    for every candidate C. The printed dict maps the index of the winning C
    (first index on ties) to the number of runs it won.
    """
    features, labels = filter_m_vs_n(train_features, train_labels, 1, 5)
    dist = {}
    for _ in range(100):
        scores = []
        # Ask for a 64-bit draw explicitly: with the platform default integer
        # (32-bit on some platforms / NumPy versions) an upper bound of 2**32
        # is rejected as out of range. int() keeps the seed a plain Python int.
        rs = int(randint(2**32, dtype=np.int64))
        cv = ShuffleSplit(random_state=rs)
        for c in [0.0001, 0.001, 0.01, 0.1, 1]:
            clf = svm.SVC(C=c, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
            score = cross_val_score(clf, features, labels, cv=cv)
            # cross_val_score returns accuracies; convert the mean to an error.
            scores.append(1 - np.mean(score))
        winner = np.argmin(scores)
        dist[winner] = dist.get(winner, 0) + 1
    print(dist)

question7_4()

{1: 46, 2: 23, 3: 10, 4: 21}


In [28]:
# for scikit-learn >= 0.18 (uses the sklearn.model_selection API)
from sklearn.model_selection import KFold
from sklearn import svm

def question8():
    """Print the average 10-fold cross-validation error of the 1-versus-5
    classifier with C=0.001 over 100 shuffled runs.

    For each run the validation mistakes of all ten folds are summed; the
    mean of those sums across runs is divided by the number of samples.
    """
    features, labels = filter_m_vs_n(train_features, train_labels, 1, 5)
    # A single estimator is refitted on every fold.
    model = svm.SVC(C=0.001, kernel="poly", degree=2, gamma=1.0, coef0=1.0)
    run_totals = []
    for _ in range(100):
        folds = list(KFold(10, shuffle=True).split(features))
        mistakes = 0
        for train_idx, valid_idx in folds:
            model.fit(features[train_idx], labels[train_idx])
            predicted = model.predict(features[valid_idx])
            mistakes += np.sum(predicted != labels[valid_idx])
        run_totals.append(mistakes)
    e_cv = np.mean(run_totals) / len(labels)
    print(e_cv)

question8()

0.00479180012812


In [29]:
from sklearn import svm

def question9and10():
    """Report support-vector count, E_in and E_out of an RBF-kernel SVC on
    the 1-versus-5 problem for C in 0.01, 1, 100, 1e4, 1e6.

    The train and test sets are reduced with ``filter_m_vs_n`` (label 1 as
    +1, label 5 as -1); gamma is fixed at 1.0.
    """
    train_x, train_y = filter_m_vs_n(train_features, train_labels, 1, 5)
    test_x, test_y = filter_m_vs_n(test_features, test_labels, 1, 5)

    for penalty in (0.01, 1, 100, 1e4, 1e6):
        model = svm.SVC(C=penalty, kernel="rbf", gamma=1.0)
        model.fit(train_x, train_y)

        n_support = len(model.support_)
        e_in = 1 - model.score(train_x, train_y)
        e_out = 1 - model.score(test_x, test_y)

        print("C:", penalty)
        print("# of sv:", n_support)
        print("E_in:", e_in)
        print("E_out:", e_out)
        print("---")

question9and10()

C: 0.01
# of sv: 406
E_in: 0.00384368994234
E_out: 0.0235849056604
---
C: 1
# of sv: 31
E_in: 0.00448430493274
E_out: 0.0212264150943
---
C: 100
# of sv: 22
E_in: 0.00320307495195
E_out: 0.0188679245283
---
C: 10000.0
# of sv: 19
E_in: 0.00256245996156
E_out: 0.0235849056604
---
C: 1000000.0
# of sv: 17
E_in: 0.000640614990391
E_out: 0.0235849056604
---
