# Caltech Machine Learning Homework # 8

In [54]:
import numpy as np
from sklearn.linear_model import Perceptron
import random
import math
import matplotlib.pyplot as plt
from typing import List
from itertools import product
import scipy.special
from scipy import optimize
import scipy.optimize as spo
from sympy import Symbol, Derivative
import functools
from sklearn import svm, model_selection
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold

def dbg():
    import pdb; pdb.set_trace()

Instructions: https://work.caltech.edu/homework/hw8.pdf

Answers: http://work.caltech.edu/homework/hw8_sol.pdf

## Primal vs Dual Problem

![](imgs/primalvsdual.png)

The original formulation was

![](imgs/primalvsdual2.png)

Which looks like **[b]** to me because of $\textbf x_{n}$ terms and $b$

But thats' wrong!

From Wikipedia:

![](imgs/quadprog.png)

So the variables are really the $\textbf w$ terms here, subject to linear contraints involving $\textbf x_{n}$. Hence it's a quadratic programming problem with $d+1$ variables ($+1$ since the term $\textbf b$ appears in the constraints)

This means that the original formulation depends on the input space size $d$, whereas the dual formulation only depends on $N$!

## SVM with Soft Margin

![](imgs/softmargsvm.png)

In [10]:
train = np.loadtxt('data/hw8/features.train.txt')
test = np.loadtxt('data/hw8/features.test.txt')

X_train = train[:,1:]
Y_train = train[:,0]
N_train = X_train[:, 0].size

X_test = test[:,1:]
Y_test = test[:,0]
N_test = X_test[:, 0].size

## Polynomial Kernels

![](imgs/polykern1.png)

In [15]:
C = .01
Q = 2

def make_binary(Ys, classToKeep):
    return np.array([1 if y == classToKeep else -1 for y in Ys])

def score(y_class):
    # Set all other labels to 0 to make it a binary y_class vs all classification
    bin_Y_train = make_binary(Y_train, y_class)
    bin_Y_test = make_binary(Y_test, y_class)
    
    clf = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0, cache_size=20000)
    clf.fit(X_train, bin_Y_train)
    print(f"Score of {y_class} versus all was {clf.score(X_test, bin_Y_test)}")
    
score(0)
score(2)
score(4)
score(6)
score(8)

Score of 0 versus all was 0.8883906327852517
Score of 2 versus all was 0.9013452914798207
Score of 4 versus all was 0.9003487792725461
Score of 6 versus all was 0.9152964623816642
Score of 8 versus all was 0.9172894867962132


So the highest $E_{in}$ was at **[a]** 0 versus all

![](imgs/polykern2.png)

In [25]:
score(1)
score(3)
score(5)
score(7)
score(9)

Score of 1 versus all was 0.9780767314399601
Score of 3 versus all was 0.9172894867962132
Score of 5 versus all was 0.9202790234180369
Score of 7 versus all was 0.9267563527653214
Score of 9 versus all was 0.9118086696562033


So the lowest $E_{in}$ was at **[a]** 1 versus all

0s are hardest to detect, 1s easiest.

![](imgs/polykern3.png)

In [23]:
def get_num_svs(y_class):
    # Set all other labels to 0 to make it a binary y_class vs all classification
    bin_Y_train = make_binary(Y_train, y_class)
    bin_Y_test = make_binary(Y_test, y_class)
    
    clf = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0, cache_size=20000)
    clf.fit(X_train, bin_Y_train)
    return sum(clf.n_support_)

get_num_svs(0) - get_num_svs(1)

1793

That's closest by **[c]**

![](imgs/polykern4.png)

In [14]:
Q = 2
Cs = [.001, .01, .1, 1]

def get_1v1_data(data, fst, snd):
    X,Y = data
    idxs = (Y == fst) | (Y == snd)
    return (X[idxs], Y[idxs])
    

def score_1v5(C):
    # Keep only data for 1s and 5s
    X_test1v5,Y_test1v5 = get_1v1_data((X_test, Y_test), 1, 5)
    X_train1v5, Y_train1v5 = get_1v1_data((X_train, Y_train), 1, 5)
    
    clf = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0, cache_size=20000)
    clf.fit(X_train1v5, Y_train1v5)
    print(f"# support vectors for 1 vs 5 with C={C} was {sum(clf.n_support_)}")
    print(f"E_in for 1 vs 5 with C={C} was {clf.score(X_test1v5, Y_test1v5)}")
    print()

[score_1v5(C) for C in Cs]
    

# support vectors for 1 vs 5 with C=0.001 was 76
E_in for 1 vs 5 with C=0.001 was 0.9834905660377359

# support vectors for 1 vs 5 with C=0.01 was 34
E_in for 1 vs 5 with C=0.01 was 0.9811320754716981

# support vectors for 1 vs 5 with C=0.1 was 24
E_in for 1 vs 5 with C=0.1 was 0.9811320754716981

# support vectors for 1 vs 5 with C=1 was 24
E_in for 1 vs 5 with C=1 was 0.9811320754716981



[None, None, None, None]

**[a]** is almost true but not strictly so

**[b]** is false

**[c]** we don't know since we only have a bound on $E_{out}$, plus the bound is not going down strictly anyways

**[d]** is a bit hard to tell because the $E_{in}$ values are equal for $C={0.01, 0.1, 1}$ but I would say yes this is true,

it also makes intuitive sense because higher C means less regularization,

**[d]** is correct

![](imgs/polykern5.png)

In [18]:
Qs = [2, 5]
Cs = [.0001, .001, .01, 1]
    

def score_1v5(C, Q):
    # Keep only data for 1s and 5s
    X_test1v5,Y_test1v5 = get_1v1_data((X_test, Y_test), 1, 5)
    X_train1v5, Y_train1v5 = get_1v1_data((X_train, Y_train), 1, 5)
    
    clf = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0, cache_size=20000)
    clf.fit(X_train1v5, Y_train1v5)
    print(f"# support vectors with C={C} at Q={Q} was {sum(clf.n_support_)}")
    print(f"E_in with C={C} at Q={Q} was {clf.score(X_test1v5, Y_test1v5)}")
    print()

[score_1v5(C, Q) for C in Cs for Q in Qs]

# support vectors with C=0.0001 at Q=2 was 236
E_in with C=0.0001 at Q=2 was 0.9834905660377359

# support vectors with C=0.0001 at Q=5 was 26
E_in with C=0.0001 at Q=5 was 0.9811320754716981

# support vectors with C=0.001 at Q=2 was 76
E_in with C=0.001 at Q=2 was 0.9834905660377359

# support vectors with C=0.001 at Q=5 was 25
E_in with C=0.001 at Q=5 was 0.9787735849056604

# support vectors with C=0.01 at Q=2 was 34
E_in with C=0.01 at Q=2 was 0.9811320754716981

# support vectors with C=0.01 at Q=5 was 23
E_in with C=0.01 at Q=5 was 0.9787735849056604

# support vectors with C=1 at Q=2 was 24
E_in with C=1 at Q=2 was 0.9811320754716981

# support vectors with C=1 at Q=5 was 21
E_in with C=1 at Q=5 was 0.9787735849056604



[None, None, None, None, None, None, None, None]

**[a]**: $E_{in}$ is lower at $Q=5$, which makes sense, so False.

**[b]**: Yes, the number of support vectors is actually lower here at $Q=5$. I have no idea why that is, but **[b]** seems correct!

## X-Validation

![](imgs/xvalidation1.png)

In [29]:
Q = 2
RUNS = 100

X_test1v5,Y_test1v5 = get_1v1_data((X_test, Y_test), 1, 5)
X_train1v5, Y_train1v5 = get_1v1_data((X_train, Y_train), 1, 5)

Cs = [.0001, .001, .01, .1, 1]

def x_validate(C):
    clf = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0, cache_size=20000)
    scores = model_selection.cross_val_score(clf, X_train1v5, Y_train1v5, cv=10)
    return scores.mean()

# We're being lazy and calculating the expected value of E_CV intead of selecting after each run, let's see if that works
E_cv_a = np.array([x_validate(Cs[0]) for _ in range(RUNS)]).mean()
print(f"E_CV for [a] is {E_cv_a}.")
E_cv_b = np.array([x_validate(Cs[1]) for _ in range(RUNS)]).mean()
print(f"E_CV for [b] is {E_cv_b}.")
E_cv_c = np.array([x_validate(Cs[2]) for _ in range(RUNS)]).mean()
print(f"E_CV for [c] is {E_cv_c}.")
E_cv_d = np.array([x_validate(Cs[3]) for _ in range(RUNS)]).mean()
print(f"E_CV for [d] is {E_cv_d}.")
E_cv_e = np.array([x_validate(Cs[4]) for _ in range(RUNS)]).mean()
print(f"E_CV for [e] is {E_cv_e}.")






E_CV for [a] is 0.9897517556753229.
E_CV for [b] is 0.994871794871795.
E_CV for [c] is 0.99551282051282.
E_CV for [d] is 0.994871794871795.
E_CV for [e] is 0.99551282051282.


This indicates **[a]** as the clear winner, if taking expected value is indeed okay here!

Looks like that's not okay though. Let's do it properly:

In [42]:
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=100)
winners = []
for train_idx, val_idx in rskf.split(X_train1v5, Y_train1v5):
    X_train, X_val = X_train1v5[train_idx], X_train1v5[val_idx]
    Y_train, Y_val = Y_train1v5[train_idx], Y_train1v5[val_idx]
    
    bestScore = 0
    winner = None
    for C in Cs:
        clf = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0, cache_size=20000)
        clf.fit(X_train, Y_train)
        score = clf.score(X_val, Y_val)
        if score > bestScore:
            bestScore = score
            winner = C
            
    winners.append(winner)

Cs, counts = np.unique(winners, return_counts=True)
print("Cs: ", Cs)
print("Counts: ", counts)

Cs:  [1.e-04 1.e-03 1.e-02 1.e-01 1.e+00]
Counts:  [377 512  41  27  43]


So **[b]** is the winner under the correct heuristic!

![](imgs/xvalidation2.png)

In [56]:
C = 1e-3
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=100)
clf = svm.SVC(kernel='poly', C=C, degree=Q, gamma=1.0, coef0=1.0, cache_size=20000)

scores = model_selection.cross_val_score(clf, X_train1v5, Y_train1v5, cv=rskf)
1 - scores.mean()

0.004766536011758915

That's closest to **[c]**

## RBF Kernel

![](imgs/rbf1.png)

In [65]:
Cs = [0.01, 1, 100, 1e4, 1e6]

X_test1v5,Y_test1v5 = get_1v1_data((X_test, Y_test), 1, 5)
X_train1v5, Y_train1v5 = get_1v1_data((X_train, Y_train), 1, 5)

def compute_eIN(C):
    clf = svm.SVC(kernel='rbf', C=C, degree=Q, gamma=1.0, cache_size=20000)
    clf.fit(X_train1v5, Y_train1v5)
    return 1 - clf.score(X_train1v5, Y_train1v5)
    
print([compute_eIN(C) for C in Cs])

[0.004270462633451988, 0.004982206405693912, 0.003558718861209953, 0.002846975088967918, 0.000711743772242035]


That would be **[e]**, or the lowest regularization (least soft version). Which makes sense.

![](imgs/rbf2.png)

In [64]:
Cs = [0.01, 1, 100, 1e4, 1e6]

X_test1v5,Y_test1v5 = get_1v1_data((X_test, Y_test), 1, 5)
X_train1v5, Y_train1v5 = get_1v1_data((X_train, Y_train), 1, 5)

def compute_eOUT(C):
    clf = svm.SVC(kernel='rbf', C=C, degree=Q, gamma=1.0, cache_size=20000)
    clf.fit(X_train1v5, Y_train1v5)
    return 1 - clf.score(X_test1v5, Y_test1v5)
    
print([compute_eOUT(C) for C in Cs])

[0.02358490566037741, 0.021226415094339646, 0.018867924528301883, 0.02358490566037741, 0.02358490566037741]


The answer is **[c]**