In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time 
from sklearn.preprocessing import LabelBinarizer, StandardScaler

class normalizer():
    """Feature standardizer: subtract the per-column mean, divide by the per-column std.

    The statistics are learned once by `fit` and then reused by `transform`,
    so the same shift and scale can be applied to training and test data.
    """

    def __init__(self):
        # Placeholders only: `transform` is meaningless (division by zero)
        # until `fit` has been called.
        self.mean = 0
        self.std = 0

    def fit(self, X):
        """Learn the mean and standard deviation along axis 0 of X.

        Parameters
        ----------
        X : array-like
            Usually a 2-D array whose rows are points in R^d. A 1-D array is
            also accepted and is treated as a single feature.

        Returns
        -------
        self
            Returned so that `normalizer().fit(X).transform(X)` can be chained.
        """
        X = np.asarray(X)
        self.mean = np.mean(X, axis=0)  # mean of each column vector
        std = np.asarray(np.std(X, axis=0), dtype=float)  # std of each column vector
        # (Near-)constant columns get a unit scale so that `transform` never
        # divides by ~0. `np.where` (instead of masked assignment) also works
        # when `std` is 0-dimensional, i.e. when X is 1-D.
        self.std = np.where(std <= 1e-5, 1.0, std)
        return self

    def transform(self, X):
        """
            Feature normalization. Each row of X represents a point in R^d.
            Subtract the fitted mean and divide by the fitted std, column-wise.
        """
        return (X - self.mean)/self.std

def accuracy(ypred, yreal):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are converted to flat NumPy arrays first. This makes plain
    Python lists compare element-wise (a bare `list == list` yields a single
    bool) and stops an (n, 1) column vector from broadcasting against an (n,)
    vector into an n-by-n comparison.

    Parameters
    ----------
    ypred : array-like
        Predicted labels.
    yreal : array-like
        Ground-truth labels; must hold as many elements as `ypred`.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    ypred = np.asarray(ypred).ravel()
    yreal = np.asarray(yreal).ravel()
    if ypred.shape != yreal.shape:
        raise ValueError(
            "ypred and yreal must have the same number of elements, got {} and {}".format(
                ypred.shape[0], yreal.shape[0]))
    return np.sum(ypred == yreal)/float(len(yreal))

class SVM():
    r"""
        Binary kernel classifier trained by batch (sub)gradient descent on a
        regularized hinge loss. Labels are expected to be -1 / +1.

        Note: despite the historical "simple smo" label, no SMO is performed;
        `fit` runs plain gradient steps on the dual-style coefficients `c`
        and the bias `b`.

        Parameters
        ----------
        iterations : number of gradient steps performed by `fit`.
        kernel     : one of "linear", "poly", "rbf", "sigmoid".
        C, tol     : stored on the instance but not read by any method here.
        gamma      : kernel coefficient for poly / rbf / sigmoid. "auto" is
                     resolved to 1 / n_features when `fit` is called.
        degree     : exponent of the polynomial kernel.
        coef0      : additive constant of the poly and sigmoid kernels.
        lam        : weight of the hinge term in the loss.
        lr         : gradient-descent step size.
    """
    def __init__(self, iterations=500, kernel="linear", C=1.0, tol = 0.001, gamma = "auto", degree = 3, coef0=0.0, lam=1/200, lr=0.03):
        self.kernels = {
            "linear" : self.linear,
            "poly" : self.poly,
            "rbf" : self.rbf,
            "sigmoid" : self.sigmoid
        }
        self.iterations = iterations
        self.kernel = self.kernels[kernel]
        self.C = C
        self.tol = tol
        self.gamma = gamma
        # Last value that "auto" was resolved to (None until the first fit).
        # Lets a refit on data with another feature count refresh gamma
        # without clobbering a gamma the user set explicitly.
        self._auto_gamma_value = None
        self.degree = degree
        self.coef0 = coef0
        self.lam = lam
        self.lr=lr

    def fit(self, X, y):
        r"""
            Gradient descent on
                Loss = \sqrt{c \cdot c} + lambda \sum_i max(0, 1 - y^i (c \cdot K(x^i) + b))
            where K(x^i) = (k(x^1, x^i), ..., k(x^M, x^i)).

            X: 2d dataset (one sample per row) without extended column of ones.
            y: targets in {-1, +1}; any shape with X.shape[0] elements.

            After fitting, the instance holds
            c: 1d array of length X.shape[0]
            b: a number
            X: the training samples (needed by `predict`).

            Return
            ------
            self

            Raises
            ------
            ValueError if X is not 2d or X and y disagree on the sample count.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y so that an (m, 1) column cannot broadcast the margins
        # into an m-by-m matrix.
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("Error: X must be a 2d array with one sample per row.")
        m = X.shape[0]
        if m != y.shape[0]:
            raise ValueError("Error: Dimension of data and target don't match.")

        previous_auto = getattr(self, "_auto_gamma_value", None)
        gamma_is_auto = isinstance(self.gamma, str) and self.gamma == "auto"
        gamma_is_stale_auto = previous_auto is not None and self.gamma == previous_auto
        if gamma_is_auto or gamma_is_stale_auto:
            # This is helpful for rbf and sigmoid.
            self.gamma = self._auto_gamma_value = 1.0 / X.shape[1]

        KerMat = self.kernel(X, X) # All product of xi and xj
        self.c = np.ones(m)
        self.b = 1.0
        for _ in range(self.iterations):
            hinge_loss = 1 - y * (np.dot(KerMat, self.c) + self.b)
            # Only samples that violate the margin contribute to the
            # hinge subgradient.
            active = hinge_loss > 1e-5
            # Gradient of sqrt(c.c) is c/||c||; use the zero subgradient at
            # c == 0 instead of dividing by zero.
            c_norm = np.linalg.norm(self.c)
            reg_grad = self.c / c_norm if c_norm > 0 else np.zeros_like(self.c)
            dc = - self.lam * np.dot(y[active], KerMat[active]) + reg_grad
            db = - self.lam * np.sum(y[active])
            self.c -= self.lr*dc
            self.b -= self.lr*db
        self.X = X
        return self

    def predict(self, Xtest, prob = False):
        """
            Evaluate the fitted decision function on Xtest.

            Xtest: 2d array of samples (a single 1d sample is also accepted).
            prob:  if true, return the raw decision values b + K(Xtest, X) c
                   (these are scores, not calibrated probabilities).
                   Otherwise return labels: +1 where the score exceeds 1e-5,
                   -1 elsewhere.

            Returns a 1d float array with one entry per test sample.
        """
        Xtest = np.atleast_2d(np.asarray(Xtest, dtype=float))
        # One kernel evaluation for the whole test set instead of one per row.
        ypred = self.b + np.dot(self.kernel(Xtest, self.X), self.c)
        if prob:
            return ypred
        return np.where(ypred > 1e-5, 1.0, -1.0)

    def linear(self, u, v):
        """
            Linear kernel u . v.
            u, v might both be 2d array with the same shape[1].
        """
        return np.dot(u, v.T)

    def poly(self, u, v):
        r"""
            polynomial kernel (coef0 + \gamma u \cdot v)^d where d is degree.
            u, v might both be 2d array with the same shape[1].
        """
        return (self.coef0 + self.gamma * np.dot(u, v.T))**self.degree

    def rbf(self, u, v):
        r"""
            Gaussian kernel exp(-\gamma ||u - v||^2).
            u, v might both be 2d array with the same shape[1].
            Returns a scalar for two 1d inputs, a 1d array when exactly one
            input is 1d, and a (u.shape[0], v.shape[0]) matrix otherwise.
        """
        u = np.asarray(u)
        v = np.asarray(v)
        if u.ndim == 1 and v.ndim == 1:
            return np.exp(-self.gamma * np.sum((u-v)**2))
        if u.ndim == 1 or v.ndim == 1:
            # The 1d operand broadcasts against every row of the other one.
            return np.exp(-self.gamma * np.sum((u-v)**2, axis=1))
        # Row by row keeps memory at O(len(v) * d) instead of materializing
        # a (len(u), len(v), d) difference tensor.
        res = np.zeros((u.shape[0], v.shape[0]))
        for i in range(u.shape[0]):
            res[i] = np.exp(-self.gamma * np.sum((u[i]-v)**2, axis=1))
        return res

    def sigmoid(self, u, v):
        r"""
            Sigmoid kernel tanh(\gamma u \cdot v + coef0).
            u, v might both be 2d array with the same shape[1].
        """
        return np.tanh(self.gamma * np.dot(u, v.T) + self.coef0)

# Read the four CSV files (train/test features and labels) from the working
# directory, in this order, and keep only the underlying NumPy arrays.
Xtrain, ytrain, Xtest, ytest = (
    pd.read_csv(csv_path).values
    for csv_path in (
        "MNIST_X_train.csv",
        "MNIST_Y_train.csv",
        "MNIST_X_test.csv",
        "MNIST_Y_test.csv",
    )
)

# Report the shapes as loaded, i.e. before the label arrays are flattened.
print("The shape of Xtrain is {}".format(Xtrain.shape))
print("The shape of ytrain is {}".format(ytrain.shape))
print("The shape of Xtest is {}".format(Xtest.shape))
print("The shape of ytest is {}".format(ytest.shape))

# Collapse the label arrays to 1-D copies so they compare element-wise with
# 1-D prediction vectors later on.
ytrain = ytrain.flatten()
ytest = ytest.flatten()

The shape of Xtrain is (2000, 784)
The shape of ytrain is (2000, 1)
The shape of Xtest is (500, 784)
The shape of ytest is (500, 1)


In [68]:
# Encode the labels as one indicator column per class, using -1 (instead of
# the default 0) as the negative value so each column can be used directly
# as a -1/+1 target. The encoder is fitted on the training labels only.
lb = LabelBinarizer(neg_label=-1).fit(ytrain)
ytrain_ohe, ytest_ohe = lb.transform(ytrain), lb.transform(ytest)

In [75]:
# Standardize the features with statistics learned on the training split only.
scaler = normalizer()
scaler.fit(Xtrain)
normalized_Xtrain, normalized_Xtest = scaler.transform(Xtrain), scaler.transform(Xtest)

start = time.time()
# One-vs-one with the linear kernel: one binary classifier per unordered
# class pair (i < j). labels[n, k] counts the votes test sample n received
# for class k.
labels = np.zeros((Xtest.shape[0], 10))
for i in range(10):
    for j in range(i + 1, 10):
        # Keep only training samples of class i or class j; within that
        # subset column i of the encoding is +1 for class i and -1 for class j.
        pair_mask = (ytrain_ohe[:, i] == 1) | (ytrain_ohe[:, j] == 1)
        data = normalized_Xtrain[pair_mask]
        target = ytrain_ohe[pair_mask, i]

        clf = SVM(kernel = "linear", iterations = 300, lr=0.003)
        clf.fit(data, target)

        # Accuracy on the very samples the pair classifier was trained on.
        predLabels = clf.predict(data)
        score = accuracy(target, predLabels)
        print("Training class {} vs class {} is complete. The training accuracy is {:.2f}%".format(i,j,score*100))

        # Every test sample votes for i (prediction +1) or j (prediction -1).
        pred = clf.predict(normalized_Xtest)
        labels[pred == 1, i] += 1
        labels[pred == -1, j] += 1

# The class with the most votes wins (first maximum on ties).
ypred = np.argmax(labels, axis=1)
end = time.time()

score = accuracy(ytest, ypred)
print("Using linear kernel, the accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs class 1 is complete. The training accuracy is 100.00%
Training class 0 vs class 2 is complete. The training accuracy is 99.76%
Training class 0 vs class 3 is complete. The training accuracy is 99.51%
Training class 0 vs class 4 is complete. The training accuracy is 100.00%
Training class 0 vs class 5 is complete. The training accuracy is 99.49%
Training class 0 vs class 6 is complete. The training accuracy is 100.00%
Training class 0 vs class 7 is complete. The training accuracy is 99.28%
Training class 0 vs class 8 is complete. The training accuracy is 100.00%
Training class 0 vs class 9 is complete. The training accuracy is 99.23%
Training class 1 vs class 2 is complete. The training accuracy is 98.44%
Training class 1 vs class 3 is complete. The training accuracy is 98.64%
Training class 1 vs class 4 is complete. The training accuracy is 99.54%
Training class 1 vs class 5 is complete. The training accuracy is 99.76%
Training class 1 vs class 6 is complete. The tr

In [76]:
# One-vs-one classification with the polynomial kernel.
# Standardize the features with statistics learned on the training split only.
scaler = normalizer()
scaler.fit(Xtrain)
normalized_Xtrain, normalized_Xtest = scaler.transform(Xtrain), scaler.transform(Xtest)

start = time.time()
# One binary classifier per unordered class pair (i < j). labels[n, k]
# counts the votes test sample n received for class k.
labels = np.zeros((Xtest.shape[0], 10))
for i in range(10):
    for j in range(i + 1, 10):
        # Keep only training samples of class i or class j; within that
        # subset column i of the encoding is +1 for class i and -1 for class j.
        pair_mask = (ytrain_ohe[:, i] == 1) | (ytrain_ohe[:, j] == 1)
        data = normalized_Xtrain[pair_mask]
        target = ytrain_ohe[pair_mask, i]

        clf = SVM(kernel = "poly", iterations = 400,gamma = 1,degree=3, lr=0.01)
        clf.fit(data, target)

        # Accuracy on the very samples the pair classifier was trained on.
        predLabels = clf.predict(data)
        score = accuracy(target, predLabels)
        print("Training class {} vs class {} is complete. The training accuracy is {:.2f}%".format(i,j,score*100))

        # Every test sample votes for i (prediction +1) or j (prediction -1).
        pred = clf.predict(normalized_Xtest)
        labels[pred == 1, i] += 1
        labels[pred == -1, j] += 1

# The class with the most votes wins (first maximum on ties).
ypred = np.argmax(labels, axis=1)
end = time.time()

score = accuracy(ytest, ypred)
print("Using poly kernel, the accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs class 1 is complete. The training accuracy is 100.00%
Training class 0 vs class 2 is complete. The training accuracy is 99.05%
Training class 0 vs class 3 is complete. The training accuracy is 99.27%
Training class 0 vs class 4 is complete. The training accuracy is 100.00%
Training class 0 vs class 5 is complete. The training accuracy is 99.23%
Training class 0 vs class 6 is complete. The training accuracy is 99.25%
Training class 0 vs class 7 is complete. The training accuracy is 98.57%
Training class 0 vs class 8 is complete. The training accuracy is 99.21%
Training class 0 vs class 9 is complete. The training accuracy is 98.72%
Training class 1 vs class 2 is complete. The training accuracy is 100.00%
Training class 1 vs class 3 is complete. The training accuracy is 99.09%
Training class 1 vs class 4 is complete. The training accuracy is 97.24%
Training class 1 vs class 5 is complete. The training accuracy is 99.76%
Training class 1 vs class 6 is complete. The tra

In [77]:
# Standardize the features with statistics learned on the training split only.
scaler = normalizer()
scaler.fit(Xtrain)
normalized_Xtrain, normalized_Xtest = scaler.transform(Xtrain), scaler.transform(Xtest)

start = time.time()
# One-vs-one with the rbf kernel: one binary classifier per unordered class
# pair (i < j). labels[n, k] counts the votes test sample n received for
# class k.
labels = np.zeros((Xtest.shape[0], 10))
for i in range(10):
    for j in range(i + 1, 10):
        # Keep only training samples of class i or class j; within that
        # subset column i of the encoding is +1 for class i and -1 for class j.
        pair_mask = (ytrain_ohe[:, i] == 1) | (ytrain_ohe[:, j] == 1)
        data = normalized_Xtrain[pair_mask]
        target = ytrain_ohe[pair_mask, i]

        clf = SVM(kernel = "rbf", iterations = 2000, lr=0.01, lam=0.01)
        clf.fit(data, target)

        # Accuracy on the very samples the pair classifier was trained on.
        predLabels = clf.predict(data)
        score = accuracy(target, predLabels)
        print("Training class {} vs class {} is complete. The training accuracy is {:.2f}%".format(i,j,score*100))

        # Every test sample votes for i (prediction +1) or j (prediction -1).
        pred = clf.predict(normalized_Xtest)
        labels[pred == 1, i] += 1
        labels[pred == -1, j] += 1

# The class with the most votes wins (first maximum on ties).
ypred = np.argmax(labels, axis=1)
end = time.time()

score = accuracy(ytest, ypred)
print("Using rbf kernel, the accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs class 1 is complete. The training accuracy is 97.74%
Training class 0 vs class 2 is complete. The training accuracy is 94.76%
Training class 0 vs class 3 is complete. The training accuracy is 94.16%
Training class 0 vs class 4 is complete. The training accuracy is 96.79%
Training class 0 vs class 5 is complete. The training accuracy is 93.06%
Training class 0 vs class 6 is complete. The training accuracy is 93.50%
Training class 0 vs class 7 is complete. The training accuracy is 96.18%
Training class 0 vs class 8 is complete. The training accuracy is 93.92%
Training class 0 vs class 9 is complete. The training accuracy is 95.91%
Training class 1 vs class 2 is complete. The training accuracy is 93.10%
Training class 1 vs class 3 is complete. The training accuracy is 91.14%
Training class 1 vs class 4 is complete. The training accuracy is 96.54%
Training class 1 vs class 5 is complete. The training accuracy is 93.54%
Training class 1 vs class 6 is complete. The traini

In [71]:
# NOTE: this cell reuses normalized_Xtrain / normalized_Xtest produced by an
# earlier scaling cell; it does not fit a scaler itself.
start = time.time()
# One-vs-one with the sigmoid kernel: one binary classifier per unordered
# class pair (i < j). labels[n, k] counts the votes test sample n received
# for class k.
labels = np.zeros((Xtest.shape[0], 10))
for i in range(10):
    for j in range(i + 1, 10):
        # Keep only training samples of class i or class j; within that
        # subset column i of the encoding is +1 for class i and -1 for class j.
        pair_mask = (ytrain_ohe[:, i] == 1) | (ytrain_ohe[:, j] == 1)
        data = normalized_Xtrain[pair_mask]
        target = ytrain_ohe[pair_mask, i]

        clf = SVM(kernel = "sigmoid", iterations = 1000, lr=0.01, lam=0.01)
        clf.fit(data, target)

        # Accuracy on the very samples the pair classifier was trained on.
        predLabels = clf.predict(data)
        score = accuracy(target, predLabels)
        print("Training class {} vs class {} is complete. The training accuracy is {:.2f}%".format(i,j,score*100))

        # Every test sample votes for i (prediction +1) or j (prediction -1).
        pred = clf.predict(normalized_Xtest)
        labels[pred == 1, i] += 1
        labels[pred == -1, j] += 1

# The class with the most votes wins (first maximum on ties).
ypred = np.argmax(labels, axis=1)
end = time.time()

score = accuracy(ytest, ypred)
print("Using sigmoid kernel, the accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs class 1 is complete. The training accuracy is 97.52%
Training class 0 vs class 2 is complete. The training accuracy is 93.33%
Training class 0 vs class 3 is complete. The training accuracy is 93.67%
Training class 0 vs class 4 is complete. The training accuracy is 94.57%
Training class 0 vs class 5 is complete. The training accuracy is 90.75%
Training class 0 vs class 6 is complete. The training accuracy is 93.00%
Training class 0 vs class 7 is complete. The training accuracy is 96.66%
Training class 0 vs class 8 is complete. The training accuracy is 91.27%
Training class 0 vs class 9 is complete. The training accuracy is 94.37%
Training class 1 vs class 2 is complete. The training accuracy is 96.88%
Training class 1 vs class 3 is complete. The training accuracy is 96.14%
Training class 1 vs class 4 is complete. The training accuracy is 95.62%
Training class 1 vs class 5 is complete. The training accuracy is 96.65%
Training class 1 vs class 6 is complete. The traini

In [78]:
# One-vs-all with the linear kernel: one classifier per class, trained on the
# full (already standardized) training set against that class's -1/+1 column.
start = time.time()

# preds[n, k] holds the raw decision value of the "class k vs rest"
# classifier for test sample n.
preds = np.zeros((Xtest.shape[0], 10))

for i in range(10):
    class_target = ytrain_ohe[:, i]

    clf = SVM(kernel = "linear", iterations = 1000, lr=0.01, lam=0.01)
    clf.fit(normalized_Xtrain, class_target)

    # Raw scores on the test set; they are compared across classes below.
    preds[:, i] = clf.predict(normalized_Xtest, prob=True)

    # Hard labels on the training set, clamped to -1 / +1 at the same 1e-5
    # threshold that SVM.predict applies.
    pred_labels = clf.predict(normalized_Xtrain)
    pred_labels[pred_labels<1e-5] = -1
    pred_labels[pred_labels>=1e-5] = 1

    # Training accuracy of this one-vs-all classifier.
    score = accuracy(class_target, pred_labels)
    print("Training class {} vs all is complete. The training accuracy is {:.2f}%".format(i, score*100))

# Predict the class whose classifier produced the largest score.
ypred = np.argmax(preds, axis=1)

end = time.time()

score = accuracy(ytest, ypred)
print("The accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs all is complete. The training accuracy is 74.20%
Training class 1 vs all is complete. The training accuracy is 75.05%
Training class 2 vs all is complete. The training accuracy is 58.80%
Training class 3 vs all is complete. The training accuracy is 57.75%
Training class 4 vs all is complete. The training accuracy is 62.15%
Training class 5 vs all is complete. The training accuracy is 65.85%
Training class 6 vs all is complete. The training accuracy is 61.90%
Training class 7 vs all is complete. The training accuracy is 89.75%
Training class 8 vs all is complete. The training accuracy is 59.95%
Training class 9 vs all is complete. The training accuracy is 62.50%
The accuracy of multiclass classification is 69.80%
Takes 124.03 seconds.


In [81]:
# One-vs-all with the polynomial kernel (degree 2): one classifier per class,
# trained on the full (already standardized) training set against that
# class's -1/+1 column.
start = time.time()

# preds[n, k] holds the raw decision value of the "class k vs rest"
# classifier for test sample n.
preds = np.zeros((Xtest.shape[0], 10))

for i in range(10):
    class_target = ytrain_ohe[:, i]

    clf = SVM(kernel = "poly", iterations = 2000, lr=0.01, lam=0.01, degree=2)
    clf.fit(normalized_Xtrain, class_target)

    # Raw scores on the test set; they are compared across classes below.
    preds[:, i] = clf.predict(normalized_Xtest, prob=True)

    # Hard labels on the training set, clamped to -1 / +1 at the same 1e-5
    # threshold that SVM.predict applies.
    pred_labels = clf.predict(normalized_Xtrain)
    pred_labels[pred_labels<1e-5] = -1
    pred_labels[pred_labels>=1e-5] = 1

    # Training accuracy of this one-vs-all classifier.
    score = accuracy(class_target, pred_labels)
    print("Training class {} vs all is complete. The training accuracy is {:.2f}%".format(i, score*100))

# Predict the class whose classifier produced the largest score.
ypred = np.argmax(preds, axis=1)

end = time.time()

score = accuracy(ytest, ypred)
print("The accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs all is complete. The training accuracy is 95.35%
Training class 1 vs all is complete. The training accuracy is 96.30%
Training class 2 vs all is complete. The training accuracy is 93.35%
Training class 3 vs all is complete. The training accuracy is 92.65%
Training class 4 vs all is complete. The training accuracy is 91.70%
Training class 5 vs all is complete. The training accuracy is 92.60%
Training class 6 vs all is complete. The training accuracy is 94.40%
Training class 7 vs all is complete. The training accuracy is 94.30%
Training class 8 vs all is complete. The training accuracy is 91.75%
Training class 9 vs all is complete. The training accuracy is 91.50%
The accuracy of multiclass classification is 67.60%
Takes 220.54 seconds.


In [79]:
# One-vs-all with the sigmoid kernel: one classifier per class, trained on
# the full (already standardized) training set against that class's -1/+1
# column.
start = time.time()

# preds[n, k] holds the raw decision value of the "class k vs rest"
# classifier for test sample n.
preds = np.zeros((Xtest.shape[0], 10))

for i in range(10):
    class_target = ytrain_ohe[:, i]

    clf = SVM(kernel = "sigmoid", iterations = 2000, lr=0.01, lam=0.01)
    clf.fit(normalized_Xtrain, class_target)

    # Raw scores on the test set; they are compared across classes below.
    preds[:, i] = clf.predict(normalized_Xtest, prob=True)

    # Hard labels on the training set, clamped to -1 / +1 at the same 1e-5
    # threshold that SVM.predict applies.
    pred_labels = clf.predict(normalized_Xtrain)
    pred_labels[pred_labels<1e-5] = -1
    pred_labels[pred_labels>=1e-5] = 1

    # Training accuracy of this one-vs-all classifier.
    score = accuracy(class_target, pred_labels)
    print("Training class {} vs all is complete. The training accuracy is {:.2f}%".format(i, score*100))

# Predict the class whose classifier produced the largest score.
ypred = np.argmax(preds, axis=1)

end = time.time()

score = accuracy(ytest, ypred)
print("The accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs all is complete. The training accuracy is 98.55%
Training class 1 vs all is complete. The training accuracy is 99.05%
Training class 2 vs all is complete. The training accuracy is 97.45%
Training class 3 vs all is complete. The training accuracy is 96.80%
Training class 4 vs all is complete. The training accuracy is 98.00%
Training class 5 vs all is complete. The training accuracy is 97.00%
Training class 6 vs all is complete. The training accuracy is 97.80%
Training class 7 vs all is complete. The training accuracy is 97.85%
Training class 8 vs all is complete. The training accuracy is 96.05%
Training class 9 vs all is complete. The training accuracy is 96.30%
The accuracy of multiclass classification is 89.60%
Takes 161.21 seconds.
