In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time 
from sklearn.preprocessing import LabelBinarizer, StandardScaler

class normalizer():
    def __init__(self):
        self.mean = 0
        self.std = 0

    def fit(self, X):
        self.mean = np.mean(X, axis=0) # mean of each column vector
        self.std = np.std(X, axis=0) # std of each column vector
        self.std[self.std <= 1e-5] = 1

    def transform(self, X):
        """
            feature normalization. Each row of X represents a point in R^d. 
            Substract by the mean of X and then divided by the std of X.
        """
        return (X - self.mean)/self.std

def SGD(X, y, epochs, lr, lam):
    """
        Implement gradient descent.
        X: extended data. 
        lr: learning rate.
        lam: regularization parameter lambda
        c: parameter of linear classifier.
    """
    c = np.ones(X.shape[1])
    m = X.shape[0]
    # draw indices
    # stochastic gradient descent
    for epoch in range(epochs): 
        Xy = np.concatenate((X, y.reshape(-1,1)), axis=1)
        np.random.shuffle(Xy)
        X_shuffled, y_shuffled = Xy[:, :-1], Xy[:, -1]
        for idx in range(m):
            hinge_loss = np.max(1 - y_shuffled[idx] * np.dot(X_shuffled[idx], c), 0)
            grad = 2 * np.concatenate(([0], c[1:]), axis=None)
            if hinge_loss > 1e-5:
                grad = - m*lam * y_shuffled[idx]* X_shuffled[idx]
            c = c - lr * grad
        #if (epoch+1 % 100) == 0:
            #print("After {} epochs, cost is {}".format(epoch, cost(X, y, c, lam)))
    return c

def accuracy(ypred, yreal):
    return np.sum(ypred==yreal)/float(len(yreal))


Xtrain = pd.read_csv("MNIST_X_train.csv").values
ytrain = pd.read_csv("MNIST_Y_train.csv").values
Xtest = pd.read_csv("MNIST_X_test.csv").values
ytest = pd.read_csv("MNIST_Y_test.csv").values

print("The shape of Xtrain is {}".format(Xtrain.shape))
print("The shape of ytrain is {}".format(ytrain.shape))
print("The shape of Xtest is {}".format(Xtest.shape))
print("The shape of ytest is {}".format(ytest.shape))

ytrain, ytest = ytrain.flatten(), ytest.flatten()

The shape of Xtrain is (2000, 784)
The shape of ytrain is (2000, 1)
The shape of Xtest is (500, 784)
The shape of ytest is (500, 1)


In [2]:
lb = LabelBinarizer(neg_label=-1)
lb.fit(ytrain)
ytrain_ohe = lb.transform(ytrain)
ytest_ohe  = lb.transform(ytest)

In [12]:
# Feature scaling
scaler = normalizer()
scaler.fit(Xtrain)
normalized_Xtrain = scaler.transform(Xtrain)
normalized_Xtest = scaler.transform(Xtest)

extended_normalized_Xtrain = np.concatenate((np.ones((Xtrain.shape[0],1)), normalized_Xtrain), axis=1)
extended_normalized_Xtest = np.concatenate((np.ones((Xtest.shape[0],1)), normalized_Xtest), axis=1)

epochs = 100
lr = 0.0001
lam = 1/200 # lambda

start = time.time()
      
preds = np.zeros((Xtest.shape[0], 10))
# one vs all approach
for i in range(10):
    # Train class i vs rest
    params = SGD(extended_normalized_Xtrain, ytrain_ohe[:,i], epochs, lr, lam)
    preds[:, i] = np.dot(extended_normalized_Xtest, params) # labels is going to be used for prediction on test data
    pred_labels = np.dot(extended_normalized_Xtrain, params) 
    pred_labels[pred_labels<1e-5] = -1
    pred_labels[pred_labels>=1e-5] = 1 # pred_labels are the labels predicted on training data
    # compute training accuracy
    score = accuracy(ytrain_ohe[:,i], pred_labels)
    print("Training class {} vs all is complete. The training accuracy is {:.2f}%".format(i, score*100))

ypred = np.argmax(preds, axis=1)

end = time.time()

score = accuracy(ytest, ypred)
print("The accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs all is complete. The training accuracy is 98.95%
Training class 1 vs all is complete. The training accuracy is 99.40%
Training class 2 vs all is complete. The training accuracy is 98.10%
Training class 3 vs all is complete. The training accuracy is 97.50%
Training class 4 vs all is complete. The training accuracy is 98.40%
Training class 5 vs all is complete. The training accuracy is 97.75%
Training class 6 vs all is complete. The training accuracy is 98.65%
Training class 7 vs all is complete. The training accuracy is 98.00%
Training class 8 vs all is complete. The training accuracy is 96.70%
Training class 9 vs all is complete. The training accuracy is 97.20%
The accuracy of multiclass classification is 88.60%
Takes 69.89 seconds.


In [8]:
# A more concise version of one vs one classification
# Feature scaling
scaler = normalizer()
scaler.fit(Xtrain)
normalized_Xtrain = scaler.transform(Xtrain)
normalized_Xtest = scaler.transform(Xtest)

extended_normalized_Xtrain = np.concatenate((np.ones((Xtrain.shape[0],1)), normalized_Xtrain), axis=1)
extended_normalized_Xtest = np.concatenate((np.ones((Xtest.shape[0],1)), normalized_Xtest), axis=1)

epochs = 200
lr = 0.001
lam = 1/200 # lambda

start = time.time()
labels = np.zeros((Xtest.shape[0], 10))
# one vs one approach
for i in range(9):
    for j in range(10):
        if j > i:
            data = extended_normalized_Xtrain[(ytrain_ohe[:, i]==1)+(ytrain_ohe[:, j]==1)]
            target = ytrain_ohe[:,i][(ytrain_ohe[:, i]==1)+(ytrain_ohe[:, j]==1)]
            # Train class i vs class j
            params = SGD(data, target, epochs, lr, lam)
            
            labels_training_sets = np.dot(data, params) 
            labels_training_sets[labels_training_sets >=1e-5] = 1
            labels_training_sets[labels_training_sets < 1e-5] = -1 # labels predicted on training sets
            # compute training accuracy
            score = accuracy(target, labels_training_sets)
            print("Training class {} vs class {} is complete. The training accuracy is {:.2f}%".format(i,j,score*100))
            
            pred = np.dot(extended_normalized_Xtest, params)
            labels[:, i][pred>=1e-5] += 1
            labels[:, j][pred<1e-5] += 1

ypred = np.argmax(labels, axis=1)
end = time.time()

score = accuracy(ytest, ypred)
print("The accuracy of multiclass classification is {:.2f}%".format(score*100))
print("Takes {:.2f} seconds.".format(end - start))

Training class 0 vs class 1 is complete. The training accuracy is 100.00%
Training class 0 vs class 2 is complete. The training accuracy is 99.29%
Training class 0 vs class 3 is complete. The training accuracy is 99.27%
Training class 0 vs class 4 is complete. The training accuracy is 100.00%
Training class 0 vs class 5 is complete. The training accuracy is 98.97%
Training class 0 vs class 6 is complete. The training accuracy is 100.00%
Training class 0 vs class 7 is complete. The training accuracy is 100.00%
Training class 0 vs class 8 is complete. The training accuracy is 98.68%
Training class 0 vs class 9 is complete. The training accuracy is 99.49%
Training class 1 vs class 2 is complete. The training accuracy is 98.89%
Training class 1 vs class 3 is complete. The training accuracy is 99.09%
Training class 1 vs class 4 is complete. The training accuracy is 99.77%
Training class 1 vs class 5 is complete. The training accuracy is 100.00%
Training class 1 vs class 6 is complete. The t