In [140]:
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.datasets import load_digits 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [1038]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Evaluated in a numerically stable way: ``exp`` is only ever applied to
    non-positive values, so large-magnitude negative inputs do not overflow
    (the naive form evaluates ``exp(+large)`` for them).

    Parameters
    ----------
    x : scalar or array_like
        Input value(s).

    Returns
    -------
    numpy.ndarray
        Logistic of ``x``, same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1]: it may underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same value.
    return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

def logPredict(X, w):
    """Logistic-model outputs for samples ``X`` under weights ``w``.

    Computes ``sigmoid(np.dot(X, w))``.
    """
    logits = np.dot(X, w)
    return sigmoid(logits)

def logAvgCost(X, y, w, regularization=None, lmda=0):
    """Average cross-entropy of the logistic model, optionally regularised.

    Parameters
    ----------
    X : array of samples (one row per sample).
    y : array of 0/1 targets; it is combined element-wise with the model
        output, so it should have the same shape as ``logPredict(X, w)``
        (the notebook uses column vectors of shape (n_samples, 1)).
    w : weight array passed to ``logPredict``.
    regularization : ``'l1'``, ``'l2'`` or anything else for no penalty.
    lmda : regularisation strength handed to ``l1Cost`` / ``l2Cost``.

    Returns
    -------
    (summed cross-entropy + penalty) / ``y.shape[0]``. Note the penalty is
    added before the division, so it is scaled by the sample count too.
    """
    predict = np.asarray(logPredict(X, w))
    # A saturated prediction (exactly 0 or 1) would give log(0) = -inf and
    # then 0 * -inf = nan; keep predictions strictly inside (0, 1).
    tiny = np.finfo(predict.dtype).eps
    predict = np.clip(predict, tiny, 1.0 - tiny)
    cost = -(y * np.log(predict) + (1.0 - y) * np.log(1.0 - predict))
    totalCost = cost.sum()
    if regularization == 'l1':
        totalCost += l1Cost(lmda, w)
    elif regularization == 'l2':
        totalCost += l2Cost(lmda, w)
    return totalCost / y.shape[0]

def l1Cost(lmda, w):
    """L1 penalty: ``lmda`` times the sum of the absolute values of ``w``."""
    absoluteWeights = np.abs(w)
    return lmda * absoluteWeights.sum()

def l2Cost(lmda, w):
    """L2 penalty: ``lmda`` times the sum of the squared entries of ``w``."""
    squaredWeights = np.multiply(w, w)
    return lmda * squaredWeights.sum()

def l1Grad(lmda, w):
    """(Sub)gradient of the L1 penalty ``lmda * sum(|w|)`` with respect to ``w``.

    Non-zero weights get ``lmda * sign(w)``. ``|w|`` is not differentiable
    at 0, so each zero weight instead receives a random subgradient
    ``lmda * u`` with ``u`` drawn uniformly from ``[-1, 1)`` via the global
    NumPy random state.

    Parameters
    ----------
    lmda : regularisation strength.
    w : weight array (not modified).

    Returns
    -------
    numpy.ndarray of floats with the same shape as ``w``.
    """
    w = np.asarray(w)
    gradient = np.array(lmda * np.sign(w), dtype=float)
    # Boolean mask rather than a list of [row, col] index arrays:
    # `gradient[np.array([r, c])]` selects *rows* r and c, which would
    # overwrite row 0 of a column vector every time any weight is zero.
    atZero = (w == 0)
    numZero = np.count_nonzero(atZero)
    if numZero:
        # Scale by lmda so the value stays inside the subdifferential of
        # lmda * |w| at 0 (and vanishes entirely when lmda == 0).
        gradient[atZero] = lmda * (2.0 * np.random.rand(numZero) - 1.0)
    return gradient

def l2Grad(lmda, w):
    """Gradient of the L2 penalty ``lmda * sum(w ** 2)``, i.e. ``2 * lmda * w``."""
    doubledStrength = lmda * 2
    return doubledStrength * w

def logAvgGrad(X, y, w, regularization=None, lmda=0):
    """Gradient of the averaged logistic cost with respect to ``w``.

    Forms ``X.T . (logPredict(X, w) - y)``, adds the penalty gradient for
    ``regularization`` equal to ``'l1'`` or ``'l2'`` (any other value adds
    nothing), and divides the total by ``y.shape[0]``.
    """
    residual = logPredict(X, w) - y
    gradient = np.dot(X.T, residual)
    if regularization == 'l2':
        gradient += l2Grad(lmda, w)
    elif regularization == 'l1':
        gradient += l1Grad(lmda, w)
    numSamples = y.shape[0]
    return gradient / numSamples

def updateWeightsGD(X, y, w, lr, regularization=None, lmda=0):
    """Take one gradient-descent step and return the new weights.

    The step is ``lr`` times the averaged gradient from ``logAvgGrad``;
    ``w`` itself is not modified.
    """
    step = lr * logAvgGrad(X, y, w, regularization, lmda)
    return w - step

def trainGD(X, y, lr, numIters, logFreq, regularization=None, lmda=0):
    """Fit a binary logistic model with batch gradient descent.

    Parameters
    ----------
    X : array of samples, one row per sample.
    y : 0/1 targets; expected to be shaped like the model output
        (column vector (n_samples, 1) in this notebook).
    lr : learning rate.
    numIters : number of gradient steps.
    logFreq : the cost is printed and recorded on every iteration ``i``
        with ``i % logFreq == 0`` (must be non-zero).
    regularization, lmda : forwarded to the cost and gradient functions.

    Returns
    -------
    (w, costs) : final weights of shape (X.shape[1], 1), initialised
    uniformly in [0, 1), and the list of recorded costs.
    """
    costs = []
    w = np.random.rand(X.shape[1], 1)
    for i in range(numIters):
        if i % logFreq == 0:
            # Evaluate the cost only when it is reported; it is a full
            # pass over X and was previously computed on every iteration.
            # It is the cost of the weights *before* this step's update.
            cost = logAvgCost(X, y, w, regularization, lmda)
            print("iter:", i, "cost:", cost)
            costs.append(cost)
        w = updateWeightsGD(X, y, w, lr, regularization, lmda)
    return w, costs

def trainGDMulticlass(X, y, lr, numIters, logFreq, regularization=None, lmda=0):
    """Train one-vs-rest logistic classifiers with batch gradient descent.

    One binary classifier is trained per distinct value in ``y``; all
    classifiers take one gradient step per iteration.

    Parameters
    ----------
    X : array of samples, one row per sample.
    y : class labels; expected to be shaped like a single classifier's
        output (column vector (n_samples, 1) in this notebook).
    lr : learning rate.
    numIters : number of gradient steps per classifier.
    logFreq : the mean cost over classifiers is printed and recorded on
        every iteration ``i`` with ``i % logFreq == 0`` (must be non-zero).
    regularization, lmda : forwarded to the cost and gradient functions.

    Returns
    -------
    (ws, costs) : ``ws`` has shape (n_classes, X.shape[1], 1) with
    ``ws[k]`` belonging to the k-th entry of ``np.unique(y)``; ``costs``
    is an array of the recorded mean costs.
    """
    y = np.asarray(y)
    uniqueYs = np.unique(y)
    numClasses = uniqueYs.shape[0]
    # One 0/1 target array per class (1 where the sample has that label),
    # stacked in the order of uniqueYs.
    ys = np.array([(y == label).astype(float) for label in uniqueYs])
    ws = np.random.rand(numClasses, X.shape[1], 1)
    costs = []
    for i in range(numIters):
        shouldLog = (i % logFreq == 0)
        cost = 0
        # Index classifiers by position, not by label value, so the labels
        # do not have to be exactly the integers 0..K-1.
        for k in range(numClasses):
            if shouldLog:
                # The cost is a full pass over X; only pay for it when it
                # is reported. It reflects the weights before this step.
                cost += logAvgCost(X, ys[k], ws[k], regularization, lmda)
            ws[k] = updateWeightsGD(X, ys[k], ws[k], lr, regularization, lmda)
        if shouldLog:
            cost /= numClasses
            print("iter:", i, "cost:", cost)
            costs.append(cost)
    costs = np.array(costs)
    return ws, costs

def updateWeightsAdaGrad(X, y, w, lr, S, epsilon, regularization=None, lmda=0):
    """Take one AdaGrad step and return the new weights.

    ``S`` accumulates squared gradients and is updated IN PLACE so the
    caller's array carries the history across calls. Each weight's
    effective rate is ``lr / sqrt(S + epsilon)``. ``w`` is not modified.
    """
    grad = logAvgGrad(X, y, w, regularization, lmda)
    # In-place on purpose: the caller owns S and relies on it growing.
    S += grad * grad
    perWeightRate = lr / np.sqrt(S + epsilon)
    return w - perWeightRate * grad

def trainAdaGrad(X, y, lr, numIters, logFreq, epsilon, regularization=None, lmda=0):
    """Fit a binary logistic model with AdaGrad.

    Parameters
    ----------
    X : array of samples, one row per sample.
    y : 0/1 targets; expected to be shaped like the model output
        (column vector (n_samples, 1) in this notebook).
    lr : base learning rate.
    numIters : number of AdaGrad steps.
    logFreq : the cost is printed and recorded on every iteration ``i``
        with ``i % logFreq == 0`` (must be non-zero).
    epsilon : added to the squared-gradient accumulator inside the square
        root by ``updateWeightsAdaGrad``.
    regularization, lmda : forwarded to the cost and gradient functions.

    Returns
    -------
    (w, costs) : final weights of shape (X.shape[1], 1), initialised
    uniformly in [0, 1), and the list of recorded costs.
    """
    costs = []
    w = np.random.rand(X.shape[1], 1)
    # Squared-gradient accumulator; updateWeightsAdaGrad mutates it in place.
    S = np.zeros((X.shape[1], 1))
    for i in range(numIters):
        if i % logFreq == 0:
            # Evaluate the cost only when it is reported; it is a full
            # pass over X and was previously computed on every iteration.
            # It is the cost of the weights *before* this step's update.
            cost = logAvgCost(X, y, w, regularization, lmda)
            print("iter:", i, "cost:", cost)
            costs.append(cost)
        w = updateWeightsAdaGrad(X, y, w, lr, S, epsilon, regularization, lmda)
    return w, costs

def trainAdaGradMulticlass(X, y, lr, numIters, logFreq, epsilon, regularization=None, lmda=0):
    """Train one-vs-rest logistic classifiers with AdaGrad.

    One binary classifier (with its own squared-gradient accumulator) is
    trained per distinct value in ``y``; all classifiers take one step per
    iteration.

    Parameters
    ----------
    X : array of samples, one row per sample.
    y : class labels; expected to be shaped like a single classifier's
        output (column vector (n_samples, 1) in this notebook).
    lr : base learning rate.
    numIters : number of AdaGrad steps per classifier.
    logFreq : the mean cost over classifiers is printed and recorded on
        every iteration ``i`` with ``i % logFreq == 0`` (must be non-zero).
    epsilon : forwarded to ``updateWeightsAdaGrad``.
    regularization, lmda : forwarded to the cost and gradient functions.

    Returns
    -------
    (ws, costs) : ``ws`` has shape (n_classes, X.shape[1], 1) with
    ``ws[k]`` belonging to the k-th entry of ``np.unique(y)``; ``costs``
    is an array of the recorded mean costs.
    """
    y = np.asarray(y)
    uniqueYs = np.unique(y)
    numClasses = uniqueYs.shape[0]
    # One 0/1 target array per class (1 where the sample has that label),
    # stacked in the order of uniqueYs.
    ys = np.array([(y == label).astype(float) for label in uniqueYs])
    ws = np.random.rand(numClasses, X.shape[1], 1)
    # Per-class accumulators; Ss[k] is a view, so the in-place update done
    # by updateWeightsAdaGrad persists between iterations.
    Ss = np.zeros((numClasses, X.shape[1], 1))
    costs = []
    for i in range(numIters):
        shouldLog = (i % logFreq == 0)
        cost = 0
        # Index classifiers by position, not by label value, so the labels
        # do not have to be exactly the integers 0..K-1.
        for k in range(numClasses):
            if shouldLog:
                # The cost is a full pass over X; only pay for it when it
                # is reported. It reflects the weights before this step.
                cost += logAvgCost(X, ys[k], ws[k], regularization, lmda)
            ws[k] = updateWeightsAdaGrad(X, ys[k], ws[k], lr, Ss[k],
                                         epsilon, regularization, lmda)
        if shouldLog:
            cost /= numClasses
            print("iter:", i, "cost:", cost)
            costs.append(cost)
    costs = np.array(costs)
    return ws, costs

def logPredictMulticlass(X, w):
    """Predict a class index for every sample with one-vs-rest weights.

    Parameters
    ----------
    X : array of samples, one row per sample.
    w : stacked per-class weights; ``w[k]`` is passed to ``logPredict``.

    Returns
    -------
    Array holding, per sample, the index ``k`` of the classifier with the
    highest output (argmax over the first axis). This is a position in
    ``w``, not necessarily the original label value.
    """
    # Iterate over the weights that were passed in. The class count used to
    # be read from a notebook-level global named `ws`, which raised
    # NameError (or silently used the wrong model) outside that context.
    predictions = np.array([logPredict(X, classWeights) for classWeights in w])
    return np.argmax(predictions, axis=0)

def logMulticlassAccuracy(X, w, y):
    """Fraction of samples whose predicted class index equals ``y``.

    Parameters
    ----------
    X : array of samples, one row per sample.
    w : stacked per-class weights for ``logPredictMulticlass``.
    y : true class indices, one per sample (any shape with one entry per
        sample, e.g. (n,) or (n, 1)).

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.
    """
    # Flatten both sides: comparing an (n, 1) prediction column against an
    # (n,) target vector would broadcast to an (n, n) matrix and silently
    # produce a meaningless score.
    predictions = np.ravel(logPredictMulticlass(X, w))
    y = np.ravel(y)
    if predictions.shape[0] != y.shape[0]:
        raise ValueError("got %d predictions for %d targets"
                         % (predictions.shape[0], y.shape[0]))
    return np.sum(predictions == y) / y.shape[0]

In [1041]:
# Load dataset
digits = load_digits()
digY = digits.target
# Flatten every 2-D image into a 1-D feature vector (one row per sample)
digX = digits.images.reshape(digits.images.shape[0], -1)
# Normalize each image by its own maximum pixel value.
# Done in one vectorized step that builds a new array: the earlier per-row
# loop recomputed the row maxima of the whole matrix on every iteration and
# wrote through the reshaped view into digits.images.
# NOTE(review): an all-zero image would divide by zero here -- confirm the
# dataset contains none.
digX = digX / np.amax(digX, axis=1, keepdims=True)

# Split training and test sets (fixed random_state so the split, and hence
# the reported accuracies, do not change from run to run)
digXTrain, digXTest, digYTrain, digYTest = \
    train_test_split(digX, digY, test_size=0.20, random_state=0)

# Turn the targets into column vectors (n, 1) so they line up with the
# (n, 1) output of the logistic model
digYTrain = np.expand_dims(digYTrain, axis=1)
digYTest = np.expand_dims(digYTest, axis=1)

In [1042]:
# Gradient Descent Training (one-vs-rest, L1-regularised)
ws, costs = trainGDMulticlass(
    digXTrain, digYTrain,
    lr=1, numIters=1000, logFreq=100,
    regularization='l1', lmda=0.1,
)

# Evaluation on the data the model was fit to, then on the held-out split
print("Accuracy (Training):", logMulticlassAccuracy(digXTrain, ws, digYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(digXTest, ws, digYTest))

iter: 0 cost: 8.94303458781355
iter: 100 cost: 0.08042876444810285
iter: 200 cost: 0.06197694728237085
iter: 300 cost: 0.05422466309170197
iter: 400 cost: 0.04970070866948016
iter: 500 cost: 0.04664201732163131
iter: 600 cost: 0.044384828955270905
iter: 700 cost: 0.04262385335638737
iter: 800 cost: 0.04119899910894402
iter: 900 cost: 0.040013617534999726
Accuracy (Training): 0.9742519137091162
Accuracy (Test): 0.9555555555555556


In [1039]:
# AdaGrad Training (one-vs-rest, L1-regularised)
ws, costs = trainAdaGradMulticlass(
    digXTrain, digYTrain,
    lr=1, numIters=1000, logFreq=100, epsilon=1e-8,
    regularization='l1', lmda=0.1,
)

# Evaluation on the data the model was fit to, then on the held-out split
print("Accuracy (Training):", logMulticlassAccuracy(digXTrain, ws, digYTrain))
print("Accuracy (Test):", logMulticlassAccuracy(digXTest, ws, digYTest))

iter: 0 cost: 8.86025922751747
iter: 100 cost: 0.054348073185216825
iter: 200 cost: 0.04413316403089415
iter: 300 cost: 0.03965000904061437
iter: 400 cost: 0.03695312932698065
iter: 500 cost: 0.03510365271266992
iter: 600 cost: 0.03373364361369241
iter: 700 cost: 0.032667493011929026
iter: 800 cost: 0.031812318658794844
iter: 900 cost: 0.03110669927362718
Accuracy (Training): 0.9832985386221295
Accuracy (Test): 0.9555555555555556


In [1110]:
# Exponential Multiplicative Weights
# Experts: `numExperts` weak binary classifiers for "digit is 1" vs. the rest
numExperts = 2
# Binary targets: 1 where the label is the digit 1, 0 everywhere else
ones = np.copy(digYTrain)
ones[ones != 1] = 0
experts = []
for i in range(numExperts):
    # Only 200 steps at a small learning rate (presumably to keep each
    # expert weak); logFreq=199 reports the first and last iteration.
    expert, _ = trainGD(digXTrain, ones, 0.01, 200, 199)
    experts.append(expert)
experts = np.array(experts)
# One weight per expert, all starting equal
w = np.ones((experts.shape[0], 1))
lr = 0.01

# The experts are never retrained and are always scored on the same data,
# so every expert's loss is identical in every round: evaluate it once
# instead of once per round.
losses = [logAvgCost(digXTrain, ones, expert) for expert in experts]
avgLoss = sum(losses) / experts.shape[0]
# Per-round multiplicative factor exp(-lr * loss) for each expert
decay = np.exp(-lr * np.array(losses)).reshape(-1, 1)

numIters = 1000
logFreq = 500
for i in range(numIters):
    w *= decay
    if i % logFreq == 0:
        print("Iteration:", i, "Cost:", avgLoss)


iter: 0 cost: 10.370376093231787
iter: 199 cost: 0.3799373796259797
iter: 0 cost: 8.133770727483085
iter: 199 cost: 0.3893224336801635
Iteration: 0 Cost: 0.38359718457608266
Iteration: 500 Cost: 0.38359718457608266
