In [None]:
import requests
# Download the a9a training file and its companion "a9a.t" file (used below as
# the validation split) from the LIBSVM dataset mirror.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an immediate exception instead of
# letting an error page reach the svmlight parser in the next cell.
train_set = requests.get("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a", timeout=60)
train_set.raise_for_status()
val_set = requests.get("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t", timeout=60)
val_set.raise_for_status()

In [None]:
from io import BytesIO
from sklearn.datasets import load_svmlight_file

# Parse each downloaded payload from svmlight format and immediately densify
# it. n_features is pinned to 123 for both splits so the two matrices end up
# with the same number of columns.

# Training split.
x_train, y_train = load_svmlight_file(
    BytesIO(train_set.content),
    n_features=123,
)
x_train = x_train.toarray()

# Validation split.
x_val, y_val = load_svmlight_file(
    BytesIO(val_set.content),
    n_features=123,
)
x_val = x_val.toarray()

In [None]:
import numpy
from collections import defaultdict

# Append a constant column of ones to each feature matrix (so the last
# parameter acts as a bias term) and turn the label vectors into column
# vectors of shape (n_samples, 1).
# NOTE: this cell reassigns x_train / x_val from themselves, so running it a
# second time appends another column of ones.

# Training split.
n_samples_train, n_features_train = x_train.shape
train_bias = numpy.ones(shape=(n_samples_train, 1))
x_train = numpy.concatenate([x_train, train_bias], axis=1)
y_train = y_train.reshape(n_samples_train, 1)
del train_bias

# Validation split.
n_samples_val, n_features_val = x_val.shape
val_bias = numpy.ones(shape=(n_samples_val, 1))
x_val = numpy.concatenate([x_val, val_bias], axis=1)
y_val = y_val.reshape(n_samples_val, 1)
del val_bias

In [None]:
class Model(object):
    """Minimal base class for a model that is driven by an external optimizer.

    Attributes:
        params: column vector of shape (n_features, 1) with the parameters.
        diffs: column vector of the same shape holding the latest gradient.
        recorder: maps a key to the list of values logged under that key.

    Every hook except ``validate`` is a no-op here and is meant to be
    overridden by subclasses.
    """

    def __init__(self, n_features):
        """Allocate randomly initialised parameters and a zero gradient."""
        column_shape = (n_features, 1)
        # Parameters come from numpy.random.random; the gradient starts at 0.
        self.params = numpy.random.random(size=column_shape)
        self.diffs = numpy.zeros(column_shape)
        self.recorder = defaultdict(list)

    def train(self, x, y):
        """Hook for receiving training data; does nothing in the base class."""
        return None

    def validate(self, x, y):
        """Forward ``x`` and ``y`` to ``__loss__`` under the key "validation"."""
        self.__loss__(x, y, "validation")

    def predict(self, x):
        """Hook for producing predictions; returns None in the base class."""
        return None

    def __calculate_gradient__(self, params=None):
        """Hook for refreshing ``diffs``; does nothing in the base class."""
        return None

    def __loss__(self, x, y, key):
        """Hook for logging a loss under ``key``; does nothing in the base class."""
        return None


In [None]:
class LogisticRegression(Model):
    """Binary logistic regression with labels in {0, 1}.

    The model stores the most recent training batch handed to ``train`` and
    computes its gradient on that batch when an optimizer asks for it.
    """

    def __init__(self, n_features):
        """Create the parameters; no training batch is attached yet."""
        super(LogisticRegression, self).__init__(n_features=n_features)
        self.x_train = None
        self.y_train = None

    @staticmethod
    def _sigmoid(logits):
        """Numerically stable logistic function.

        Uses sigmoid(z) = exp(-log(1 + exp(-z))). ``logaddexp`` evaluates the
        inner log without overflowing for large ``|z|``, unlike the naive
        ``1 / (1 + exp(-z))``.
        """
        return numpy.exp(-numpy.logaddexp(0, -logits))

    def train(self, x, y):
        """Remember the batch ``(x, y)`` used by the next gradient computation."""
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        """Return 1 where the logit ``x @ params`` is positive, else 0."""
        return numpy.where(numpy.dot(x, self.params) > 0, 1, 0)

    def __calculate_gradient__(self, params=None):
        """Store the summed cross-entropy gradient on the current batch in ``diffs``.

        Args:
            params: parameters at which to evaluate the gradient; defaults to
                the model's own ``params`` (used by look-ahead optimizers).
        """
        if params is None:
            params = self.params
        y_hat = self._sigmoid(numpy.dot(self.x_train, params))
        # Sum (not mean) over the batch, as in the original implementation;
        # the optimizers' step sizes are chosen for this scaling.
        self.diffs = numpy.dot(self.x_train.transpose(), (y_hat - self.y_train))

    def __loss__(self, x, y, key):
        """Append the mean cross-entropy of ``(x, y)`` to ``recorder[key]``.

        The loss is written as ``log(1 + exp(z)) - y * z``, which equals
        ``-(y * log(s) + (1 - y) * log(1 - s))`` for ``s = sigmoid(z)`` but
        never evaluates ``log(0)``, so saturated logits no longer give
        ``inf`` / ``nan``.
        """
        logits = numpy.dot(x, self.params)
        loss = numpy.average(numpy.logaddexp(0, logits) - y * logits)
        self.recorder[key].append(loss)

In [None]:
class Optimizer(object):
    """Base class for the optimizers: binds a model and a plot colour.

    Attributes:
        model: the model whose ``params`` a subclass updates in ``step``.
        color: colour for this optimizer's curve; ``None`` until a subclass
            sets it.
    """

    def __init__(self, model):
        """Attach ``model``; the plot colour is left unset."""
        self.color = None
        self.model = model

    def step(self):
        """Perform one update; a no-op in the base class."""
        return None

In [None]:
class SGD(Optimizer):
    """Gradient descent on the model's current batch, optionally with momentum."""

    def __init__(self, model, learning_rate, momentum=None):
        """Store the hyper-parameters.

        Args:
            model: model to optimise.
            learning_rate: step size applied to the gradient.
            momentum: velocity decay factor, or ``None`` for plain SGD.
        """
        super(SGD, self).__init__(model=model)
        self.color = "r"
        self.learning_rate = learning_rate
        self.momentum = momentum
        if momentum is None:
            # Plain SGD keeps no velocity buffer.
            return
        self.v = numpy.zeros_like(self.model.diffs)

    def step(self):
        """Refresh the model's gradient and update its parameters in place."""
        model = self.model
        model.__calculate_gradient__()
        scaled_grad = self.learning_rate * model.diffs
        if self.momentum is None:
            model.params -= scaled_grad
            return
        # Momentum variant: decay the old velocity, then add the new step.
        self.v = self.momentum * self.v + scaled_grad
        model.params -= self.v

In [None]:
class NAG(Optimizer):
    """Nesterov accelerated gradient: the gradient is taken at a look-ahead point."""

    def __init__(self, model, learning_rate, momentum):
        """Store the step size and momentum and start from zero velocity."""
        super(NAG, self).__init__(model=model)
        self.color = "y"
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.v = numpy.zeros_like(self.model.diffs)

    def step(self):
        """Evaluate the gradient at the look-ahead point, then update in place."""
        model = self.model
        # Where the parameters would land if only the decayed velocity applied.
        lookahead = model.params - self.momentum * self.v
        model.__calculate_gradient__(params=lookahead)
        self.v = self.momentum * self.v + self.learning_rate * model.diffs
        model.params -= self.v
    

In [None]:
class Adam(Optimizer):
    """Adam optimizer with bias-corrected first and second moment estimates."""

    def __init__(self, model, beta, gamma, eta):
        """Store the hyper-parameters and zero the moment buffers.

        Args:
            model: model to optimise.
            beta: decay rate of the first-moment (mean) estimate ``m``.
            gamma: decay rate of the second-moment estimate ``G``.
            eta: base step size.
        """
        super(Adam, self).__init__(model=model)
        self.color = "g"
        self.beta = beta
        self.gamma = gamma
        self.eta = eta
        self.m = numpy.zeros_like(self.model.diffs)
        self.G = numpy.zeros_like(self.model.diffs)
        self.epsilon = 1e-8
        # Number of updates performed so far; drives the bias correction.
        self.t = 0

    def step(self):
        """Refresh the model's gradient and apply one Adam update in place."""
        self.model.__calculate_gradient__()
        self.t += 1
        grad = self.model.diffs
        self.m = self.beta * self.m + (1 - self.beta) * grad
        self.G = self.gamma * self.G + (1 - self.gamma) * grad * grad
        # Both moments start at zero and are biased towards it early on. The
        # correction must use the decay rates raised to the step count; the
        # previous constant factor sqrt(1 - gamma) / (1 - beta) was only
        # correct on the very first step.
        alpha = self.eta * numpy.sqrt(1 - self.gamma ** self.t) / (1 - self.beta ** self.t)
        self.model.params -= alpha * self.m / numpy.sqrt(self.G + self.epsilon)

In [None]:
class AdaDelta(Optimizer):
    """AdaDelta: step size derived from running averages, no learning rate."""

    def __init__(self, model, gamma):
        """Store the decay rate ``gamma`` and zero all running buffers."""
        super(AdaDelta, self).__init__(model=model)
        self.color = "b"
        self.gamma = gamma
        gradient_like = self.model.diffs
        self.G = numpy.zeros_like(gradient_like)            # running avg of grad^2
        self.delta = numpy.zeros_like(gradient_like)        # running avg of update^2
        self.delta_theta = numpy.zeros_like(gradient_like)  # last applied update
        self.epsilon = 1e-4

    def step(self):
        """Refresh the model's gradient and apply one AdaDelta update in place."""
        model = self.model
        model.__calculate_gradient__()
        grad = model.diffs
        keep = self.gamma
        self.G = keep * self.G + (1 - keep) * grad * grad
        # Ratio of the RMS of past updates to the RMS of past gradients.
        rms_ratio = numpy.sqrt(self.delta + self.epsilon) / numpy.sqrt(self.G + self.epsilon)
        self.delta_theta = -rms_ratio * grad
        model.params += self.delta_theta
        self.delta = keep * self.delta + (1 - keep) * self.delta_theta * self.delta_theta


In [None]:
class RMSProP(Optimizer):
    """RMSProp: gradient scaled by a running average of its squared magnitude."""

    def __init__(self, model, leaning_rate, weight_decay):
        """Store the hyper-parameters and zero the running average.

        Args:
            model: model to optimise.
            leaning_rate: step size (the misspelt name is kept because
                existing callers pass it by keyword).
            weight_decay: decay rate of the running average of squared
                gradients ``G``.
        """
        super(RMSProP, self).__init__(model=model)
        # Must be set AFTER the base initialiser, which resets color to None;
        # setting it first silently discarded the colour.
        self.color = "c"
        self.G = numpy.zeros_like(self.model.diffs)
        self.learning_rate = leaning_rate
        self.weight_decay = weight_decay
        self.epsilon = 1e-8

    def step(self):
        """Refresh the model's gradient and apply one RMSProp update in place."""
        self.model.__calculate_gradient__()
        grad = self.model.diffs
        self.G = self.weight_decay * self.G + (1 - self.weight_decay) * grad * grad
        self.model.params -= self.learning_rate / numpy.sqrt(self.G + self.epsilon) * grad


In [None]:
# Recode the labels: every -1 becomes 0, all other values are kept, so the
# targets match the {0, 1} outputs of LogisticRegression.predict.
y_train = numpy.where(y_train != -1, y_train, 0)
y_val = numpy.where(y_val != -1, y_val, 0)

# One optimizer per algorithm, each wrapping its own freshly initialised
# model with 123 input features plus the appended bias column.
optimizers = [
    SGD(
        model=LogisticRegression(n_features=123 + 1),
        learning_rate=1e-5,
        momentum=0.5,
    ),
    NAG(
        model=LogisticRegression(n_features=123 + 1),
        learning_rate=1e-5,
        momentum=0.5,
    ),
    Adam(
        model=LogisticRegression(n_features=123 + 1),
        beta=0.9,
        gamma=0.999,
        eta=0.1,
    ),
    AdaDelta(
        model=LogisticRegression(n_features=123 + 1),
        gamma=0.95,
    ),
    RMSProP(
        model=LogisticRegression(n_features=123 + 1),
        leaning_rate=0.1,
        weight_decay=0.9,
    ),
]

In [None]:
# Training loop. Each "epoch" here is a single update on one random mini-batch
# (indices drawn with replacement), not a full pass over the training set.
max_epoch = 100
batch_size = 10000
for epoch in range(max_epoch):
    # Draw the batch once per iteration so every optimizer sees the same data.
    indexes = numpy.random.randint(0, n_samples_train, size=batch_size)
    for optimizer in optimizers:
        optimizer.model.train(x_train[indexes], y_train[indexes])
        optimizer.step()
        # Log the loss on the full validation set after every update.
        optimizer.model.validate(x_val, y_val)

from sklearn.metrics import classification_report

print("\t\t\t"+optimizers[0].model.__class__.__name__)
print("-"*60)
for optimizer in optimizers:
    print("\t\t\t"+optimizer.__class__.__name__)
    print("-"*60)
    # target_names follow the sorted label order [0, 1]. Label 0 is the
    # original -1 (negative) class and label 1 the positive class; the names
    # were previously listed in the reverse order, mislabelling both rows.
    print(classification_report(y_val,optimizer.model.predict(x_val),target_names=["negative", "positive"],digits=3))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the validation loss recorded after every update, one curve per
# optimizer, drawn in the colour each optimizer declares.
plt.figure(figsize=(16,9))
plt.xlabel("epoch")
plt.ylabel("loss")
# Title spelling corrected ("Stocastic" -> "Stochastic").
plt.title("Logistic Regression and Stochastic Gradient Descent")
for optimizer in optimizers:
    plt.plot(optimizer.model.recorder["validation"], color=optimizer.color, label=optimizer.__class__.__name__)
plt.legend()
plt.show()