In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.metrics import log_loss, accuracy_score
from sklearn.utils.extmath import safe_sparse_dot

%matplotlib inline

In [3]:
import optim.gradient_descent_optimizers

In [4]:
from logistic_regression import LogisticRegressionWithSGD
from optim.gradient_descent_optimizers import SGDOptimizer 
from utils import read_sparse_h5

from optim.momentum import SGDWithMomentum
from optim.adaptive import RMSProp, AdaGrad

In [5]:
lr = 0.01
momentum = 0.9
decay = 1e-3
num_epochs = 100
batchsize = 8192
l2_reg = 1e-3
nesterov_opt = SGDWithMomentum(lr=lr, momentum=momentum, decay=decay, nesterov=True, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
momentum_opt = SGDWithMomentum(lr=lr, momentum=momentum, decay=decay, nesterov=False, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
rms_opt = RMSProp(lr=lr, decay=decay, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)

In [25]:
import os
import h5py

parent_dir = "/Users/timwee/projects/nlp/courses/harvard_nlp/HW1"
f = h5py.File(os.path.join(parent_dir, "SST1.hdf5"), 'r')

In [26]:
#f.close()

In [27]:
for k in f.keys():
    print(k)

nclasses
nfeatures
test_input_data
test_input_indices
test_input_indptr
test_input_shape
train_input_data
train_input_indices
train_input_indptr
train_input_shape
train_output
valid_input_data
valid_input_indices
valid_input_indptr
valid_input_shape
valid_output


In [28]:
X = read_sparse_h5(f, "train_input")
y = f["train_output"]
valid_X = read_sparse_h5(f, "valid_input")
valid_y = f["valid_output"]
num_features = f["nfeatures"][0]
num_classes = f["nclasses"][0] + 1

In [29]:
num_features, num_classes, np.max(y), np.min(y)

(17837, 6, 5, 1)

In [30]:
np.bincount(y), np.bincount(valid_y)

(array([    0,  7189, 27513, 79856, 33016,  9243]),
 array([  0, 139, 289, 229, 279, 165]))

In [31]:
X.shape, y.shape

((156817, 17837), (156817,))

### LR

In [32]:
clf = LogisticRegression(penalty='l2', fit_intercept=True, solver='liblinear', max_iter=300, multi_class='ovr')
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=300, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
lr = 1.0
momentum = 0.9
decay = 5e-5
num_epochs = 200
batchsize = 8192
l2_reg = 1e-4

In [35]:
momentum_opt = SGDWithMomentum(lr=lr, momentum=momentum, decay=1e-6, nesterov=False, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_moment = LogisticRegressionWithSGD()
lr_clf_moment.train_with_opt(momentum_opt, X, y, num_features, num_classes)

iteration 0, loss 1.867750
iteration 10, loss 1.277737
iteration 20, loss 1.196278
iteration 30, loss 1.162929
iteration 40, loss 1.146505
iteration 50, loss 1.137432
iteration 60, loss 1.132003
iteration 70, loss 1.128567
iteration 80, loss 1.126303
iteration 90, loss 1.124767
iteration 100, loss 1.123704
iteration 110, loss 1.122954
iteration 120, loss 1.122418
iteration 130, loss 1.122029
iteration 140, loss 1.121742
iteration 150, loss 1.121524
iteration 160, loss 1.121354
iteration 170, loss 1.121217
iteration 180, loss 1.121101
iteration 190, loss 1.120998


1.1209128090703306

In [36]:
nesterov_opt = SGDWithMomentum(lr=lr, momentum=momentum, decay=1e-6, nesterov=True, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_nest = LogisticRegressionWithSGD()
lr_clf_nest.train_with_opt(nesterov_opt, X, y, num_features, num_classes)

iteration 0, loss 1.758093
iteration 10, loss 1.277470
iteration 20, loss 1.189352
iteration 30, loss 1.153808
iteration 40, loss 1.136258
iteration 50, loss 1.126665
iteration 60, loss 1.121075
iteration 70, loss 1.117670
iteration 80, loss 1.115525
iteration 90, loss 1.114139
iteration 100, loss 1.113226
iteration 110, loss 1.112614
iteration 120, loss 1.112198
iteration 130, loss 1.111911
iteration 140, loss 1.111709
iteration 150, loss 1.111565
iteration 160, loss 1.111460
iteration 170, loss 1.111380
iteration 180, loss 1.111317
iteration 190, loss 1.111265


1.1112235275047593

In [37]:
adagrad_opt = AdaGrad(lr=1.0, decay=1e-4, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_adagrad = LogisticRegressionWithSGD()
lr_clf_adagrad.train_with_opt(adagrad_opt, X, y, num_features, num_classes)

iteration 0, loss 1.924987
iteration 10, loss 1.225160
iteration 20, loss 1.183539
iteration 30, loss 1.166655
iteration 40, loss 1.155760
iteration 50, loss 1.147670
iteration 60, loss 1.141175
iteration 70, loss 1.135705
iteration 80, loss 1.130978
iteration 90, loss 1.126832
iteration 100, loss 1.123171
iteration 110, loss 1.119929
iteration 120, loss 1.117062
iteration 130, loss 1.114534
iteration 140, loss 1.112317
iteration 150, loss 1.110382
iteration 160, loss 1.108704
iteration 170, loss 1.107257
iteration 180, loss 1.106018
iteration 190, loss 1.104965


1.1041565706330143

In [38]:
rms_opt = RMSProp(lr=0.5, decay=1e-4, rho=0.9, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_rms = LogisticRegressionWithSGD()
lr_clf_rms.train_with_opt(rms_opt, X, y, num_features, num_classes)

iteration 0, loss 2.407793
iteration 10, loss 2.172444
iteration 20, loss 2.239467
iteration 30, loss 2.178407
iteration 40, loss 2.140755
iteration 50, loss 2.058057
iteration 60, loss 1.980072
iteration 70, loss 1.957179
iteration 80, loss 1.859632
iteration 90, loss 1.890551
iteration 100, loss 1.736542
iteration 110, loss 1.642587
iteration 120, loss 1.583533
iteration 130, loss 1.528328
iteration 140, loss 1.476588
iteration 150, loss 1.429694
iteration 160, loss 1.388407
iteration 170, loss 1.355400
iteration 180, loss 1.348745
iteration 190, loss 1.303564


1.275336295690686

In [40]:
from optim.adaptive import AdaDelta
# 1.2372139256997967
adadelta_opt = AdaDelta(lr=1.0, decay=0., rho=0.95, num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_adadelta = LogisticRegressionWithSGD()
lr_clf_adadelta.train_with_opt(adadelta_opt, X, y, num_features, num_classes)

iteration 0, loss 1.783147
iteration 10, loss 1.718436
iteration 20, loss 1.678460
iteration 30, loss 1.644345
iteration 40, loss 1.613728
iteration 50, loss 1.585991
iteration 60, loss 1.560825
iteration 70, loss 1.538003
iteration 80, loss 1.517297
iteration 90, loss 1.498474
iteration 100, loss 1.481311
iteration 110, loss 1.465602
iteration 120, loss 1.451166
iteration 130, loss 1.437845
iteration 140, loss 1.425504
iteration 150, loss 1.414029
iteration 160, loss 1.403320
iteration 170, loss 1.393296
iteration 180, loss 1.383886
iteration 190, loss 1.375029


1.3674878700555071

In [41]:
from optim.adaptive import Adam

adam_opt = Adam(num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_adam = LogisticRegressionWithSGD()
lr_clf_adam.train_with_opt(adam_opt, X, y, num_features, num_classes)

iteration 0, loss 1.770095
iteration 10, loss 1.667895
iteration 20, loss 1.623513
iteration 30, loss 1.586685
iteration 40, loss 1.553058
iteration 50, loss 1.521557
iteration 60, loss 1.491873
iteration 70, loss 1.463917
iteration 80, loss 1.437663
iteration 90, loss 1.413089
iteration 100, loss 1.390165
iteration 110, loss 1.368842
iteration 120, loss 1.349059
iteration 130, loss 1.330740
iteration 140, loss 1.313801
iteration 150, loss 1.298153
iteration 160, loss 1.283707
iteration 170, loss 1.270373
iteration 180, loss 1.258066
iteration 190, loss 1.246703


1.2372220165377927

In [42]:
from optim.adaptive import Adamax

adamax_opt = Adamax(num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_adamax = LogisticRegressionWithSGD()
lr_clf_adamax.train_with_opt(adamax_opt, X, y, num_features, num_classes)

iteration 0, loss 1.739713
iteration 10, loss 1.598652
iteration 20, loss 1.548668
iteration 30, loss 1.510420
iteration 40, loss 1.477787
iteration 50, loss 1.449104
iteration 60, loss 1.423650
iteration 70, loss 1.400934
iteration 80, loss 1.380566
iteration 90, loss 1.362231
iteration 100, loss 1.345645
iteration 110, loss 1.330576
iteration 120, loss 1.316849
iteration 130, loss 1.304305
iteration 140, loss 1.292810
iteration 150, loss 1.282248
iteration 160, loss 1.272523
iteration 170, loss 1.263553
iteration 180, loss 1.255262
iteration 190, loss 1.247589


1.2411628940643151

In [43]:
from optim.adaptive import NAdam

nadam_opt = NAdam(num_epochs=num_epochs, batchsize=batchsize, l2_reg=l2_reg)
lr_clf_nadam = LogisticRegressionWithSGD()
lr_clf_nadam.train_with_opt(nadam_opt, X, y, num_features, num_classes)

iteration 0, loss 1.770167
iteration 10, loss 1.623474
iteration 20, loss 1.526955
iteration 30, loss 1.452125
iteration 40, loss 1.392531
iteration 50, loss 1.344565
iteration 60, loss 1.305654
iteration 70, loss 1.273845
iteration 80, loss 1.247647
iteration 90, loss 1.225917
iteration 100, loss 1.207783
iteration 110, loss 1.192566
iteration 120, loss 1.179738
iteration 130, loss 1.168884
iteration 140, loss 1.159668
iteration 150, loss 1.151822
iteration 160, loss 1.145127
iteration 170, loss 1.139404
iteration 180, loss 1.134505
iteration 190, loss 1.130308


1.1270445312058488

### results

### sklearn

In [44]:
sklearn_probs = clf.predict_proba(X)
log_loss(y, sklearn_probs), accuracy_score(y, np.argmax(sklearn_probs, axis=1))

(0.75384940270759671, 0.13058533194742916)

In [45]:
sk_valid_probs = clf.predict_proba(valid_X)
log_loss(valid_y, sk_valid_probs), accuracy_score(valid_y, np.argmax(sk_valid_probs, axis=1))

(1.575677985478658, 0.22161671207992734)

### no momentum sgd

In [46]:
probs = lr_clf.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.48937296339044872

In [47]:
valid_probs = lr_clf.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.30881017257039056

### momentum sgd

In [48]:
probs = lr_clf_moment.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.5153777970500647

In [49]:
valid_probs = lr_clf_moment.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.36148955495004542

### nesterov sgd

In [50]:
probs = lr_clf_nest.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.51787752603352954

In [51]:
valid_probs = lr_clf_nest.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.35513169845594916

### rms prop

In [52]:
probs = lr_clf_rms.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.53857681246293454

In [53]:
valid_probs = lr_clf_rms.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.34604904632152589

### adagrad

In [54]:
probs = lr_clf_adagrad.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.58319569944585092

In [55]:
valid_probs = lr_clf_adagrad.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.36603088101725706

### adadelta

In [56]:
probs = lr_clf_adadelta.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.54448178450040496

In [57]:
valid_probs = lr_clf_adadelta.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.35149863760217986

### adam

In [58]:
probs = lr_clf_adam.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.5700529917037056

In [59]:
valid_probs = lr_clf_adam.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.36512261580381472

### adamax

In [60]:
probs = lr_clf_adamax.predict_proba(X)
accuracy_score(y, np.argmax(probs, axis=1))

0.57445302486337579

In [61]:
valid_probs = lr_clf_adamax.predict_proba(valid_X)
accuracy_score(valid_y, np.argmax(valid_probs, axis=1))

0.36512261580381472

### nadam