<a href="https://colab.research.google.com/github/sverdoot/optimizer-SUG-torch/blob/master/IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from torchtext import data, datasets
from torchtext.vocab import GloVe

# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True, pad_token='<PAD_TOKEN>', eos_token='<EOS_TOKEN>', unk_token='<UNK_TOKEN>')
LABEL = data.Field(sequential=False)

# make splits for data
train_, test_ = datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary
TEXT.build_vocab(train_, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train_)

In [67]:
len(train_), len(test_)

(1148, 25000)

In [0]:
%matplotlib inline
import torch
from torch import nn
from torch import optim
from torch.autograd import Variable
import time
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
# make iterator for splits
train_iter, val_iter = data.BucketIterator.splits(
    (train_, test_), batch_size=4, device=device, sort_within_batch=True, sort_key=lambda x: len(x.text))

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/Optimization project")
os.getcwd()

file_path = "/content/drive/My Drive/Colab Notebooks/Optimization project/IMDB"
#directory = os.path.dirname(file_path)

try:
    os.stat(file_path)
except:
os.mkdir(file_path)    

In [0]:
import sug
from sug import SUG

## Data loading

In [0]:
a, b = next(iter(train_iter))

In [0]:
(inputs, lens), labels = a

In [0]:
inputs.size(), lens.size(), labels.size()

## Model

In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
 
class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
        super().__init__() 
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        
        self.linear1 = nn.Linear(2 * hidden_dim, 20)
        self.linear1.weight.data.fill_(0.01)
        self.linear2 = nn.Linear(20, 2)
        self.linear2.weight.data.fill_(0.01)
 
    def forward(self, seq, lens):
        embeds = self.embedding(seq)
        packed = pack_padded_sequence(embeds, lens, batch_first=True)
        hdn, _ = self.encoder(packed)
        hdn, _ = pad_packed_sequence(hdn, batch_first=True)
        output = nn.functional.max_pool1d(hdn, kernel_size=10)
        output = F.relu(self.linear1(hdn[:,1,:]))
        prob = F.log_softmax(self.linear2(output), -1)
        
        return prob

## Training

In [0]:
def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def model_step(model, optimizer, criterion, inputs, lens, labels):
    outputs = model(inputs, lens)
    loss = criterion(outputs, labels)

    if model.training:
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
    if optimizer.__class__.__name__ != 'SUG':
        optimizer.step()
    else:
        def closure():
            optimizer.zero_grad()
            upd_outputs = model(inputs, lens)
            upd_loss = criterion(upd_outputs, labels)
            upd_loss.backward()
            return upd_loss

        optimizer.step(loss, closure)

    return loss.item()

In [0]:
def train(model, trainloader, criterion, optimizer, path=None, n_epochs=2, validloader=None, eps=1e-5, print_every=1):
    tr_loss, val_loss, lips, times, grad, acc = ([] for i in range(6))
    start_time = time.time()
    model.to(device=device)
    print(len(list(trainloader)))
    for ep in range(n_epochs):
        model.train()
        i = 0
        for i, (data, b) in enumerate(trainloader):
            (inputs, lens), labels = data
            inputs, labels = Variable(inputs).to(device=device), Variable(labels).to(device=device)

            tr_loss.append(model_step(model, optimizer, criterion, inputs, lens, labels))      
                
            if optimizer.__class__.__name__ == 'SUG':
                lips.append(optimizer.get_lipsitz_const())
                grad.append(optimizer.get_sq_grad)
            if i % 10 == 0:
                print(tr_loss[-1], i)
                times.append(time_since(start_time))
                model.zero_grad()
                optimizer.zero_grad()
                states = {
                         'epoch': n_epochs,
                         'state_dict': model.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'tr_loss' : tr_loss,
                         'val_loss' : val_loss,
                         'lips' : lips,
                         'grad' : grad,
                         'times' : times
                         }     
                if path is not None:
                    torch.save(states, path)
        
        times.append(time_since(start_time))
        if ep % print_every == 0:
            print("Epoch {}, training loss {}, time passed {}".format(ep, sum(tr_loss[-i:]) / i, time_since(start_time)))

        #if validloader is None:
        #    continue
        #model.zero_grad()
        #model.eval()
        #j = 0
        #for j, (data, b) in enumerate(validloader):
        #    (inputs, lens), labels = data
        #    inputs, labels = inputs.to(device=device), labels.to(device=device)
        #    val_loss.append(model_step(model, optimizer, criterion, inputs, lens, labels))
        #if ep % print_every == 0:
        #    print("Validation loss {}".format(sum(val_loss[-j:]) / j))
        #
    return tr_loss, times, val_loss, lips, grad

In [0]:
def concat_states(state1, state2):
    states = {
            'epoch': state1['epoch'] + state2['epoch'],
            'state_dict': state2['state_dict'],
            'optimizer': state2['optimizer'],
            'tr_loss' : state1['tr_loss'] + state2['tr_loss'],
            'val_loss' : state1['val_loss'] + state2['val_loss'],
            'lips' : state1['lips'] + state2['lips'],
            'grad' : state1['grad'] + state2['grad'],
            #'times' : state1['times'] + list(map(lambda x: x + state1['times'][-1],state2['times']))
             'times' : state1['times'] + state2['times']
             }
    return states

In [0]:
print_every = 1
n_epochs = 1
tr_loss = {}
tr_loss['sgd'] = {}
val_loss = {}
val_loss['sgd'] = {}
lrs = [0.01, 0.005]
em_sz = 100
nh = 500
nl = 3
torch.manual_seed(999)

criterion = nn.CrossEntropyLoss()

In [120]:
len(list(train_iter))

287

In [0]:
val_iter_ = val_iter[:len(list(train_iter))]

In [0]:
for lr in lrs:
  model = SimpleLSTMBaseline(nh, emb_dim=em_sz)
  print("SGD  lr={}, momentum=0. :".format(lr))
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.)
  tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=val_iter)
  states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad,
            'times' : times
             }
  torch.save(states, './IMDB/LSTM_' + str(lr))

In [127]:
l_0 = 2
model = SimpleLSTMBaseline(nh, emb_dim=em_sz)
print("SUG  l_0={}, momentum=0. :".format(l_0))
optimizer = SUG(model.parameters(), l_0=l_0, momentum=0.)
tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad = train(model, train_iter, criterion, optimizer, path='./IMDB/LSTM_sug', n_epochs=n_epochs, print_every=print_every, validloader=val_iter)
states = {
          'epoch': n_epochs,
          'state_dict': model.state_dict(),
          'optimizer': optimizer.state_dict(),
          'tr_loss' : tr_loss['sgd'][lr],
          'val_loss' : val_loss['sgd'][lr],
          'lips' : lips,
          'grad' : grad,
          'times' : times
           }
torch.save(states, './IMDB/LSTM_sug')

SUG  l_0=2, momentum=0. :
287
0.7064298391342163 0
0.10949906706809998 10
0.03670758008956909 20
0.019243312999606133 30
0.010921110399067402 40
0.007866695523262024 50
0.005721421912312508 60
0.005002093501389027 70
0.0037729735486209393 80
0.0030055076349526644 90
0.0026158031541854143 100
0.0025622034445405006 110
0.00234255101531744 120
0.0017376202158629894 130
0.001502203056588769 140
0.0014089700998738408 150
0.001433861325494945 160
0.001311229425482452 170
0.0011016679927706718 180
0.00109789427369833 190
0.0009428689954802394 200
0.0008233338594436646 210
0.0009007734479382634 220
0.0007441784837283194 230
0.0006538216257467866 240
0.0006377806421369314 250
0.0005877303192391992 260
0.0006665069377049804 270
0.0005683102062903345 280
Epoch 0, training loss 0.014863820818384745, time passed 143m 17s


In [105]:
lr = 0.05
model = SimpleLSTMBaseline(nh, emb_dim=em_sz)
print("SGD  lr={}, momentum=0. :".format(lr))
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.)
tr_loss['sgd'][lr], times, val_loss['sgd'][lr], lips, grad = train(model, train_iter, criterion, optimizer, n_epochs=n_epochs, print_every=print_every, validloader=val_iter)
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad,
            'times' : times
             }
torch.save(states, './IMDB/LSTM_' + str(lr))

SGD  lr=0.05, momentum=0. :
287
0.5988515615463257 0
0.41938233375549316 10
0.3074619472026825 20
0.23368413746356964 30
0.18257474899291992 40
0.14501415193080902 50
0.11993587017059326 60
0.09823356568813324 70
0.08181726187467575 80
0.06902245432138443 90
0.05924888700246811 100
0.05311816185712814 110
0.04379794001579285 120
0.03996816277503967 130
0.03634902834892273 140
0.030864223837852478 150
0.027338199317455292 160
0.024336742237210274 170
0.022301625460386276 180
0.021165722981095314 190
0.019693579524755478 200
0.018488040193915367 210
0.016197815537452698 220
0.01575876586139202 230
0.013274822384119034 240
0.013141696341335773 250
0.012804633937776089 260
0.011515684425830841 270
0.011610937304794788 280
Epoch 0, training loss 0.08387096615026256, time passed 71m 45s
