#Setup

In [1]:
!pip install pkbar



In [2]:
from google.colab import drive
from google.colab import files
import sys
import time

drive.mount('/content/gdrive/', force_remount=True)
root_dir = "/content/gdrive/My Drive/"
base_dir = root_dir + 'Colab Notebooks/MS Thesis/PSGD Paper/'
results_dir = base_dir + 'CNN/results_fashion/'
logs_dir = base_dir + 'log'
sys.path.append(base_dir)
import preconditioned_stochastic_gradient_descent as psgd 


Mounted at /content/gdrive/


In [3]:
import matplotlib.pyplot as plt
import torch
from torch.autograd import grad
import torch.nn.functional as F
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import time
import tqdm
import pkbar
import math

from tabulate import tabulate
import scipy.io
from sklearn import metrics
import plotly.express as px
from torchsummary import summary


# Functions

In [4]:
def plot_loss_metrics(xaxis,yaxis,title, x_label,y_label):
 
  fig = go.Figure()
  i = 0
  if(xaxis != None):
    for opt in opts:
      fig.add_trace(go.Scatter(x = xaxis[opt], y=yaxis[opt], name = opt, mode='lines', line = dict(color = colors[i])))
      i = i + 1
  else:
    for opt in opts:
      fig.add_trace(go.Scatter(y=yaxis[opt], name = opt, mode='lines', line = dict(color = colors[i])))
      i = i + 1

  fig.update_layout(title=title, xaxis_title=x_label, yaxis_title=y_label, yaxis_type="log")
  fig.show()
  fig.write_html(results_dir + title + ".html")

def plot_acc_metrics(xaxis,yaxis,title, x_label,y_label):
 
  fig = go.Figure()
  i = 0
  if(xaxis != None):
    for opt in opts:
      fig.add_trace(go.Scatter(x = xaxis[opt], y=yaxis[opt], name = opt, mode='lines', line = dict(color = colors[i])))
      i = i + 1
  else:
    for opt in opts:
      fig.add_trace(go.Scatter(y=yaxis[opt], name = opt, mode='lines', line = dict(color = colors[i])))
      i = i + 1

  fig.update_layout(title=title, xaxis_title=x_label, yaxis_title=y_label, yaxis=dict(range=[0.75, 1]))
  fig.show()
  fig.write_html(results_dir + title + ".html")



In [5]:
np.random.seed(0)

# Parameter Settings
BATCH_SIZE = 64
test_BATCH_SIZE = 1000
EPOCHS = 20
GAP = 100

# Data Download

In [6]:
# train_loader = torch.utils.data.DataLoader(
#         datasets.FashionMNIST('./data', train=True, download=True,           
#                        transform=transforms.Compose([                       
#                                transforms.ToTensor()])),    
#                         batch_size=BATCH_SIZE, shuffle=True, num_workers = 4, pin_memory = True)
# test_loader = torch.utils.data.DataLoader(    
#         datasets.FashionMNIST('./data', train=False, transform=transforms.Compose([
#                        transforms.ToTensor()])),    
#                         batch_size=test_BATCH_SIZE, shuffle=True, num_workers=4, pin_memory = True)
train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('./data', train=True, download=True,           
                       transform=transforms.Compose([                       
                               transforms.ToTensor()])),    
                        batch_size=BATCH_SIZE, shuffle=True, num_workers = 4, pin_memory = True)
test_loader = torch.utils.data.DataLoader(    
        datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
                       transforms.ToTensor()])),    
                        batch_size=test_BATCH_SIZE, shuffle=True, num_workers=4, pin_memory = True)

Files already downloaded and verified



This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.



In [7]:
n_batches = np.ceil(len(train_loader.dataset)/BATCH_SIZE)
n_test_batches = np.ceil(len(test_loader.dataset)/test_BATCH_SIZE)

In [8]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc. 
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")
torch.cuda.device_count()
torch.cuda.get_device_name(0)

Running on the GPU


'Tesla T4'

# DPSGD-M Class

In [9]:
import torch
import torch.nn.functional as F

from torch.optim.optimizer import Optimizer
from preconds import *
from torch.autograd import grad

class DPSGD_M(Optimizer):

    def __init__(self, model, lr, criterion, grad_clip = 1e9, damping = 2, tau = 1, T1 = 1, T2 = 5, omega = 0, eta = 1e-5, beta = 0):
        """ DPSGD_M Preconditionner for Linear, Conv2d layers, RNN, LSTM Layers.
        Computes the K-FAC of the second moment of the gradients.
        Args:
            model (torch.nn.Module): Model to precondition.
            lr: learning rate
            damping: 0 --> No damping
                     1 --> 1st approach
                     2 --> 2nd approach
            tau: Tikhonov coefficient initialization (1 is recommeneded)
            T1: iterations after which to update preconditioner
            T2: iterations after which to update tau
            beta: Weight for Polyak Averaging of preconditioner from two frequent iterations
            grad_clip: to clip preconditioned gradients.
                       The recommended way is to calculate from model parameters as = 0.1*sum(W.numel() for W in Ws)**0.5
            eta: a very small value added to tau calculation
            criterion: loss function nn Loss function
            omega: factor by which tau is scaled after T2 iterations
        """
        self.model = model
        self.lr = lr
        self.damping = damping
        self.tau = tau
        self.T1 = T1
        self.T2 = T2
        self.beta = beta
        if omega == 0:
            self.omega = (19/20)**T2
        else:
            self.omega = omega
        self.grad_clip = grad_clip
        self.criterion = criterion
        self.eta = eta

        self.params = []
        self._iteration_counter = 0
        self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


        for mod in model.modules():
            mod_class = mod.__class__.__name__
            if mod_class in ['Linear', 'Conv2d']:
                params = [mod.weight]
                if mod.bias is not None:
                    params.append(mod.bias)
                d = {'params': params, 'mod': mod, 'layer_type': mod_class}
                self.params.append(d)

        super(DPSGD_M, self).__init__(self.params, {})

    def step(self, loss, data, targets):
        
        # saving gradients
        grad_norm = 0
        grads = [p.grad.data for group in self.param_groups for p in group['params']]
        
        for group in self.param_groups:
            # Getting parameters
            if len(group['params']) == 2:
                weight, bias = group['params']
            else:
                weight = group['params'][0]
                bias = None
            state = self.state[weight]
            
            # Update preconditioner
            if self._iteration_counter % self.T1 == 0:
                self._update_preconditioner(weight, bias, state, group)

            # Preconditioning of gradients
            gw, gb = self._precond(weight, bias, group, state)

            # Updating gradients
            weight.grad.data = gw
            grad_norm += (weight.grad * weight.grad).sum()
            
            if bias is not None:
                bias.grad.data = gb
                grad_norm += (bias.grad * bias.grad).sum()
                

        # scaling gradients
        pre_grads = [p.grad.data for group in self.param_groups for p in group['params']]
        step_adjust = min(self.grad_clip/(grad_norm + 1.2e-38), 1.0)
        step_size = self.lr*step_adjust
        for group in self.param_groups:
            for param in group['params']:
                # param.grad.data *= step_adjust
                param.data -= step_size * param.grad.data
        
        # update tau
        if self.damping != 0 and self._iteration_counter % self.T2 == 0 and self.tau > 1e-10:
            self.update_tau(loss, data, targets, grads, pre_grads, step_size)

        self._iteration_counter += 1


    def _update_preconditioner(self, weight, bias, state, group):
        """update preconditioner"""
        s = weight.grad.shape
        
        dw = torch.randn(weight.shape).to(self.device)
        (dg,) = grad(weight.grad, weight, dw, retain_graph=True)
        
        if group['layer_type'] == 'Conv2d':
            dw = dw.contiguous().view(s[0], s[1]*s[2]*s[3])
            dg = dg.contiguous().view(s[0], s[1]*s[2]*s[3])

        if bias is not None:
            dwb = torch.randn(bias.shape).to(self.device)
            (dgb,) = grad(bias.grad, bias, dwb, retain_graph=True)

            dw = torch.cat([dw, dwb.view(dwb.shape[0], 1)], dim=1)
            dg = torch.cat([dg, dgb.view(dgb.shape[0], 1)], dim=1)
        
        if self._iteration_counter == 0:
            # initialize ql and qr for the current parameter
            state['ql'] = torch.eye(dw.shape[0]).to(self.device)
            state['qr'] = torch.eye(dw.shape[1]).to(self.device)

            if self.beta > 0:
                state['pl'] = torch.zeros((dw.shape[0], dw.shape[0])).to(self.device)
                state['pr'] = torch.zeros((dw.shape[1], dw.shape[1])).to(self.device)

        state['ql'], state['qr'] = update_precond_kron(state['ql'], state['qr'], dw, dg)
        

    def _precond(self, weight, bias, group, state):
        """Applies preconditioning."""
        
        
        g = weight.grad.data
        s = g.shape

        if group['layer_type'] == 'Conv2d':
            g = g.contiguous().view(s[0], s[1]*s[2]*s[3])
        if bias is not None:
            gb = bias.grad.data
            g = torch.cat([g, gb.view(gb.shape[0], 1)], dim=1)

        # preconditioned gradients
        Ql = state['ql']
        Qr = state['qr']

        if self.beta == 0:
            if self.damping == 0:
                g = precond_grad_kron(Ql, Qr, g)
            elif self.damping == 1:
                g = precond_grad_kron(Ql, Qr, g) +  ((self.tau+self.eta)**0.5)*g
            else:
                g = precond_grad_kron_damped(Ql, Qr, g, self.tau, self.eta)
        else:
            Pl = state['pl']
            Pr = state['pr']
            if self.damping == 0:
                state['pl'], state['pr'], pl, pr = precond_kron_momentum(Ql, Qr, Pl, Pr, self.beta)
                g = pl.mm(g).mm(pr)
            elif self.damping == 1:
                state['pl'], state['pr'], pl, pr = precond_kron_momentum(Ql, Qr, Pl, Pr, self.beta)
                g = pl.mm(g).mm(pr) +  ((self.tau+self.eta)**0.5)*g
            else:
                state['pl'], state['pr'], pl, pr = precond_kron_damped_momentum(Ql, Qr, Pl, Pr, self.beta, self.eta, self.tau)
                g = pl.mm(g).mm(pr)
            
        if bias is not None:
            gb = g[:, -1].contiguous().view(*bias.shape)
            g = g[:, :-1]
        else:
            gb = None
        g = g.contiguous().view(*s)
        return g, gb
    
    def update_tau(self, loss, data, targets, grads, pre_grads, step_size):  

        loss2 = self.criterion(self.model(data), targets)
        L = [0.5*torch.dot(g.view(-1,), step_size*pg.view(-1,)) for (g, pg) in zip(grads, pre_grads)]
        M = min(L)#sum(L)/len(L)
        r = abs(loss2 - loss)/M
        if r > 3/4:
            self.tau = self.tau*self.omega
        elif r < 1/4:
            self.tau = self.tau / self.omega

In [10]:
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.fc1 = nn.Linear(256, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 256)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [11]:
class Net(nn.Module):
    def __init__(self, num_classes=10):
        super(Net, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 2 * 2, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), 256 * 2 * 2)
        x = self.classifier(x)
        return x

In [12]:
'''
Modified from https://github.com/pytorch/vision.git
'''
import math

import torch.nn as nn
import torch.nn.init as init

__all__ = [
    'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
    'vgg19_bn', 'vgg19',
]


class VGG(nn.Module):
    '''
    VGG model 
    '''
    def __init__(self, features):
        super(VGG, self).__init__()
        self.features = features
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.ReLU(True),
            nn.Linear(512, 10),
        )
         # Initialize weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                m.bias.data.zero_()


    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


def make_layers(cfg, batch_norm=False):
    layers = []
    in_channels = 3
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)


cfg = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 
          512, 512, 512, 512, 'M'],
}


def vgg11():
    """VGG 11-layer model (configuration "A")"""
    return VGG(make_layers(cfg['A']))


def vgg11_bn():
    """VGG 11-layer model (configuration "A") with batch normalization"""
    return VGG(make_layers(cfg['A'], batch_norm=True))



# PSGD

In [None]:
torch.manual_seed(1)
# model = Net().to(device)
model = vgg11_bn().to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
grad_clip = 0.1*np.sqrt(sum([W.numel() for W in model.parameters()]))

preconditioner = DPSGD_M(model, lr = 0.1, criterion = loss_function, grad_clip = grad_clip, damping = 0, 
                         tau = 1, T1 = 1, T2 = 5, omega = 0, eta = 1e-5, beta = 0)

def test_loss_K(model):
    model.eval()
    loss = 0
    accuracy = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            y = model(data)
            loss += loss_function(y, target)
            _, pred = torch.max(y, dim=1)
            accuracy += (pred == target).sum(dtype=torch.float32)/pred.size(0)
    return loss.item()/n_test_batches, accuracy/n_test_batches

TrainLoss, TestLoss = [], []
TrainAcc, TestAcc = [], []
times = []

for epoch in range(EPOCHS):
    kbar = pkbar.Kbar(target=n_batches, epoch=epoch, num_epochs=EPOCHS, width=30, always_stateful=False, interval = 1)
    trainloss = 0.0
    trainacc = 0.0
    n = 0
    model.train()
    t0 = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):

        data, target = data.to(device), target.to(device)
        preconditioner.zero_grad()
        optimizer.zero_grad()
        output = model(data)
        
        loss = loss_function(output, target)
        _, max_ind = torch.max(output, dim = 1)
        accuracy = (max_ind == target).sum(dtype=torch.float32)/max_ind.size(0) 

        trainloss += loss
        trainacc += accuracy

        loss.backward(create_graph = True)
        preconditioner.step(loss, data, target)

        kbar.update(n, values=[("loss", loss.item()), ("acc", accuracy.item())])
        n += 1
        
    t1 = time.time() - t0
    times.append(t1)

    TrainLoss.append(trainloss.item()/n_batches)
    TrainAcc.append(trainacc.item()/n_batches)
    
    
    # preconditioner.lr = 0.01**(1/9)*preconditioner.lr
    testloss, testacc = test_loss_K(model)

    TestLoss.append(testloss)
    TestAcc.append(testacc)
    kbar.add(1, values=[("val_loss", testloss), ("val_acc", testacc)])
    # print('Epoch: {}; train loss: {}; test loss: {}, train_accuracy: {}, test_accuracy:{}, time: {}'\
    #  .format(epoch+1, TrainLoss[-1], TestLoss[-1], TrainAcc[-1], TestAcc[-1],np.sum(times)))

scipy.io.savemat(results_dir + 'PSGD.mat', {'TrainLoss': TrainLoss, 'TestLoss': TestLoss, 'TrainAccuracy': TrainAcc,'TestAccuracy': TestAcc, 'Time':times})

# DPSGD-M

In [17]:
torch.manual_seed(1)
model = Net().to(device)

loss_function =  nn.CrossEntropyLoss()
grad_clip = 0.1*np.sqrt(sum([W.numel() for W in model.parameters()]))

preconditioner = DPSGD_M(model, lr = 0.1, criterion = loss_function, grad_clip = grad_clip, damping = 2, 
                         tau = 1, T1 = 1, T2 = 5, omega = 0, eta = 1e-5, beta = 0.1)

def test_loss_K(model):
    model.eval()
    loss = 0
    accuracy = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            y = model(data)
            loss += loss_function(y, target)
            _, pred = torch.max(y, dim=1)
            accuracy += (pred == target).sum(dtype=torch.float32)/pred.size(0)
    return loss.item()/n_test_batches, accuracy/n_test_batches

TrainLoss, TestLoss = [], []
TrainAcc, TestAcc = [], []
times = []

for epoch in range(EPOCHS):
    kbar = pkbar.Kbar(target=n_batches, epoch=epoch, num_epochs=EPOCHS, width=30, always_stateful=False, interval = 1)
    trainloss = 0.0
    trainacc = 0.0
    n = 0
    model.train()
    t0 = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):

        data, target = data.to(device), target.to(device)
        preconditioner.zero_grad()
        output = model(data)
        
        loss = loss_function(output, target)
        _, max_ind = torch.max(output, dim = 1)
        accuracy = (max_ind == target).sum(dtype=torch.float32)/max_ind.size(0) 

        trainloss += loss
        trainacc += accuracy

        loss.backward(create_graph = True)
        preconditioner.step(loss, data, target)
        # optimizer.step()

        kbar.update(n, values=[("loss", loss.item()), ("acc", accuracy.item())])
        n += 1
        
    t1 = time.time() - t0
    times.append(t1)
    TrainLoss.append(trainloss.item()/n_batches)
    TrainAcc.append(trainacc.item()/n_batches)
    
    
    # preconditioner.lr = 0.01**(1/9)*preconditioner.lr
    # optimizer.param_groups[0]['lr'] = lr0
    testloss, testacc = test_loss_K(model)

    TestLoss.append(testloss)
    TestAcc.append(testacc)
    kbar.add(1, values=[("val_loss", testloss), ("val_acc", testacc)])
    # print('Epoch: {}; train loss: {}; test loss: {}, train_accuracy: {}, test_accuracy:{}, time: {}'\
    #  .format(epoch+1, TrainLoss[-1], TestLoss[-1], TrainAcc[-1], TestAcc[-1],np.sum(times)))

scipy.io.savemat(results_dir + 'DPSGD_M.mat', {'TrainLoss': TrainLoss, 'TestLoss': TestLoss, 'TrainAccuracy': TrainAcc,'TestAccuracy': TestAcc, 'Time':times})

Epoch: 1/20



This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.



  9/782 [..............................] - ETA: 16:34 - loss: 2.3034 - acc: 0.0972

KeyboardInterrupt: ignored

In [None]:
preconditioner.state_dict()['state'][0]

# ADAM

In [16]:
torch.manual_seed(1)
model = vgg11_bn().to(device)

loss_function = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

grad_clip = 0.1*np.sqrt(sum([W.numel() for W in model.parameters()]))


def test_loss_K(model):
    model.eval()
    loss = 0
    accuracy = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            y = model(data)
            loss += loss_function(y, target)
            _, pred = torch.max(y, dim=1)
            accuracy += (pred == target).sum(dtype=torch.float32)/pred.size(0)
    return loss.item()/n_test_batches, accuracy/n_test_batches

TrainLoss, TestLoss = [], []
TrainAcc, TestAcc = [], []
times = []

for epoch in range(EPOCHS):
    kbar = pkbar.Kbar(target=n_batches, epoch=epoch, num_epochs=EPOCHS, width=30, always_stateful=False, interval = 1)
    trainloss = 0.0
    trainacc = 0.0
    n = 0
    model.train()
    t0 = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):

        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        
        loss = loss_function(output, target)
        _, max_ind = torch.max(output, dim = 1)
        accuracy = (max_ind == target).sum(dtype=torch.float32)/max_ind.size(0) 

        trainloss += loss
        trainacc += accuracy

        loss.backward()
        optimizer.step()

        kbar.update(n, values=[("loss", loss.item()), ("acc", accuracy.item())])
        n += 1
        
    t1 = time.time() - t0
    times.append(t1)

    TrainLoss.append(trainloss.item()/n_batches)
    TrainAcc.append(trainacc.item()/n_batches)
    
    
    # lr0 = 0.01**(1/9)*lr0
    # optimizer.param_groups[0]['lr'] = lr0
    testloss, testacc = test_loss_K(model)

    TestLoss.append(testloss)
    TestAcc.append(testacc)
    kbar.add(1, values=[("val_loss", testloss), ("val_acc", testacc)])
    # print('Epoch: {}; train loss: {}; test loss: {}, train_accuracy: {}, test_accuracy:{}, time: {}'\
    #  .format(epoch+1, TrainLoss[-1], TestLoss[-1], TrainAcc[-1], TestAcc[-1],np.sum(times)))

scipy.io.savemat(results_dir + 'adam.mat', {'TrainLoss': TrainLoss, 'TestLoss': TestLoss, 'TrainAccuracy': TrainAcc,'TestAccuracy': TestAcc, 'Time':times})

Epoch: 1/20



This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.



Epoch: 2/20
Epoch: 3/20
Epoch: 4/20
Epoch: 5/20
Epoch: 6/20
 43/782 [>.............................] - ETA: 21s - loss: 0.3965 - acc: 0.8634       

KeyboardInterrupt: ignored

# Comparison

In [None]:
opts = ['sgd','adam','KFAC', 'shampoo','Kron','Kron_damped', 'SAMDPSGD']

total_train_time = {}
opts_data = {}
times = {}
train_times = {}
test_times = {}
train_losses = {}
test_losses = {}
train_accs = {}
test_accs = {}
train_err={}
test_err = {}


for opt in opts:
	opts_data[opt] = scipy.io.loadmat(results_dir+opt+'.mat')	

In [None]:
colors = ['#0000FF','#00FF00','#FF0000','#33F0FF','#FFA833','#000000','#33E0FF', '#FF33E6','#D433FF','#888A0B','#8A0B1E','#B498DF','#1B786D']
# colors = ['#0000FF','#00FF00','#FF0000','#33F0FF','#FFA833','#FFF933','#000000','#33E0FF','#FF33E6','#D433FF','#888A0B','#8A0B1E','#B498DF','#1B786D']

In [None]:
for opt in opts:
  # print(opt)
  data = opts_data[opt]
  times[opt] = data.get('Time')
  train_times[opt] = np.cumsum(times[opt])
  test_times[opt] = np.cumsum(times[opt])
  total_train_time[opt] = np.sum(times[opt])
  train_losses[opt] = data.get('TrainLoss').reshape(EPOCHS+1,)
  train_accs[opt] = data.get('TrainAccuracy').reshape(EPOCHS+1,)
  test_losses[opt] = data.get('TestLoss').reshape(EPOCHS+1,)
  test_accs[opt] = data.get('TestAccuracy').reshape(EPOCHS+1)
  train_err[opt] = (1-data.get('TrainAccuracy')).reshape(EPOCHS+1,)
  

for opt in ['kron_damped','SAMDPSGD']:
  test_err[opt] = (1-opts_data[opt].get('TestAccuracy')).reshape(EPOCHS+1,)

In [None]:

fig = go.Figure()
i = 0
for opt in ['kron', 'kron_damped','SAMDPSGD']:
  fig.add_trace(go.Scatter(y=test_err[opt], name = opt, mode='lines', line = dict(color = colors[i])))
  i = i + 1

fig.update_layout(title='test error', xaxis_title='epochs', yaxis_title='test error', yaxis_type="log")
fig.show()

In [None]:
# plot train_losses vs Iterations
plot_loss_metrics(None,train_losses,'Train Loss vs EPOCHS', 'EPOCHS','Train Loss')
# plot test_losses vs Iterations
plot_loss_metrics(None,test_losses,'Test Loss vs EPOCHS', 'EPOCHS','Test Loss')
# # plot test_losses vs Iterations
plot_loss_metrics(train_times,train_losses,'Train Loss vs Time', 'Time','Train Loss')
# plot test_losses vs Iterations
plot_loss_metrics(test_times,test_losses,'Test Loss vs Time', 'Time','Test Loss')

In [None]:
# plot train_losses vs Iterations
plot_loss_metrics(None,train_err,'Train Loss vs EPOCHS', 'EPOCHS','Train Loss')
# plot test_losses vs Iterations
# plot_loss_metrics(None,test_losses,'Test Loss vs EPOCHS', 'EPOCHS','Test Loss')
# # plot test_losses vs Iterations
plot_loss_metrics(train_times,train_err,'Train Loss vs Time', 'Time','Train Loss')
# plot test_losses vs Iterations
# plot_loss_metrics(test_times,test_losses,'Test Loss vs Time', 'Time','Test Loss')

In [None]:
# plot train_losses vs Iterations
plot_acc_metrics(None,train_accs,'Train Accuracy vs EPOCHS', 'EPOCHS','Train Accuracy')
# plot test_losses vs Iterations
plot_acc_metrics(None,test_accs,'Test Accuracy vs EPOCHS', 'EPOCHS','Test Accuracys')
# # plot test_losses vs Iterations
plot_acc_metrics(train_times,train_accs,'Train Accuracy vs Time', 'Time','Train Accuracy')
# plot test_losses vs Iterations
plot_acc_metrics(test_times,test_accs,'Test Accuracy vs Time', 'Time','Test Accuracy')