<a href="https://colab.research.google.com/github/sverdoot/optimizer-SUG-torch/blob/master/Linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /content/drive; later cells change into a project
# folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import torch
from torch.optim import Optimizer
from torch import nn
from torch import functional as F
import torch.autograd
from torch.autograd import Variable
import math
from torch import optim

In [0]:
import os

# Work from the project folder on the mounted Drive so that relative paths
# (e.g. ./LINREG used when saving checkpoints) resolve there.
# Presumably this is also where the local `sug` module lives -- TODO confirm.
os.chdir("/content/drive/My Drive/Colab Notebooks/Optimization project")
from sug import *

# Output directory for the linear-regression experiment artefacts.
file_path = "/content/drive/My Drive/Colab Notebooks/Optimization project/LINREG"

# Create the directory only if it is missing. `exist_ok=True` replaces the
# former `try: os.stat(...) except: os.mkdir(...)`, whose bare `except`
# swallowed every error (including KeyboardInterrupt), not just "not found".
os.makedirs(file_path, exist_ok=True)

In [0]:
# Use the first CUDA GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

device

device(type='cuda', index=0)

In [0]:
import torch
from torch.optim import Optimizer
import math
import copy

class SUG(Optimizer):
    """Gradient-descent optimizer with a backtracking estimate of the
    Lipschitz constant ``L`` of the gradient.

    Every call to :meth:`step` tries the update ``x - 1/(2L) * d`` (``d`` is
    the gradient, optionally with weight decay and momentum applied), starting
    from half of the previously accepted ``L`` and doubling it until
    :meth:`stop_criteria` is satisfied.

    Arguments:
        params: iterable of parameters or dicts defining parameter groups.
        l_0 (float): initial estimate of the Lipschitz constant (>= 0).
        d_0 (float): gradient dispersion estimate, only used by
            :meth:`comp_batch_size`.
        prob (float): stored on the instance; not used inside this class.
        eps (float): tolerance; ``eps / 10`` is the slack in the acceptance test.
        momentum, dampening, weight_decay, nesterov: same roles as in the
            classic SGD update that this code mirrors.

    Raises:
        ValueError: on negative ``l_0``, ``d_0``, ``momentum`` or
            ``weight_decay``, or on an invalid Nesterov configuration.
    """

    def __init__(self, params, l_0, d_0=0, prob=1., eps=1e-4, momentum=0, dampening=0,
                 weight_decay=0, nesterov=False):
        if l_0 < 0.0:
            raise ValueError("Invalid Lipsitz constant of gradient: {}".format(l_0))
        if d_0 < 0.0:
            raise ValueError("Invalid disperion of gradient: {}".format(d_0))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(L=l_0, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        self.Lips = l_0            # estimate currently being tried / last accepted
        self.prev_Lips = l_0       # estimate accepted by the previous step
        self.D_0 = d_0
        self.eps = eps
        self.prob = prob
        self.start_param = params  # kept for backward compatibility; unused here
        self.upd_sq_grad_norm = None
        self.sq_grad_norm = None
        self.loss = torch.tensor(0.)
        self.cur_loss = 0
        self.closure = None
        super(SUG, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore pickled state, defaulting ``nesterov`` to False when absent."""
        super(SUG, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    def comp_batch_size(self):
        """Returns optimal batch size for given d_0, eps and l_0;

        Computed as ``ceil(2 * D_0 * eps / prev_Lips)``.
        """
        return math.ceil(2 * self.D_0 * self.eps / self.prev_Lips)

    def step(self, loss, closure):
        """Performs a single optimization step.

        Arguments:
            loss : current loss (a Python number) at the current parameters;
                gradients must already be populated via ``backward()``.

            closure (callable): re-evaluates the model at the *current*
                parameter values and returns the loss as a Python number.
                It may freely call ``zero_grad()``: the search direction is
                snapshotted before the first closure call.

        Returns:
            tuple: ``(L, n_trials)`` -- the accepted Lipschitz estimate and the
            number of trial updates that were evaluated.
        """
        self.start_params = []
        self.loss = loss
        self.sq_grad_norm = 0
        self.cur_loss = loss
        self.closure = closure

        # Phase 1: snapshot every parameter and its search direction.
        # Each entry is [param_copy] (no gradient) or [param_copy, direction].
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            group_snapshot = []
            self.start_params.append(group_snapshot)
            for p in group['params']:
                entry = [p.data.clone()]
                group_snapshot.append(entry)
                if p.grad is None:
                    continue
                d_p = p.grad.data.clone()

                if weight_decay != 0:
                    # Keyword `alpha` replaces the deprecated add_(alpha, tensor) overload.
                    d_p.add_(p.data, alpha=weight_decay)
                    # NOTE(review): the gradient receives weight_decay * p (the
                    # derivative of weight_decay / 2 * ||p||^2) while the loss
                    # receives weight_decay * ||p||^2 -- confirm the factor of 2.
                    self.cur_loss += weight_decay * torch.sum(p.data * p.data).item()

                # Squared norm is taken before momentum is applied.
                self.sq_grad_norm += torch.sum(d_p * d_p).item()

                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf
                entry.append(d_p)

        # Phase 2: backtracking search on L. Start optimistic (half of the last
        # accepted value, floored at 0.1) and double until the test passes.
        i = 0
        self.Lips = max(self.prev_Lips / 2, 0.1)
        difference = -1
        while difference < 0:
            if i > 0:
                self.Lips = max(self.Lips * 2, 0.1)
            for group, group_snapshot in zip(self.param_groups, self.start_params):
                for p, entry in zip(group['params'], group_snapshot):
                    # Decide from the snapshot, not from `p.grad`: the closure
                    # may have reset gradients to None, which previously froze
                    # the parameters and made this loop spin forever.
                    if len(entry) < 2:
                        continue
                    start_param_val, start_param_grad = entry
                    p.data = start_param_val - 1 / (2 * self.Lips) * start_param_grad
            difference, upd_loss = self.stop_criteria()
            i += 1
        self.prev_Lips = self.Lips

        return self.Lips, i

    def stop_criteria(self):
        """Checks if the Lipsitz constant of gradient is appropriate

           <g(x_k), w_k - x_k> + 2L_k / 2 ||x_k - w_k||^2 = - 1 / (2L_k)||g(x_k)||^2 + 1 / (4L_k)||g(x_k)||^2 = -1 / (4L_k)||g(x_k)||^2

        Returns:
            tuple: ``(difference, upd_loss)``; the trial point is accepted by
            :meth:`step` when ``difference >= 0``. ``upd_loss`` is the value
            returned by the closure at the trial point.
        """
        upd_loss = self.closure()
        major = self.cur_loss - 1 / (4 * self.Lips) * self.sq_grad_norm
        return major - upd_loss - self.l2_reg() + self.eps / 10, upd_loss

    def get_lipsitz_const(self):
        """Returns current Lipsitz constant of the gradient of the loss function
        """
        return self.Lips

    def get_sq_grad(self):
        """Returns the squared 2-norm of the gradients currently stored in
           ``p.grad``, summed over all parameters:

           ||f'(p_1,...,p_n)||_2^2 ~ sum_{i=1}^n ((df/dp_i) * (df/dp_i))(p_1,...,p_n)

           Parameters whose ``grad`` is None are skipped.
        """
        self.upd_sq_grad_norm = 0
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                self.upd_sq_grad_norm += torch.sum(p.grad.data * p.grad.data).item()

        return self.upd_sq_grad_norm

    def l2_reg(self):
        """Returns the current l2 regularization term,
           ``sum over groups of weight_decay * ||p||^2``.
        """
        self.upd_l2_reg = 0
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            if weight_decay != 0:
                for p in group['params']:
                    self.upd_l2_reg += weight_decay * torch.sum(p.data * p.data).item()

        return self.upd_l2_reg

## Data

$x_i \sim \mathcal{N}(0, I),~~i=1..n,~~I\in \mathbb{R}^{m\times m}$

$y_i = \theta^Tx_i + \epsilon,~~\epsilon \sim N(0,\sigma^2)$

$y = X \theta +\epsilon,~~\epsilon \sim \mathcal N(0,\Sigma)$

In [0]:
# compile data
# Seed torch's global RNG so the synthetic dataset below is reproducible.
torch.manual_seed(999)
n_rows = 1000000  # number of samples
n_columns = 100  # number of features per sample
X = torch.randn(n_rows, n_columns)  # design matrix with i.i.d. standard-normal entries
true_weights = torch.ones(n_columns) * 5  # ground-truth weights: every coefficient equals 5
y = X @ true_weights + torch.randn(n_rows) * 0.7  # linear response plus Gaussian noise scaled by 0.7

In [0]:
# split data
# Sizes: 15% validation, 30% test (both rounded up), the remainder for training.
n_samples = X.size(0)
val_size = math.ceil(0.15 * n_samples)
test_size = math.ceil(0.3 * n_samples)
train_size = n_samples - val_size - test_size

# Contiguous row ranges: training rows first, then test rows, validation rows last.
train_rows = slice(None, train_size)
test_rows = slice(train_size, train_size + test_size)
val_rows = slice(-val_size, None)

X_train, X_test, X_val = X[train_rows], X[test_rows], X[val_rows]
y_train, y_test, y_val = y[train_rows], y[test_rows], y[val_rows]

## Loss function

$$L(\theta, X) = \dfrac{1}{m}\sum\limits_{i=1}^m | x_i\theta -  y_i|^2 = \dfrac{1}{m}\|X\theta- y\|_2^2$$

$$\nabla_{\theta}L(\theta,X) = \dfrac{2}{m}\sum\limits_{i=1}^m x_i^T (x_i \theta-y_i) $$

The Lipschitz constant of the gradient may be determined directly from the definition:

$$\|\nabla_{\theta_1}L(\theta_1,X)-\nabla_{\theta_2}L(\theta_2,X)\| _2 \le L\|\theta_1 - \theta_2\|_2, ~~~\forall \theta_1,\theta_2 \in \mathbb{R^m}$$

$$\|\nabla_{\theta_1}L(\theta_1,X)-\nabla_{\theta_2}L(\theta_2,X)\| _2 = 
\dfrac{2}{m}\|\sum\limits_{i=1}^m x_i^T (x_i (\theta_1-\theta_2))\|_2 \le \dfrac{2}{m}\|\sum\limits_{i=1}^m x_i^T x_i\|_2 \|\theta_1-\theta_2\|_2$$

$$\Rightarrow  L \le \dfrac{2}{m}\|\sum\limits_{i=1}^m x_i^T x_i\|_2$$
 
$$\text{As } \sum\limits_{i=1}^m x_i^T x_i \text{ is symmetric and positive semidefinite,}$$
 
$$\|\sum\limits_{i=1}^m x_i^T x_i\|_2 = \lambda_{max}\left(\sum\limits_{i=1}^mx_i^T x_i\right)$$
 
Also it is possible to find L from the following statement:

$$\|\nabla_\theta^2 L(\theta, X)\|_2  = \lambda_{max}\left(\nabla_\theta^2 L(\theta, X)\right) \le L,~~~\forall \theta \in \mathbb{R^m}$$ 

$$\nabla_{\theta}^2 L(\theta,X) = \dfrac{2}{m}\sum\limits_{i=1}^mx_i^T x_i$$


The maximum eigenvalue of the matrix $A$ can be found with the following iterative method (power iteration):

$$y_{i+1} = A x_i$$
$$x_{i+1} = y_{i+1} / \|y_{i+1}\|_2$$
$$i = i+1$$
 
 
As $x_i$ is a normalized real vector when $i>0$,    $~~~\lambda_i = x_i^*Ax_i = x_i^T A x_i$. 

One should repeat the iteration until $\lambda$ converges.
 
 

In [0]:
# Mean-squared-error loss with 'mean' reduction; used for training and evaluation below.
criterion = nn.MSELoss(reduction='mean')

In [0]:
# estimate the Lipsitz constant of the gradient

def mse_hessian(X):
    """Hessian of the mean-squared-error loss of a linear model.

    Computes ``(2 / n) * X^T X`` for a 2-D tensor ``X`` with ``n`` rows, i.e.
    the sum over rows of ``(2 / n) * outer(x_i, x_i)``, using a single matrix
    product instead of a Python loop over the rows.

    Arguments:
        X (torch.Tensor): 2-D tensor, one sample per row.

    Returns:
        torch.Tensor with one row and one column per column of ``X``; the
        integer ``0`` when ``X`` has no rows (same as the original loop,
        which never executed in that case).
    """
    n_samples = X.size(0)
    if n_samples == 0:
        return 0
    return 2 / n_samples * (X.t() @ X)
  
# Hessian of the MSE loss on the training split; its largest eigenvalue is estimated below.
hess = mse_hessian(X_train)

In [0]:
def max_eigval(A, x=None, eps=1e-7, max_iter=10000):
    """Estimate the dominant eigenvalue of a square matrix by power iteration.

    Repeats ``x <- A x / ||A x||_2`` and tracks the Rayleigh quotient
    ``x^T A x`` until two consecutive values differ by at most ``eps``.

    Arguments:
        A (torch.Tensor): square 2-D floating-point matrix.
        x (torch.Tensor, optional): start vector; a random normal vector on
            ``A``'s device and dtype is drawn when omitted.
        eps (float): convergence tolerance on the change of the estimate.
        max_iter (int): upper bound on the number of iterations (>= 1), so
            that a non-converging sequence cannot loop forever.

    Returns:
        float: the last Rayleigh quotient; ``0.0`` if an iterate is mapped to
        the zero vector (e.g. ``A`` is the zero matrix).
    """
    if x is None:
        # Match A's device/dtype so GPU or float64 matrices work as well.
        x = torch.randn(A.size(1), dtype=A.dtype, device=A.device)
    lam = 1
    for _ in range(max_iter):
        lam_prev = lam
        y = A @ x
        y_norm = torch.norm(y, 2)
        if y_norm == 0:
            # x lies in the null space of A; normalising would produce NaNs.
            return 0.0
        x = y / y_norm
        lam = (x @ A @ x).item()
        # `not (... > eps)` also stops on NaN, as the original while-condition did.
        if not abs(lam - lam_prev) > eps:
            break
    return lam

In [0]:
# Largest eigenvalue of the loss Hessian = Lipschitz constant of the gradient.
L = max_eigval(hess)
# Round up to three decimals so the printed value is a valid upper bound.
L_rounded_up = math.ceil(L * 10 ** 3) / 10 ** 3
print("the Lipsitz constant of the gradient does not exceed {}".format(L_rounded_up))

the Lipsitz constant of the gradient does not exceed 2.053


## Experiments

In [0]:
class LinearRegressionModel(nn.Module):
    """Bias-free linear model: ``forward(x)`` returns ``x @ weights``."""

    def __init__(self, n_columns):
        """Create ``n_columns`` trainable weights drawn from a standard normal."""
        super().__init__()
        initial_weights = torch.randn(n_columns)
        self.weights = nn.Parameter(initial_weights)

    def forward(self, x):
        """Return the matrix product of ``x`` with the weight vector."""
        return torch.matmul(x, self.weights)

In [0]:
def train(model, optimizer, X_train, y_train, n_epochs=1, batch_size=4, print_every=1, X_val=None, y_val=None):
    """Train ``model`` with mini-batches and record per-step statistics.

    Relies on the module-level ``criterion`` and ``device``.

    Arguments:
        model (nn.Module): model to train; moved to ``device``.
        optimizer: a standard torch optimizer, or an optimizer whose class is
            named ``'SUG'`` (called as ``step(loss_value, closure)``).
        X_train, y_train (torch.Tensor): training inputs and targets; consumed
            in order, in ``len // batch_size`` full batches per epoch.
        n_epochs (int): number of passes over the training data.
        batch_size (int): samples per mini-batch.
        print_every (int): print a summary for every epoch whose index is a
            multiple of this value.
        X_val, y_val (torch.Tensor, optional): validation data. When both are
            given, the validation loss is evaluated after every training step.

    Returns:
        tuple of lists ``(tr_losses, val_losses, lips, grad)``: per-step
        training losses, per-step validation losses (empty without validation
        data), and -- for SUG only -- the accepted Lipschitz estimates and the
        squared gradient norms.
    """
    model.to(device)
    tr_losses, val_losses, lips, grad = [], [], [], []
    batch_per_ep = X_train.size(0) // batch_size
    is_sug = optimizer.__class__.__name__ == 'SUG'

    # Move the validation data to the device once instead of once per batch.
    has_val = X_val is not None and y_val is not None
    if has_val:
        X_val_dev, y_val_dev = X_val.to(device), y_val.to(device)

    for ep in range(n_epochs):
        model.train()
        for i in range(batch_per_ep):
            batch = slice(i * batch_size, (i + 1) * batch_size)
            inputs, targets = X_train[batch].to(device), y_train[batch].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            tr_losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            if not is_sug:
                optimizer.step()
            else:
                def closure(inputs=inputs, targets=targets):
                    # Only the loss value is needed. Gradients are left intact:
                    # the optimizer still reads them during its line search.
                    with torch.no_grad():
                        return criterion(model(inputs), targets).item()
                optimizer.step(loss.item(), closure)
                lips.append(optimizer.get_lipsitz_const())
                # Call the method; previously the bound method itself was stored.
                grad.append(optimizer.get_sq_grad())

            model.zero_grad()
            if has_val:
                model.eval()
                with torch.no_grad():
                    val_loss = criterion(model(X_val_dev), y_val_dev)
                val_losses.append(val_loss.item())
                model.train()

        # Per-epoch report (this used to sit outside the epoch loop, so only
        # the final epoch could ever be printed).
        if ep % print_every == 0:
            epoch_losses = tr_losses[-batch_per_ep:] if batch_per_ep else []
            mean_tr_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
            last_val_loss = val_losses[-1] if val_losses else float('nan')
            print("Epoch: {}, training loss: {}, validation loss: {}".format(ep, mean_tr_loss, last_val_loss))

    return tr_losses, val_losses, lips, grad

In [0]:
# Experiment configuration shared by the runs below.
n_epochs = 1
lrs = [0.5, 0.1, 0.01]  # learning rates tried for plain SGD

# Per-optimizer containers for recorded training / validation losses;
# the 'sgd' entries are filled in per learning rate.
tr_loss = {'sgd': {}}
val_loss = {'sgd': {}}

criterion = nn.MSELoss(reduction="mean")


In [0]:
# Train a fresh model with plain SGD for every learning rate, report the test
# loss and save a checkpoint with the recorded curves.
for lr in lrs:
    print("SGD lr={} momentum 0. weight_decay 1e-3".format(lr))
    model = LinearRegressionModel(n_columns)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0., weight_decay=1e-3)
    tr_loss['sgd'][lr], val_loss['sgd'][lr], lips, grad = train(model, optimizer, X_train, y_train, n_epochs=n_epochs, batch_size=512, X_val=X_val, y_val=y_val, print_every=1)
    # Evaluation needs no gradients; no_grad avoids building an autograd graph
    # over the whole test set.
    with torch.no_grad():
        test_score = criterion(model(X_test.to(device)), y_test.to(device)).item()
    print("Test score: {:.4}\n\n".format(test_score))
    states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad
        }
    torch.save(states, './LINREG/lr_'+str(lr))

SGD lr=0.5 momentum 0. weight_decay 1e-3
Epoch: 0, training loss: 3.3324894935059146, validation loss: 0.663571298122406
Test score: 0.6624


SGD lr=0.1 momentum 0. weight_decay 1e-3
Epoch: 0, training loss: 7.515337376121702, validation loss: 0.5041121244430542
Test score: 0.5042


SGD lr=0.01 momentum 0. weight_decay 1e-3
Epoch: 0, training loss: 60.27786558996921, validation loss: 0.4917539060115814
Test score: 0.4924




In [0]:
# Train a fresh model with the SUG optimizer, report the test loss and save a
# checkpoint with the recorded curves.
print("SUG l_0=2.1 weight_decay=1e-3")
model = LinearRegressionModel(n_columns)
# NOTE: despite its name, `sgd` holds the SUG optimizer (name kept so later cells still work).
sgd = SUG(model.parameters(), l_0=2.1, weight_decay=1e-3)
tr_loss['sug'], val_loss['sug'], lips, grad = train(model, sgd, X_train, y_train, n_epochs=1, batch_size=512, X_val=X_val, y_val=y_val, print_every=1)
# Evaluation needs no gradients; no_grad avoids building an autograd graph
# over the whole test set.
with torch.no_grad():
    test_score = criterion(model(X_test.to(device)), y_test.to(device)).item()
print("Test score: {:.4}".format(test_score))
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            # Save the state of the optimizer that actually trained this model;
            # `optimizer` still refers to the last SGD instance of the previous cell.
            'optimizer': sgd.state_dict(),
            'tr_loss' : tr_loss['sug'],
            'val_loss' : val_loss['sug'],
            'lips' : lips,
            'grad' : grad
        }
torch.save(states, './LINREG/lr_sug')

SUG l_0=2.1 weight_decay=1e-3
Epoch: 0, training loss: 3.625562452681682, validation loss: 0.5332871675491333
Test score: 0.5335


In [0]:
from sklearn.linear_model import LinearRegression

# Reference fit with scikit-learn's LinearRegression on the same training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

fitted_weights = list(lr_model.coef_)
print("Trained weights:\n{}\n".format(fitted_weights))

# Test loss of the reference fit, measured with the same criterion as above.
test_predictions = torch.tensor(lr_model.predict(X_test))
print("{:.4}".format(criterion(test_predictions, y_test)))

Trained weights:
[5.000724, 4.9999266, 5.0013065, 4.998826, 4.999601, 5.0004044, 4.999807, 5.0003324, 4.9998713, 5.000539, 5.000581, 4.999488, 5.0012116, 4.999505, 5.0008817, 5.001804, 5.000177, 4.9986477, 5.000984, 4.9998875, 4.9995165, 5.000615, 4.9999037, 5.0002794, 4.99903, 4.999418, 4.9999304, 5.00036, 4.999962, 5.0000887, 5.000258, 5.0010433, 4.998993, 4.998514, 5.001095, 5.0027943, 4.9993105, 4.9999247, 4.9977074, 4.9997997, 4.999658, 5.001179, 4.9991293, 5.0015984, 5.0008783, 5.000133, 5.0004673, 5.000086, 4.9992847, 4.9987707, 5.000008, 5.001558, 4.9988203, 5.000719, 4.999448, 5.001664, 5.0009217, 4.999479, 4.999565, 4.999815, 4.998951, 5.000894, 5.0004435, 5.0008864, 4.998207, 5.0011196, 5.0005913, 4.999487, 5.0000505, 5.000177, 5.000865, 4.998932, 4.997824, 4.99936, 4.998658, 5.000172, 5.0004015, 4.9999013, 4.9979987, 5.001012, 5.00029, 4.999296, 5.00208, 4.99966, 5.0009828, 4.9990106, 4.9997272, 4.998647, 4.9997196, 5.0009317, 5.001313, 4.999817, 4.9982986, 5.0004134, 4.999