<a href="https://colab.research.google.com/github/sverdoot/optimizer-SUG-torch/blob/master/Linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import torch
from torch.optim import Optimizer
from torch import nn
from torch import functional as F
import torch.autograd
from torch.autograd import Variable
import math
from torch import optim

In [0]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/Optimization project")
from sug import *

os.getcwd()

file_path = "/content/drive/My Drive/Colab Notebooks/Optimization project/LINREG"
#directory = os.path.dirname(file_path)

try:
    os.stat(file_path)
except:
    os.mkdir(file_path)       

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

device

device(type='cuda', index=0)

## Data

$x_i \sim \mathcal{N}(0, I),~~i=1..n,~~I\in \mathbb{R^{m^2}}$

$y_i = \theta^Tx_i + \epsilon,~~\epsilon \sim N(0,\sigma^2)$

$y = X \theta +\epsilon,~~\epsilon \sim \mathcal N(0,\Sigma)$

In [0]:
# compile data
n_rows = 1000000
n_columns = 100
X = torch.randn(n_rows, n_columns) 
true_weights = torch.ones(n_columns) * 5
y = X @ true_weights + torch.randn(n_rows) * 0.3

In [0]:
# split data
val_size = math.ceil(0.15 * X.size(0))
test_size = math.ceil(0.3 * X.size(0))
train_size = X.size(0) - val_size - test_size

X_train, y_train = X[: train_size], y[: train_size]
X_test, y_test = X[train_size : train_size + test_size], y[train_size : train_size + test_size]
X_val, y_val = X[- val_size :], y[- val_size :]

## Loss function

$$L(\theta, X) = \dfrac{1}{m}\sum\limits_{i=1}^m | x_i\theta -  y_i|^2 = \dfrac{1}{m}\|X\theta- y\|_2^2$$

$$\nabla_{\theta}L(\theta,X) = \dfrac{2}{m}\sum\limits_{i=1}^m x_i^T (x_i \theta-y_i) $$

The Lipsitz constant of the gradient may be determined straight from the definition :

$$\|\nabla_{\theta_1}L(\theta_1,X)-\nabla_{\theta_2}L(\theta_2,X)\| _2 \le L\|\theta_1 - \theta_2\|_2, ~~~\forall \theta_1,\theta_2 \in \mathbb{R^m}$$

$$\|\nabla_{\theta_1}L(\theta_1,X)-\nabla_{\theta_2}L(\theta_2,X)\| _2 = 
\dfrac{2}{m}\|\sum\limits_{i=1}^m x_i^T (x_i (\theta_1-\theta_2))\|_2 \le \dfrac{2}{m}\|\sum\limits_{i=1}^m x_i^T x_i\|_2 \|\theta_1-\theta_2\|_2$$

$$\Rightarrow  L \le \dfrac{2}{m}\|\sum\limits_{i=1}^m x_i^T x_i\|_2$$
 
$$As ~\sum\limits_{i=1}^m x_i^T x_i~~ is~ simmetric, $$
 
$$\|\sum\limits_{i=1}^m x_i^T x_i\|_2 = \lambda_{max}\left(\sum\limits_{i=1}^mx_i^T x_i\right)$$
 
Also it is possible to find L from the following statement:

$$\|\nabla_\theta^2 L(\theta, X)\|_2^2  = \lambda_{max}\left(\nabla_\theta^2 L(\theta, X)\right) \le L,~~~\forall \theta \in \mathbb{R^m}$$ 

$$\nabla_{\theta}^2 L(\theta,X) = \dfrac{2}{m}\sum\limits_{i=1}^mx_i^T x_i$$


In [0]:
criterion = nn.MSELoss(reduction='mean')

In [15]:
# estimate the Lipsitz constant of the gradient

def mse_hessian(X):
    hess = 0
    for x in X:
        hess += 2 / X.size(0) * torch.ger(x, x)
    return hess    
  
hess = mse_hessian(X_train)
L = np.max(np.linalg.eigvals(hess.numpy())) + 1e-2

print("the Lipsitz constant of the gradient does not exceed {:.3}".format(L))

the Lipsitz constant of the gradient does not exceed 2.06


## Experiments

In [0]:
class LinearRegressionModel(nn.Module):
    def __init__(self, n_columns):
        super(LinearRegressionModel, self).__init__()
        self.weights = nn.Parameter(torch.randn(n_columns))
    
    def forward(self, x):
        x = x @ self.weights
        return x

In [0]:
def train(model, optimizer, X_train, y_train, n_epochs=1, batch_size=4, print_every=1, X_val=None, y_val=None):
    model.to(device)
    tr_losses, val_losses, lips, grad = ([] for i in range(4))
    batch_per_ep = X_train.size(0) // batch_size
    for ep in range(n_epochs):
        model.train()
        for i in range(batch_per_ep):
            inputs, y = Variable(X_train[i*batch_size:(i+1)*batch_size]).to(device), Variable(y_train[i*batch_size:(i+1)*batch_size]).to(device)
            outputs = model(inputs)
            loss = criterion(outputs, y)
            tr_losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            if optimizer.__class__.__name__ != 'SUG':
                optimizer.step()
            else:
                def closure():
                    optimizer.zero_grad()
                    upd_outputs = model(inputs)
                    upd_loss = criterion(upd_outputs, y) 
                    upd_loss.backward()
                    return upd_loss
                optimizer.step(loss, closure)
                lips.append(optimizer.get_lipsitz_const())
                grad.append(optimizer.get_sq_grad)
                

        model.zero_grad()
        model.eval()
        X, y = Variable(X_val).to(device), Variable(y_val).to(device)
        outputs = model(X)
        loss = criterion(outputs, y) 
        val_losses.append(loss.item())
        if ep % print_every == 0:
            print("Epoch: {}, training loss: {}, validation loss: {}".format(ep, sum(tr_losses[-batch_per_ep:])/batch_per_ep, val_losses[-1]))
        
    return tr_losses, val_losses, lips, grad            

In [0]:
lrs = [0.1, 0.01, 0.001]
n_epochs = 1
tr_loss = {}
tr_loss['sgd'] = {}
val_loss = {}
val_loss['sgd'] = {}
criterion = nn.MSELoss(reduction="mean")

In [45]:
for lr in lrs:
    print("SGD lr={}".format(lr))
    model = LinearRegressionModel(n_columns)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    tr_loss['sgd'][lr], val_loss['sgd'][lr], lips, grad = train(model, optimizer, X_train, y_train, n_epochs=n_epochs, batch_size=256, X_val=X_val, y_val=y_val, print_every=1)
    #print("\nTrained weights: \n{}\n".format(list(model.parameters())[0].cpu().detach().numpy()))
    print("Test score: {:.4}\n\n".format(criterion(model(X_test.to(device)), y_test.to(device)).item()))
    states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sgd'][lr],
            'val_loss' : val_loss['sgd'][lr],
            'lips' : lips,
            'grad' : grad
        }
    torch.save(states, './LINREG/lr_'+str(lr))

SGD lr=0.1
Epoch: 0, training loss: 3.692259731586308, validation loss: 0.09422596544027328
Test score: 0.09341


SGD lr=0.01
Epoch: 0, training loss: 3.5720966252215636, validation loss: 0.09422600269317627
Test score: 0.09341


SGD lr=0.001
Epoch: 0, training loss: 3.57254397454264, validation loss: 0.09422596544027328
Test score: 0.09341




In [54]:
print("ADAM")
lr = 0.01
model = LinearRegressionModel(n_columns)
sgd = optim.Adam(model.parameters(), lr=lr)
tr_loss['adam'], val_loss['adam'], lips, grad = train(model, sgd, X_train, y_train, n_epochs=1, batch_size=16, X_val=X_val, y_val=y_val, print_every=1)
#print("\nTrained weights: \n{}\n".format(list(model.parameters())[0].cpu().detach().numpy()))
print("Test score: {:.4}".format(criterion(model(X_test.to(device)), y_test.to(device)).item()))
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['adam'],
            'val_loss' : val_loss['adam'],
            'lips' : lips,
            'grad' : grad
        }
torch.save(states, './LINREG/lr_adam_' + str(lr))

ADAM
Epoch: 0, training loss: 46.06552288995353, validation loss: 0.1441994607448578
Test score: 0.1431


In [48]:
print("SUG")
model = LinearRegressionModel(n_columns)
sgd = SUG(model.parameters(), l_0=8)
tr_loss['sug'], val_loss['sug'], lips, grad = train(model, sgd, X_train, y_train, n_epochs=1, batch_size=16, X_val=X_val, y_val=y_val, print_every=1)
#print("\nTrained weights: \n{}\n".format(list(model.parameters())[0].cpu().detach().numpy()))
print("Test score: {:.4}".format(criterion(model(X_test.to(device)), y_test.to(device)).item()))
states = {
            'epoch': n_epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'tr_loss' : tr_loss['sug'],
            'val_loss' : val_loss['sug'],
            'lips' : lips,
            'grad' : grad
        }
torch.save(states, './LINREG/lr_sug')

SUG
Epoch: 0, training loss: 0.8374266967736591, validation loss: 0.13306814432144165
Test score: 0.1326


In [0]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

print("Trained weights:\n{}\n".format(list(lr_model.coef_)))

print("{:.4}".format(criterion(torch.tensor(lr_model.predict(X_test)), y_test)))

Trained weights:
[5.0007753, 4.999613, 4.9998965, 5.000149, 5.0002346, 4.9998903, 5.0000505, 5.000227, 4.9995956, 4.999251, 5.000267, 4.999912, 5.000618, 5.000696, 5.00017, 4.9995766, 5.0010047, 5.0004873, 4.999726, 5.0000296, 4.9996996, 4.99969, 5.0003996, 5.0006623, 4.999458, 4.9998875, 5.0000353, 5.0003586, 4.999851, 5.000312, 4.999714, 4.9994636, 4.9998403, 5.0004435, 4.999973, 5.0003233, 5.0004697, 4.9998035, 4.9999256, 4.9999447, 5.000782, 5.0002737, 5.000174, 5.0003986, 5.0002728, 4.999854, 5.000162, 4.999118, 4.999841, 5.0001917, 5.0002103, 4.999758, 5.0000906, 5.00067, 4.9996786, 5.0003104, 5.000347, 4.999529, 5.000384, 4.999915, 4.99949, 5.000249, 4.999715, 5.000318, 4.9993095, 4.999655, 4.9990406, 4.999196, 5.000603, 5.000493, 5.000471, 4.999602, 4.9997187, 4.99999, 5.0003824, 4.9997807, 5.000211, 4.999608, 5.0002136, 4.9993877, 4.9996705, 5.0005617, 4.998989, 4.999974, 5.0003223, 4.9996195, 5.0010633, 5.000208, 5.0004177, 5.000323, 4.9999447, 4.9996486, 4.999584, 4.9999084,