In [4]:
import numpy as np
import pandas as pd

In [5]:
df = pd.read_csv("stl10_final.csv", header = None)

In [39]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27639,27640,27641,27642,27643,27644,27645,27646,27647,27648
0,146,143,110,146,143,110,146,143,110,146,...,138,127,119,147,136,122,138,128,93,1
1,129,140,73,124,133,68,138,144,84,144,...,146,128,94,151,130,91,194,164,123,0
2,179,223,114,163,203,104,165,191,103,89,...,124,149,128,114,140,117,104,131,109,1
3,14,18,29,16,21,35,14,21,41,12,...,125,80,18,123,80,18,120,80,18,0
4,147,130,91,200,189,164,127,112,89,68,...,147,131,89,145,129,93,155,138,108,0


In [7]:
import matplotlib.pyplot as plt

In [10]:
X = df.iloc[:, 0:-1].to_numpy()
Y = df.iloc[:, -1].to_numpy()

In [13]:
X = X / 256.0

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [57]:
EPOCHS = 3
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [17]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = TrainData(torch.FloatTensor(X), torch.FloatTensor(Y))

In [18]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

In [26]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # Number of input features is 27648.
        self.layer_1 = nn.Linear(27648, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [47]:
class BinaryClassification2(nn.Module):
    def __init__(self):
        super(BinaryClassification2, self).__init__()
        # Number of input features is 27648.
        self.layer_1 = nn.Linear(27648, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.layer_out(x)
        
        return x

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [65]:
# model = BinaryClassification()
model = BinaryClassification2()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
optimizer = SGD_Simple(model.parameters(), lr = LEARNING_RATE)

BinaryClassification2(
  (layer_1): Linear(in_features=27648, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)
Using optimizer: SGD_Simple


In [66]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [67]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.56988 | Acc: 79.882
Epoch 002: | Loss: 0.53210 | Acc: 79.941
Epoch 003: | Loss: 0.52507 | Acc: 79.907


In [156]:
def group_product(xs, ys):
    
    return sum([torch.sum(x * y) for (x, y) in zip(xs, ys)])

def normalization(v):
    s = group_product(v, v)
    s = s**0.5
    s = s.cpu().item()
    v = [vi / (s + 1e-6) for vi in v]
    return v

In [200]:
class NysHessian():
    
    def __init__(self, rank, rho):
        self.rank = rank
        self.rho = rho
    
    def get_params_grad(self, model):
        params = []
        grads = []
        for param in model.parameters():
            if not param.requires_grad:
                continue
            params.append(param)
            grads.append(0. if param.grad is None else param.grad + 0.)
        return params, grads
    
    def update_Hessian(self, data_loader, model, criterion, device):
        
        params, gradsH = self.get_params_grad(model)
        self.size_vec = [p.size() for p in params]
        test_matrix = []
        hv_matrix = []
        
        for i in range(self.rank):
            
            v = [torch.randn(p.size()).to(device) for p in params]
            v = normalization(v)
            hv_add = [torch.zeros(p.size()).to(device) for p in params]
        
            # update hessian for full data or only a batch data?
            # use only one batch
            
            for X_batch, y_batch in data_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                model.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch.unsqueeze(1))
                loss.backward(create_graph=True)
                params, gradsH = self.get_params_grad(model)
                hv = torch.autograd.grad(gradsH, params, grad_outputs=v,only_inputs=True,retain_graph=True)
                for i in range(len(hv)):
                    hv_add[i].data = hv[i].data.add_(hv_add[i].data)    
            
            for i in range(len(hv_add)):
                hv_add[i].data = torch.div(hv_add[i].data, len(data_loader) * 1.0)
                hv_add[i].data = hv_add[i].data.add_(v[i].data * torch.tensor(0.1)) 
                # the shift step should be optimized, 
                # in numpy was using np.spacing, but not sure the counterpart in pytorch...
            
            hv_ex = torch.cat([gi.view(-1) for gi in hv_add])
            test_ex = torch.cat([gi.view(-1) for gi in v])
            
            hv_matrix.append(hv_ex)
            test_matrix.append(test_ex)
        
        # stack the groups of params into a very large matrix,
        # not sure if there's a way to avoid that....
        # but the storage is still the same
        
        # stack iteration by iteration, try 
        
        hv_matrix_ex = torch.column_stack(hv_matrix)
        test_matrix_ex = torch.column_stack(test_matrix)
        
        C_ex = torch.linalg.cholesky(torch.mm(test_matrix_ex.t(), hv_matrix_ex))
        B_ex = torch.linalg.solve_triangular(C_ex, hv_matrix_ex, upper = False, left = False)
        U, S, V = torch.linalg.svd(B_ex, full_matrices = False)
        self.U = U
        self.S = torch.max(torch.square(S) - torch.tensor(0.1), torch.tensor(0.0))
        

In [199]:
class NysHessianpartial():
    
    def __init__(self, rank, rho):
        self.rank = rank
        self.rho = rho
    
    def get_params_grad(self, model):
        params = []
        grads = []
        for param in model.parameters():
            if not param.requires_grad:
                continue
            params.append(param)
            grads.append(0. if param.grad is None else param.grad + 0.)
        return params, grads
    
    def update_Hessian(self, X_batch, y_batch, model, criterion, device):
        
        shift = 0.001
        params, gradsH = self.get_params_grad(model)
        self.size_vec = [p.size() for p in params]
        test_matrix = []
        hv_matrix = []
        
        for i in range(self.rank):
            
            v = [torch.randn(p.size()).to(device) for p in params]
            v = normalization(v)
            hv_add = [torch.zeros(p.size()).to(device) for p in params]
        
            # update hessian subsample a batch
            
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            model.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch.unsqueeze(1))
            loss.backward(create_graph=True)
            params, gradsH = self.get_params_grad(model)
            hv = torch.autograd.grad(gradsH, params, grad_outputs=v,only_inputs=True,retain_graph=True)
            for i in range(len(hv)):
                hv_add[i].data = hv[i].data.add_(hv_add[i].data)    
                hv_add[i].data = hv_add[i].data.add_(v[i].data * torch.tensor(shift)) 
                         
            hv_ex = torch.cat([gi.view(-1) for gi in hv_add])
            test_ex = torch.cat([gi.view(-1) for gi in v])
            
            hv_matrix.append(hv_ex)
            test_matrix.append(test_ex)
        
        hv_matrix_ex = torch.column_stack(hv_matrix)
        test_matrix_ex = torch.column_stack(test_matrix)
        choleskytarget = torch.mm(test_matrix_ex.t(), hv_matrix_ex)
        
        try:
            C_ex = torch.linalg.cholesky(choleskytarget)
        except:
            eigs, eigvectors = torch.linalg.eigh(C_ex)
            shift = shift + torch.abs(torch.min(L))
            eigs = eigs + shift
            C_ex = torch.linalg.cholesky(torch.mm(eigvectors, torch.mm(torch.diag(eigs), eigvectors.T)))

        B_ex = torch.linalg.solve_triangular(C_ex, hv_matrix_ex, upper = False, left = False)
        U, S, V = torch.linalg.svd(B_ex, full_matrices = False)
        self.U = U
        self.S = torch.max(torch.square(S) - torch.tensor(shift), torch.tensor(0.0))
        

In [None]:
# one hidden layer mlp check
# experiments, 
# randomized svd
# check kth eigenvalue to be positive, ApproxMinEvec in sketchy cgal paper
# when cholesky fails, add to the diagonal to make cholesky work
# if cholesky fails, doing eigen-decomposition of the cholesky matrix, figure out the smallest eigenvalue, that's what
# we want to add as shift + epsilon
# plot landscape and from origin to destination
# cifar10, or mnist may have better performance
# basic resnet, 
# smallest network that close to sota structures, concise net

In [None]:
nysh = NysHessian(2, 0.1)
model.train()
nysh.update_Hessian(train_loader, model, criterion, device)

In [189]:
from torch.optim import Optimizer

In [202]:
class NysHessianOpt(Optimizer):
    r"""Implements NysHessian.
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        rank (int): sketch rank
        rho: regularization
    """
    def __init__(self, params, lr, rank = 10, rho = 1.0):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
            
        defaults = dict(lr=lr, rank = rank, rho = rho)
        self.nysh = NysHessian(rank, rho)
        super(NysHessianOpt, self).__init__(params, defaults)
         
    def step(self):
        for group in self.param_groups:
            rho = group['rho']
            g = torch.cat([p.grad.view(-1) for p in group['params']])
            UTg = torch.mv(self.nysh.U.t(), g) 
            g_new = torch.mv(self.nysh.U, (self.nysh.S + rho).reciprocal() * UTg) + g / rho - torch.mv(self.nysh.U, UTg) / rho            
            ls = 0
            for p in group['params']:
                gp = g_new[ls:ls+torch.numel(p)].view(p.shape)
                ls += torch.numel(p)
                p.data.add_(-group['lr'] * gp)

In [203]:
# model = BinaryClassification()
model = BinaryClassification2()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
optimizer = NysHessianOpt(model.parameters(), lr = LEARNING_RATE)

BinaryClassification2(
  (layer_1): Linear(in_features=27648, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)


In [204]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    optimizer.nysh.update_Hessian(train_loader, model, criterion, device)
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.56188 | Acc: 78.265
Epoch 002: | Loss: 0.53086 | Acc: 80.064
Epoch 003: | Loss: 0.52097 | Acc: 80.103


In [None]:
from torch.optim import Optimizer

# simple sgd optimizer, adapted from pytorch official implementation. 

class SGD_Simple(Optimizer):
    
    def __init__(self, params, lr, weight_decay=0.0):
        
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
            
        defaults = dict(lr=lr, weight_decay=weight_decay)
        
        super(SGD_Simple, self).__init__(params, defaults)

    def step(self):

        for group in self.param_groups:
            weight_decay = group['weight_decay']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                    
                p.data.add_(-group['lr'], d_p)