In [1]:
import torch
from torch import nn
import numpy as np
import tabulate
from sklearn.decomposition import PCA
import os 

# Switch the working directory to the project's `src` folder before the
# project-local imports below (presumably so that the `models` package
# resolves — TODO confirm; the absolute path is machine-specific).
os.chdir('/home/tristan/loss-subspace-geometry-project/loss-subspace-geometry/src')


from models.mlp import SubspaceNN, NN, NonLinearSubspaceNN
from models.subspace_layers import LinesNN

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torchvision
import torchvision.transforms as transforms

In [11]:
# configs
data_dim = 784          # flattened input size fed to the networks
hidden_size = 512       # width of the single hidden layer
out_dim = 10            # number of output classes
dropout_prob = 0.3
seed = 601              # seed of the trained subspace model (part of the checkpoint name)
train_beta = 0.5        # beta value that is part of the checkpoint name
vanilla_seed = 450      # seed of the separately loaded vanilla MLP checkpoint
device = torch.device('cuda:2')

model_path = f'/home/tristan/loss-subspace-geometry-project/loss-subspace-geometry-save/models/nonlinear_subspace_vanilla_mlp_seed_{seed}_beta_{train_beta}_0.pt'

curve_model = NonLinearSubspaceNN(input_dim=data_dim,
                                  hidden_dim=hidden_size,
                                  out_dim=out_dim,
                                  dropout_prob=dropout_prob,
                                  seed=seed).to(device)

# Print the parameter names the freshly built model expects ...
for key in curve_model.state_dict():
    print(key)

# ... and the names stored in the checkpoint, so mismatches are easy to spot.
# map_location keeps the load independent of the device the checkpoint was
# saved from (it may not exist on this machine).
checkpoint = torch.load(model_path, map_location=device)
for key in checkpoint:
    print(key)
curve_model.load_state_dict(checkpoint, strict=True)

mlp.linear.weight
mlp.linear.bias
mlp.linear.line.parameterization_linear_1.weight
mlp.linear.line.parameterization_linear_1.bias
mlp.linear.line.parameterization_linear_2.weight
mlp.linear.line.parameterization_linear_2.bias
out.weight
out.bias
out.line.parameterization_linear_1.weight
out.line.parameterization_linear_1.bias
out.line.parameterization_linear_2.weight
out.line.parameterization_linear_2.bias
mlp.linear.weight
mlp.linear.bias
mlp.linear.line.parameterization_linear_1.weight
mlp.linear.line.parameterization_linear_1.bias
mlp.linear.line.parameterization_linear_2.weight
mlp.linear.line.parameterization_linear_2.bias
out.weight
out.bias
out.line.parameterization_linear_1.weight
out.line.parameterization_linear_1.bias
out.line.parameterization_linear_2.weight
out.line.parameterization_linear_2.bias


<All keys matched successfully>

In [12]:
# more configs

# Number of points per axis of the evaluation grid.
grid_points = 15
# How far the grid extends beyond the [0, 1] range along each axis:
# left/right widen the first (alpha) axis, bottom/top the second (beta) axis.
margin_left = 0.2
margin_right = 0.2
margin_bottom = 0.2
margin_top = 0.2

In [13]:
# Display the module tree of the loaded subspace model.
curve_model

NonLinearSubspaceNN(
  (mlp): NonLinearSubspaceMLP(
    (linear): LinesNN(
      in_features=784, out_features=512, bias=True
      (line): ParameterizedSubspace(
        (parameterization_linear_1): Linear(in_features=1, out_features=10, bias=True)
        (parameterization_linear_2): Linear(in_features=10, out_features=401408, bias=True)
      )
    )
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (out): LinesNN(
    in_features=512, out_features=10, bias=True
    (line): ParameterizedSubspace(
      (parameterization_linear_1): Linear(in_features=1, out_features=10, bias=True)
      (parameterization_linear_2): Linear(in_features=10, out_features=5120, bias=True)
    )
  )
)

In [14]:
curve_parameters = list(curve_model.parameters())

# Positions inside `curve_parameters` that make up each flat vector stored in `w`:
#   w[0] - the network itself: layer-1 weight, layer-1 bias, output weight, output bias
#   w[1] - parameterization network attached to the first layer
#   w[2] - parameterization network attached to the output layer
_parameter_groups = (
    (0, 1, 6, 7),
    (2, 3, 4, 5),
    (8, 9, 10, 11),
)

w = [
    np.concatenate([
        curve_parameters[index].data.cpu().numpy().ravel() for index in group
    ])
    for group in _parameter_groups
]

In [17]:
# Plain MLP that is loaded from its own checkpoint and later used as the
# third reference point of the plane.
isolated_model = NN(input_dim=data_dim,
                    hidden_dim=hidden_size,
                    out_dim=out_dim,
                    dropout_prob=dropout_prob).to(device)
# map_location makes the load independent of the device the checkpoint was
# saved from.
isolated_checkpoint = torch.load(
    f'/home/tristan/loss-subspace-geometry-project/loss-subspace-geometry-save/models/vanilla_mlp_seed_{vanilla_seed}_0.pt',
    map_location=device)
isolated_model.load_state_dict(isolated_checkpoint)

<All keys matched successfully>

In [18]:
# Display the module tree of the vanilla MLP.
isolated_model

NN(
  (mlp): MLP(
    (linear): Linear(in_features=784, out_features=512, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (out): Linear(in_features=512, out_features=10, bias=True)
)

In [19]:
# Flatten every parameter of the vanilla MLP, in `parameters()` order, into
# one 1-D numpy vector.
_isolated_flat = [
    parameter.data.cpu().numpy().ravel()
    for parameter in isolated_model.parameters()
]
isolated_weights = np.concatenate(_isolated_flat)

In [20]:
def sample_subspace(alpha, subspace_net_1, subspace_net_2):
    """Build the flat weight vector of the network at position ``alpha``.

    Sets the ``alpha`` attribute of both subspace layers to a one-element
    tensor, asks each layer for its weight via ``get_weight()`` (presumably
    the parameterization evaluated at ``alpha`` — confirm in
    ``models.subspace_layers``), and concatenates the pieces in the order
    weight 1, bias 1, weight 2, bias 2.

    The biases are read from the notebook-level ``curve_parameters`` list
    (entries 1 and 7), so that list must already exist.

    Returns:
        A numpy array holding the concatenated values.
    """
    position = torch.tensor([float(alpha)])
    for net in (subspace_net_1, subspace_net_2):
        net.alpha = position

    pieces = [
        subspace_net_1.get_weight().clone().detach(),
        curve_parameters[1].clone().detach().cpu(),
        subspace_net_2.get_weight().clone().detach(),
        curve_parameters[7].clone().detach().cpu(),
    ]
    return torch.cat(pieces).numpy()

In [21]:
## Sample uniformly from subspace ##
samples = 1000
alphas = np.linspace(0, 1, samples)

# Fresh subspace layers that will receive the trained parameterization weights.
subspace_net_1 = LinesNN(in_features=data_dim, out_features=hidden_size)
subspace_net_2 = LinesNN(in_features=hidden_size, out_features=out_dim)

# Trained parameterization tensors, in the order
# (linear_1.weight, linear_1.bias, linear_2.weight, linear_2.bias).
subspace_net_1_weights = [curve_parameters[k] for k in (2, 3, 4, 5)]
subspace_net_2_weights = [curve_parameters[k] for k in (8, 9, 10, 11)]

# Copy the trained values into the fresh layers without tracking gradients.
with torch.no_grad():
    for net, trained in ((subspace_net_1, subspace_net_1_weights),
                         (subspace_net_2, subspace_net_2_weights)):
        targets = (
            net.line.parameterization_linear_1.weight,
            net.line.parameterization_linear_1.bias,
            net.line.parameterization_linear_2.weight,
            net.line.parameterization_linear_2.bias,
        )
        for target, source in zip(targets, trained):
            target.copy_(source)

# Evaluate the curve at every alpha; one row per sample.
weight_space_points = []
for alpha in alphas:
    weights = sample_subspace(alpha, subspace_net_1, subspace_net_2)
    weight_space_points.append(weights)

weight_space_points = np.array(weight_space_points)


In [22]:
# Save the sampled weight-space points next to the notebook's working directory.
# The array is passed positionally, so (per the np.savez documentation) it is
# stored under the default key `arr_0`.
np.savez(f'subspace_samples_seed_{seed}_beta_{train_beta}.npz', weight_space_points)

In [14]:
# Perform PCA on the sampled weight-space points to get the top two principal components
# weight_space_points = np.asarray(weight_space_points)
# pca = PCA().fit(weight_space_points)

In [23]:
# End points of the sampled curve: the weight vector at alpha = 0 (used as the
# plane's origin) and at alpha = 1. No PCA projection is involved here.
origin = sample_subspace(0.0, subspace_net_1, subspace_net_2)
c3 = sample_subspace(1.0, subspace_net_1, subspace_net_2)

In [24]:

# set up the basis and helper used for plotting the plane

def get_xy(point, origin, vector_x, vector_y):
    """Return the 2-D coordinates of ``point`` relative to ``origin``.

    Each coordinate is the dot product of ``point - origin`` with the
    corresponding axis vector; the result is a length-2 numpy array
    ``[x, y]``.
    """
    offset = point - origin
    x_coord = np.dot(offset, vector_x)
    y_coord = np.dot(offset, vector_y)
    return np.array([x_coord, y_coord])


print('Weight space dimensionality: %d' % w[0].shape[0])

# First axis: unit vector from the curve start to the curve end;
# dx is the distance between the two end points.
u = c3 - origin
dx = np.linalg.norm(u)
u = u / dx

# Second axis: unit vector from the curve start to the vanilla MLP weights;
# dy is that distance.
# NOTE(review): v is not orthogonalised against u, so the dot-product
# coordinates from get_xy are not orthonormal plane coordinates — confirm
# this is intended.
v = isolated_weights - origin
dy = np.linalg.norm(v)
v = v / dy


Weight space dimensionality: 407050


In [25]:
# Reference points of the plane: curve start (alpha = 0), curve end
# (alpha = 1) and the flattened weights of the vanilla MLP.
w_corners = [origin, c3, isolated_weights]


In [26]:
# Project the reference points and every sampled curve point onto (u, v).
# np.stack needs a real sequence: passing a generator is deprecated and is
# rejected by newer NumPy releases, so build lists explicitly.
bend_coordinates = np.stack([get_xy(p, origin, u, v) for p in w_corners])
curve_coordinates = np.stack([get_xy(p, origin, u, v) for p in weight_space_points])

  if await self.run_code(code, result, async_=asy):


In [27]:
def get_weights(model: nn.Module, t):
    """Collect the flat weight vector of ``model`` at curve position ``t``.

    Every ``nn.Linear``-derived sub-module whose name does not contain
    ``'parameterization'`` gets its ``alpha`` attribute set to a one-element
    float32 tensor holding ``t`` (created on the same device as the module's
    own weight), after which its ``get_weight()`` result and its bias are
    collected.

    Args:
        model: network whose selected layers expose ``get_weight()``.
        t: scalar position on the curve.

    Returns:
        1-D numpy array with, per selected layer, the raveled weight followed
        by the bias.
    """
    weights = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and 'parameterization' not in name:
            # Keep alpha on the layer's own device so get_weight() does not
            # depend on a notebook-level `device` variable.
            module.alpha = torch.tensor(
                [t], dtype=torch.float32, device=module.weight.device)
            weights.extend([module.get_weight(), module.bias.data])
    return np.concatenate([w.detach().cpu().numpy().ravel() for w in weights])


In [28]:
G = grid_points

# Grid positions along each axis, extended past [0, 1] by the configured margins.
alphas = np.linspace(0.0 - margin_left, 1.0 + margin_right, num=G)
betas = np.linspace(0.0 - margin_bottom, 1.0 + margin_top, num=G)

# One independent G x G result array per metric, for the train ("tr_")
# and held-out ("te_") evaluations.
tr_loss, tr_nll, tr_acc, tr_err = (np.zeros((G, G)) for _ in range(4))
te_loss, te_nll, te_acc, te_err = (np.zeros((G, G)) for _ in range(4))

# (x, y) plane coordinates of every grid point.
grid = np.zeros((G, G, 2))

In [29]:
# even more configs for evaluating on FashionMNIST
# Root directory handed to torchvision's FashionMNIST dataset (machine-specific path).
data_dir = '/home/tristan/loss-subspace-geometry-project/data/'
# Batch size of the training-set loader.
batch_size = 128

In [30]:
transform = transforms.Compose([transforms.ToTensor()])
FashionMNIST_data_train = torchvision.datasets.FashionMNIST(
    data_dir, train=True, transform=transform, download=False)

# Seed the split so the same 50000/10000 partition is produced on every run;
# without a generator the partition changes each time the cell executes.
# NOTE(review): this does not guarantee the partition matches the one used
# when the checkpoints were trained — confirm against the training script.
split_generator = torch.Generator().manual_seed(seed)
train_set, val_set = torch.utils.data.random_split(
    FashionMNIST_data_train, [50000, 10000], generator=split_generator)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True)
# The held-out split is served as one single batch.
valid_loader = torch.utils.data.DataLoader(
    val_set, batch_size=len(val_set), shuffle=False)

In [31]:
# Summed (not averaged) cross-entropy: the evaluation routine adds up the
# per-batch values and divides by the dataset size itself.
criterion = nn.CrossEntropyLoss(reduction='sum')

In [32]:
def eval(model: nn.Module, loader):
    """Evaluate ``model`` on every batch of ``loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Inputs are flattened to ``(batch, -1)`` and moved to the
    notebook-level ``device``; the loss is computed with the notebook-level
    ``criterion`` and accumulated across batches.

    Args:
        model: network to evaluate.
        loader: data loader yielding ``(x, y)`` batches; ``loader.dataset``
            must support ``len``.

    Returns:
        dict with ``'nll'`` and ``'loss'`` (both the accumulated loss divided
        by the dataset size) and ``'accuracy'`` in percent.
    """
    running_loss = 0.0
    num_right = 0

    model.eval()

    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory, especially for the single-batch validation loader.
    with torch.no_grad():
        for x, y in loader:
            inputs = x.reshape(x.size(0), -1).to(device)
            targets = y.to(device)
            y_hat = model(inputs)
            num_right += torch.sum(
                targets == torch.argmax(y_hat, dim=-1)).item()
            running_loss += criterion(y_hat, targets).item()

    num_examples = len(loader.dataset)
    return {
        'nll': running_loss / num_examples,
        'loss': running_loss / num_examples,
        'accuracy': num_right * 100.0 / num_examples,
    }

In [33]:
# Extra factor applied to both axis lengths (dx, dy) when placing grid points.
dxy_scale = 15
# Scratch network whose parameters are overwritten for every grid point.
base_model = NN(input_dim=data_dim,
                hidden_dim=hidden_size,
                out_dim=out_dim,
                dropout_prob=dropout_prob).to(device)
print(curve_coordinates)
columns = ['X', 'Y', 'Train loss', 'Train nll', 'Train error (%)', 'Test nll', 'Test error (%)']


for i, alpha in enumerate(alphas):
    for j, beta in enumerate(betas):
        # Weight vector of this grid point in the plane spanned by u and v.
        p = np.array(origin + alpha * dxy_scale*dx * u + beta * dxy_scale*dy * v)

        # Slice the flat vector back into the model's parameters, in
        # `parameters()` order. copy_ already handles the host-to-device
        # transfer, so no extra `.to(device)` is needed.
        offset = 0
        for parameter in base_model.parameters():
            size = parameter.numel()
            value = p[offset:offset+size].reshape(parameter.size())
            parameter.data.copy_(torch.from_numpy(value))
            offset += size
        # The flat vector must be consumed exactly by the model's parameters.
        assert offset == p.size, 'weight vector length does not match base_model'

        tr_res = eval(model=base_model, loader=train_loader)
        te_res = eval(model=base_model, loader=valid_loader)

        tr_loss_v, tr_nll_v, tr_acc_v = tr_res['loss'], tr_res['nll'], tr_res['accuracy']
        te_loss_v, te_nll_v, te_acc_v = te_res['loss'], te_res['nll'], te_res['accuracy']

        grid[i, j] = [alpha * dxy_scale * dx, beta * dxy_scale * dy]

        tr_loss[i, j] = tr_loss_v
        tr_nll[i, j] = tr_nll_v
        tr_acc[i, j] = tr_acc_v
        tr_err[i, j] = 100.0 - tr_acc[i, j]

        te_loss[i, j] = te_loss_v
        te_nll[i, j] = te_nll_v
        te_acc[i, j] = te_acc_v
        te_err[i, j] = 100.0 - te_acc[i, j]

        values = [
            grid[i, j, 0], grid[i, j, 1], tr_loss[i, j], tr_nll[i, j], tr_err[i, j],
            te_nll[i, j], te_err[i, j]
        ]
        table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='10.4f')
        if j == 0:
            # First row of a column sweep: repeat the rule line above the header.
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            # Later rows: print only the data line, not the header again.
            table = table.split('\n')[2]
        print(table)

# Persist everything needed to plot the plane later.
np.savez(
    os.path.join('./', f'plane_nonlinear_subspace_edges_isolated_model_dxscale{dxy_scale}_dyscale{dxy_scale}_seed_{seed}_vanillaseed_{vanilla_seed}_beta_{train_beta}.npz'),
    bend_coordinates=bend_coordinates,
    curve_coordinates=curve_coordinates,
    alphas=alphas,
    betas=betas,
    grid=grid,
    tr_loss=tr_loss,
    tr_acc=tr_acc,
    tr_nll=tr_nll,
    tr_err=tr_err,
    te_loss=te_loss,
    te_acc=te_acc,
    te_nll=te_nll,
    te_err=te_err
)

[[0.0000000e+00 0.0000000e+00]
 [2.4687190e-02 2.3112601e-02]
 [4.9760334e-02 4.6227191e-02]
 ...
 [9.8478271e+01 4.5640114e+01]
 [9.8612396e+01 4.5695404e+01]
 [9.8745560e+01 4.5749802e+01]]
----------  ----------  ------------  -----------  -----------------  ----------  ----------------
         X           Y    Train loss    Train nll    Train error (%)    Test nll    Test error (%)
----------  ----------  ------------  -----------  -----------------  ----------  ----------------
 -296.2365   -309.7039      173.7082     173.7082            55.4880    178.4916           56.1700
 -296.2365   -154.8519       38.4690      38.4690            47.4920     39.5906           48.6100
 -296.2365     -0.0000        0.4384       0.4384            15.3320      0.4479           15.8500
 -296.2365    154.8519        0.3477       0.3477            10.1200      0.3555           10.3800
 -296.2365    309.7039        0.9874       0.9874             9.5940      1.0073            9.9000
 -296.2365    46