In [1]:
import os
import copy
import math
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})
# PyTorch
import torch
torch.manual_seed(42)
import torchvision
import torchmetrics
# Laplace
import laplace

In [2]:
import sys
sys.path.append('../src/')

%load_ext autoreload
%autoreload 2
# Importing our custom module(s)
import layers
import losses
import models
import utils

In [67]:
################################################################################
# Automatic second-order differentiation library (ASDL) supports Linear, 
# Conv2d, BatchNorm1d, BatchNorm2d, LayerNorm, and Embedding layers (see 
# https://github.com/kazukiosawa/asdl). We replace LayerNorm2d and CNBlock with 
# mathematically equivalent implementations.
################################################################################

# model1: pretrained ConvNeXt-Tiny with LayerNorm2d / CNBlock swapped for the
# replacements provided by utils.
model1 = torchvision.models.convnext_tiny(
    weights=torchvision.models.ConvNeXt_Tiny_Weights.DEFAULT,
)
utils.replace_layernorm2d(model1)
utils.replace_cnblock(model1)

# model2: the unmodified pretrained reference.
model2 = torchvision.models.convnext_tiny(
    weights=torchvision.models.ConvNeXt_Tiny_Weights.DEFAULT,
)

for net in (model1, model2):
    net.eval()

x = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    out1, out2 = model1(x), model2(x)

# Summed absolute difference between the two outputs on the same random input.
print((out1 - out2).abs().sum().item())

0.0


In [68]:
# NOTE: weights_only=False lets torch.load un-pickle arbitrary objects; only
# use it on files from a trusted source.
backbone_prior_params = torch.load(
    "/cluster/tufts/hugheslab/eharve06/convnext_tiny_torchvision/convnext_tiny_torchvision_mean.pt",
    map_location=torch.device("cpu"),
    weights_only=False,
)

params1 = utils.flatten_params(model1)
params2 = utils.flatten_params(model2)

# Summed absolute difference between the two flattened parameter vectors.
print((params1 - params2).abs().sum().item())

# Compare the leading entries of each flattened vector against the stored
# backbone prior mean.
num_backbone_params = len(backbone_prior_params)
for params in (params1, params2):
    print((params[:num_backbone_params] - backbone_prior_params).abs().sum().item())

0.0
0.0
0.0


In [9]:
def format_time(seconds):
    """Format a duration given in seconds as hours, minutes and whole seconds.

    Args:
        seconds: Duration in seconds (int or float). Fractional seconds are
            floored so the result always shows whole units.

    Returns:
        A string of the form "H hrs. M mins. S secs.".
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    # `.0f` renders float inputs (e.g. summed runtimes) without a trailing
    # ".0"; `// 1` floors the seconds so 59.9 never prints as "60 secs.".
    return f"{hours:.0f} hrs. {minutes:.0f} mins. {secs // 1:.0f} secs."

In [18]:
# Collect the final-epoch metrics of every available 'l2-zero' CIFAR-10 run.
criterion = 'l2-zero'
dataset_directory = '/cluster/tufts/hugheslab/eharve06/CIFAR-10'
experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_CIFAR-10_diagEF'
lr_0s = [0.1, 0.01, 0.001, 0.0001]
ns = [100, 1000, 10000, 50000]
random_states = [1001, 2001, 3001]

columns = ['criterion', 'model_name', 'n', 'random_state', 'runtime', 'train_log_marglik', 'test_acc', 'test_nll']

# Gather rows in a list and build the frame once, instead of enlarging a
# DataFrame one row at a time.
records = []
for lr_0, n, random_state in itertools.product(lr_0s, ns, random_states):
    model_name = f'{criterion}_lr_0={lr_0}_n={n}_random_state={random_state}'
    csv_path = f'{experiments_directory}/{model_name}.csv'
    # Configurations without a log file are skipped.
    if not os.path.exists(csv_path):
        continue
    temp_df = pd.read_csv(csv_path)
    # A header-only log has no final epoch to read.
    if temp_df.empty:
        continue
    records.append([
        criterion,
        model_name,
        n,
        random_state,
        temp_df['train_sec/epoch'].sum(),  # total training time over all logged epochs
        temp_df.train_log_marglik.values[-1],
        temp_df.val_or_test_acc.values[-1],
        temp_df.val_or_test_nll.values[-1],
    ])

retrained_df = pd.DataFrame(records, columns=columns)

In [109]:
# For each (criterion, n, random_state) keep the run with the highest final
# training log marginal likelihood, i.e. one lr_0 per group.
marglik_by_group = retrained_df.groupby(['criterion', 'n', 'random_state'])['train_log_marglik']
best_indices = marglik_by_group.idxmax()
retrained_df = retrained_df.loc[best_indices]

retrained_df

Unnamed: 0,criterion,model_name,n,random_state,runtime,train_log_marglik,test_acc,test_nll
12,l2-zero,l2-zero_lr_0=0.01_n=100_random_state=1001,100,1001,7008.146201,-263.259644,0.6745,1.057087
13,l2-zero,l2-zero_lr_0=0.01_n=100_random_state=2001,100,2001,7000.924933,-242.931412,0.6659,1.05582
14,l2-zero,l2-zero_lr_0=0.01_n=100_random_state=3001,100,3001,6952.630825,-223.579773,0.6846,1.024537
15,l2-zero,l2-zero_lr_0=0.01_n=1000_random_state=1001,1000,1001,4952.520816,-404.404633,0.8877,0.442697
16,l2-zero,l2-zero_lr_0=0.01_n=1000_random_state=2001,1000,2001,4963.107343,-424.987091,0.8809,0.466042
17,l2-zero,l2-zero_lr_0=0.01_n=1000_random_state=3001,1000,3001,5867.403218,-388.481537,0.8779,0.480824
18,l2-zero,l2-zero_lr_0=0.01_n=10000_random_state=1001,10000,1001,4168.477561,-1875.979858,0.9509,0.203911
19,l2-zero,l2-zero_lr_0=0.01_n=10000_random_state=2001,10000,2001,4130.892845,-2439.328857,0.9506,0.211288
20,l2-zero,l2-zero_lr_0=0.01_n=10000_random_state=3001,10000,3001,4162.006493,-2173.709717,0.9517,0.19149
21,l2-zero,l2-zero_lr_0=0.01_n=50000_random_state=1001,50000,1001,4176.966407,-146591.953125,0.9696,0.116327


In [110]:
# Collapse the rows into one per (criterion, n); every remaining column
# becomes a tuple of the grouped values.
grouped_df = retrained_df.groupby(['criterion', 'n']).agg(lambda x: tuple(x))
columns = ['test_acc', 'test_nll']
# Statistics computed over each tuple. np.std is called with its default
# ddof=0 (population standard deviation).
summary_stats = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
for column in columns:
    for stat_name, stat_fn in summary_stats.items():
        grouped_df[f'{column}_{stat_name}'] = grouped_df[column].apply(lambda item: stat_fn(item))
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,criterion,n,model_name,random_state,runtime,train_log_marglik,test_acc,test_nll,test_acc_mean,test_acc_std,test_acc_min,test_acc_max,test_nll_mean,test_nll_std,test_nll_min,test_nll_max
0,l2-zero,100,"(l2-zero_lr_0=0.01_n=100_random_state=1001, l2...","(1001, 2001, 3001)","(7008.146201372147, 7000.924933433533, 6952.63...","(-263.2596435546875, -242.9314117431641, -223....","(0.6744999885559082, 0.6658999919891357, 0.684...","(1.0570869313240052, 1.0558203800201416, 1.024...",0.675,0.007642,0.6659,0.6846,1.045815,0.015054,1.024537,1.057087
1,l2-zero,1000,"(l2-zero_lr_0=0.01_n=1000_random_state=1001, l...","(1001, 2001, 3001)","(4952.520815849304, 4963.107343196869, 5867.40...","(-404.4046325683594, -424.9870910644531, -388....","(0.887700080871582, 0.8809000849723816, 0.8779...","(0.4426965748786928, 0.466041524219513, 0.4808...",0.882167,0.0041,0.8779,0.8877,0.463187,0.015696,0.442697,0.480824
2,l2-zero,10000,"(l2-zero_lr_0=0.01_n=10000_random_state=1001, ...","(1001, 2001, 3001)","(4168.477560520172, 4130.892844676971, 4162.00...","(-1875.9798583984373, -2439.328857421875, -217...","(0.9509000778198242, 0.95059996843338, 0.95170...","(0.2039114191532135, 0.21128786611557, 0.19148...",0.951067,0.000464,0.9506,0.9517,0.20223,0.00817,0.19149,0.211288
3,l2-zero,50000,"(l2-zero_lr_0=0.01_n=50000_random_state=1001, ...","(1001, 2001, 3001)","(4176.966406583786, 4114.107445716858, 4109.98...","(-146591.953125, -170956.5, -149952.875)","(0.9696000218391418, 0.9695000648498536, 0.969...","(0.1163268289327621, 0.1145831343889236, 0.115...",0.9694,0.000216,0.9691,0.9696,0.11534,0.00073,0.114583,0.116327


In [111]:
# Test accuracy summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_acc_mean', 'test_acc_min', 'test_acc_max']]

Unnamed: 0,criterion,n,test_acc_mean,test_acc_min,test_acc_max
0,l2-zero,100,0.675,0.6659,0.6846
1,l2-zero,1000,0.882167,0.8779,0.8877
2,l2-zero,10000,0.951067,0.9506,0.9517
3,l2-zero,50000,0.9694,0.9691,0.9696


In [112]:
# Test negative log-likelihood summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_nll_mean', 'test_nll_min', 'test_nll_max']]

Unnamed: 0,criterion,n,test_nll_mean,test_nll_min,test_nll_max
0,l2-zero,100,1.045815,1.024537,1.057087
1,l2-zero,1000,0.463187,0.442697,0.480824
2,l2-zero,10000,0.20223,0.19149,0.211288
3,l2-zero,50000,0.11534,0.114583,0.116327


In [16]:
# Collect the final-epoch metrics of every available 'l2-sp' ConvNeXt-Tiny run.
criterion = 'l2-sp'
dataset_directory = '/cluster/tufts/hugheslab/eharve06/CIFAR-10'
# Active experiment directory; the commented lines are the alternatives for
# the other datasets (switch `ns` below to match).
experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_CIFAR-10_ConvNeXt-Tiny_diagEF'
#experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_Flower-102_ConvNeXt-Tiny_diagEF'
#experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_Pet-37_ConvNeXt-Tiny_diagEF'
lr_0s = [0.1, 0.01, 0.001, 0.0001]
ns = [100, 1000, 10000, 50000]
#ns = [510, 1020]
#ns = [370, 3441]
random_states = [1001, 2001, 3001]

columns = ['criterion', 'model_name', 'n', 'random_state', 'runtime', 'train_lml', 'test_acc', 'test_nll']

# Gather rows in a list and build the frame once, instead of enlarging a
# DataFrame one row at a time.
records = []
for lr_0, n, random_state in itertools.product(lr_0s, ns, random_states):
    model_name = f'{criterion}_lr_0={lr_0}_n={n}_random_state={random_state}'
    csv_path = f'{experiments_directory}/{model_name}.csv'
    # Configurations without a log file are skipped.
    if not os.path.exists(csv_path):
        continue
    temp_df = pd.read_csv(csv_path)
    # A header-only log has no final epoch to read.
    if temp_df.empty:
        continue
    # NOTE(review): these logs are read through a 'train_lml' column; a
    # previously commented-out variant read 'train_log_marglik' instead -
    # confirm which name the chosen experiment directory uses.
    records.append([
        criterion,
        model_name,
        n,
        random_state,
        temp_df['train_sec/epoch'].sum(),  # total training time over all logged epochs
        temp_df.train_lml.values[-1],
        temp_df.val_or_test_acc.values[-1],
        temp_df.val_or_test_nll.values[-1],
    ])

retrained_df = pd.DataFrame(records, columns=columns)

In [17]:
# For each (criterion, n, random_state) keep the run with the highest final
# 'train_lml', i.e. one lr_0 per group.
lml_by_group = retrained_df.groupby(['criterion', 'n', 'random_state'])['train_lml']
best_indices = lml_by_group.idxmax()
retrained_df = retrained_df.loc[best_indices]

retrained_df

Unnamed: 0,criterion,model_name,n,random_state,runtime,train_lml,test_acc,test_nll
0,l2-sp,l2-sp_lr_0=0.1_n=100_random_state=1001,100,1001,14134.493459,-138.64151,0.7981,1.089607
13,l2-sp,l2-sp_lr_0=0.01_n=100_random_state=2001,100,2001,14010.137941,-155.277374,0.8058,1.039174
14,l2-sp,l2-sp_lr_0=0.01_n=100_random_state=3001,100,3001,14105.137975,-153.154755,0.8129,0.935825
3,l2-sp,l2-sp_lr_0=0.1_n=1000_random_state=1001,1000,1001,14368.328732,-640.153687,0.889,0.396165
4,l2-sp,l2-sp_lr_0=0.1_n=1000_random_state=2001,1000,2001,14271.262175,-674.388794,0.8868,0.40401
17,l2-sp,l2-sp_lr_0=0.01_n=1000_random_state=3001,1000,3001,14376.422733,-686.632385,0.885,0.413887
6,l2-sp,l2-sp_lr_0=0.1_n=10000_random_state=1001,10000,1001,12942.582637,-4436.797852,0.9265,0.242917
7,l2-sp,l2-sp_lr_0=0.1_n=10000_random_state=2001,10000,2001,12896.968371,-4429.620117,0.9256,0.241054
8,l2-sp,l2-sp_lr_0=0.1_n=10000_random_state=3001,10000,3001,12819.669056,-4422.542969,0.9247,0.244053
9,l2-sp,l2-sp_lr_0=0.1_n=50000_random_state=1001,50000,1001,12717.618092,-15774.743164,0.95,0.157073


In [18]:
# Collapse the rows into one per (criterion, n); every remaining column
# becomes a tuple of the grouped values.
grouped_df = retrained_df.groupby(['criterion', 'n']).agg(lambda x: tuple(x))
columns = ['test_acc', 'test_nll']
# Statistics computed over each tuple. np.std is called with its default
# ddof=0 (population standard deviation).
summary_stats = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
for column in columns:
    for stat_name, stat_fn in summary_stats.items():
        grouped_df[f'{column}_{stat_name}'] = grouped_df[column].apply(lambda item: stat_fn(item))
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,criterion,n,model_name,random_state,runtime,train_lml,test_acc,test_nll,test_acc_mean,test_acc_std,test_acc_min,test_acc_max,test_nll_mean,test_nll_std,test_nll_min,test_nll_max
0,l2-sp,100,"(l2-sp_lr_0=0.1_n=100_random_state=1001, l2-sp...","(1001, 2001, 3001)","(14134.493458747864, 14010.137941360474, 14105...","(-138.64151000976562, -155.27737426757812, -15...","(0.7981, 0.8058, 0.8129000000000002)","(1.0896070217132563, 1.0391739906311035, 0.935...",0.8056,0.006044,0.7981,0.8129,1.021535,0.064008,0.935825,1.089607
1,l2-sp,1000,"(l2-sp_lr_0=0.1_n=1000_random_state=1001, l2-s...","(1001, 2001, 3001)","(14368.328731775284, 14271.262174844742, 14376...","(-640.1536865234375, -674.3887939453125, -686....","(0.8889999999999999, 0.8868, 0.885)","(0.3961651851654053, 0.4040104643344879, 0.413...",0.886933,0.001636,0.885,0.889,0.404688,0.007251,0.396165,0.413887
2,l2-sp,10000,"(l2-sp_lr_0=0.1_n=10000_random_state=1001, l2-...","(1001, 2001, 3001)","(12942.582637310028, 12896.968371152878, 12819...","(-4436.7978515625, -4429.6201171875, -4422.542...","(0.9265, 0.9256, 0.9247)","(0.2429165601730347, 0.2410544903755187, 0.244...",0.9256,0.000735,0.9247,0.9265,0.242675,0.001236,0.241054,0.244053
3,l2-sp,50000,"(l2-sp_lr_0=0.1_n=50000_random_state=1001, l2-...","(1001, 2001, 3001)","(12717.618091583252, 12878.859502315521, 12610...","(-15774.7431640625, -15740.7216796875, -15787....","(0.95, 0.9505, 0.9504)","(0.1570733209609985, 0.1559031601190567, 0.157...",0.9503,0.000216,0.95,0.9505,0.156839,0.000689,0.155903,0.15754


In [19]:
# Test accuracy summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_acc_mean', 'test_acc_min', 'test_acc_max']]

Unnamed: 0,criterion,n,test_acc_mean,test_acc_min,test_acc_max
0,l2-sp,100,0.8056,0.7981,0.8129
1,l2-sp,1000,0.886933,0.885,0.889
2,l2-sp,10000,0.9256,0.9247,0.9265
3,l2-sp,50000,0.9503,0.95,0.9505


In [20]:
# Test negative log-likelihood summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_nll_mean', 'test_nll_min', 'test_nll_max']]

Unnamed: 0,criterion,n,test_nll_mean,test_nll_min,test_nll_max
0,l2-sp,100,1.021535,0.935825,1.089607
1,l2-sp,1000,0.404688,0.396165,0.413887
2,l2-sp,10000,0.242675,0.241054,0.244053
3,l2-sp,50000,0.156839,0.155903,0.15754


In [None]:
# Report the training runtime of the 'l2-sp' run(s) with n=50000 and
# random_state=1001 that are currently in retrained_df.
condition = (retrained_df.n==50000)&(retrained_df.criterion=='l2-sp')&(retrained_df.random_state==1001)
temp_df = retrained_df.loc[condition]
total_time = temp_df.runtime.sum()
num_runs = temp_df.shape[0]
# Avoid dividing by zero when no run matches the condition.
average_time = total_time / num_runs if num_runs else float('nan')
# format_time already renders hours/minutes/seconds, so the labels carry no
# "(sec.)" unit suffix.
print(f'Average runtime: {format_time(average_time)}')
print(f'Total time: {format_time(total_time)}')

In [11]:
# NOTE(review): this repeats the earlier 'train_lml' selection cell. The saved
# output below shows a 'train_log_marglik' column, so it appears to come from
# a different run of the notebook - confirm whether this cell is still needed.
lml_by_group = retrained_df.groupby(['criterion', 'n', 'random_state'])['train_lml']
best_indices = lml_by_group.idxmax()
retrained_df = retrained_df.loc[best_indices]

retrained_df

Unnamed: 0,criterion,model_name,n,random_state,runtime,train_log_marglik,test_acc,test_nll
12,l2-sp,l2-sp_lr_0=0.01_n=100_random_state=1001,100,1001,7771.870146,-230.25737,0.3405,2.302569
1,l2-sp,l2-sp_lr_0=0.1_n=100_random_state=2001,100,2001,7320.807987,-230.257629,0.3673,2.302573
14,l2-sp,l2-sp_lr_0=0.01_n=100_random_state=3001,100,3001,7323.553846,-230.256729,0.3423,2.30256
3,l2-sp,l2-sp_lr_0=0.1_n=1000_random_state=1001,1000,1001,5009.549274,-1781.31543,0.722,1.193988
16,l2-sp,l2-sp_lr_0=0.01_n=1000_random_state=2001,1000,2001,4991.712644,-1749.486816,0.7241,1.171732
17,l2-sp,l2-sp_lr_0=0.01_n=1000_random_state=3001,1000,3001,5644.311024,-1749.390137,0.7134,1.17217
18,l2-sp,l2-sp_lr_0=0.01_n=10000_random_state=1001,10000,1001,4249.52974,-11346.691406,0.7628,0.777652
19,l2-sp,l2-sp_lr_0=0.01_n=10000_random_state=2001,10000,2001,4185.410539,-11566.898438,0.765,0.781921
20,l2-sp,l2-sp_lr_0=0.01_n=10000_random_state=3001,10000,3001,4257.878783,-11260.216797,0.7645,0.78406
9,l2-sp,l2-sp_lr_0=0.1_n=50000_random_state=1001,50000,1001,4476.865883,-31109.111328,0.9372,0.186945


In [12]:
# Collapse the rows into one per (criterion, n); every remaining column
# becomes a tuple of the grouped values.
grouped_df = retrained_df.groupby(['criterion', 'n']).agg(lambda x: tuple(x))
columns = ['test_acc', 'test_nll']
# Statistics computed over each tuple. np.std is called with its default
# ddof=0 (population standard deviation).
summary_stats = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
for column in columns:
    for stat_name, stat_fn in summary_stats.items():
        grouped_df[f'{column}_{stat_name}'] = grouped_df[column].apply(lambda item: stat_fn(item))
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,criterion,n,model_name,random_state,runtime,train_log_marglik,test_acc,test_nll,test_acc_mean,test_acc_std,test_acc_min,test_acc_max,test_nll_mean,test_nll_std,test_nll_min,test_nll_max
0,l2-sp,100,"(l2-sp_lr_0=0.01_n=100_random_state=1001, l2-s...","(1001, 2001, 3001)","(7771.870146036148, 7320.807987213135, 7323.55...","(-230.2573699951172, -230.25762939453125, -230...","(0.3404999673366546, 0.3673000037670135, 0.342...","(2.302569054031372, 2.3025725662231444, 2.3025...",0.350033,0.012231,0.3405,0.3673,2.302567,5e-06,2.30256,2.302573
1,l2-sp,1000,"(l2-sp_lr_0=0.1_n=1000_random_state=1001, l2-s...","(1001, 2001, 3001)","(5009.5492742061615, 4991.712643861771, 5644.3...","(-1781.3154296875, -1749.48681640625, -1749.39...","(0.722000002861023, 0.7240999937057495, 0.7134...","(1.1939880359649662, 1.1717322761535645, 1.172...",0.719833,0.004629,0.7134,0.7241,1.179297,0.01039,1.171732,1.193988
2,l2-sp,10000,"(l2-sp_lr_0=0.01_n=10000_random_state=1001, l2...","(1001, 2001, 3001)","(4249.529740095139, 4185.4105389118195, 4257.8...","(-11346.69140625, -11566.8984375, -11260.21679...","(0.7628000378608704, 0.7649999856948853, 0.764...","(0.7776520441055297, 0.7819208739280702, 0.784...",0.7641,0.000942,0.7628,0.765,0.781211,0.002664,0.777652,0.78406
3,l2-sp,50000,"(l2-sp_lr_0=0.1_n=50000_random_state=1001, l2-...","(1001, 2001, 3001)","(4476.865883111954, 4307.140010595322, 4162.76...","(-31109.111328125, -29757.19921875, -30299.488...","(0.9372000694274902, 0.9431999921798706, 0.943...","(0.1869448745250701, 0.1738006152153015, 0.174...",0.9414,0.00298,0.9372,0.9438,0.178508,0.005979,0.173801,0.186945


In [13]:
# Test accuracy summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_acc_mean', 'test_acc_min', 'test_acc_max']]

Unnamed: 0,criterion,n,test_acc_mean,test_acc_min,test_acc_max
0,l2-sp,100,0.350033,0.3405,0.3673
1,l2-sp,1000,0.719833,0.7134,0.7241
2,l2-sp,10000,0.7641,0.7628,0.765
3,l2-sp,50000,0.9414,0.9372,0.9438


In [14]:
# Test negative log-likelihood summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_nll_mean', 'test_nll_min', 'test_nll_max']]

Unnamed: 0,criterion,n,test_nll_mean,test_nll_min,test_nll_max
0,l2-sp,100,2.302567,2.30256,2.302573
1,l2-sp,1000,1.179297,1.171732,1.193988
2,l2-sp,10000,0.781211,0.777652,0.78406
3,l2-sp,50000,0.178508,0.173801,0.186945


In [15]:
# Collect the final-epoch metrics of every available 'l2-zero' Oxford-IIIT Pet
# run, then keep the best run per (criterion, n, random_state).
criterion = 'l2-zero'
dataset_directory = '/cluster/tufts/hugheslab/eharve06/Oxford-IIIT_Pet'
experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_Oxford-IIIT_Pet_diagEF'
lr_0s = [0.1, 0.01, 0.001, 0.0001]
ns = [370, 3441]
random_states = [1001, 2001, 3001]

columns = ['criterion', 'model_name', 'n', 'random_state', 'train_log_marglik', 'test_acc', 'test_nll']

# Gather rows in a list and build the frame once, instead of enlarging a
# DataFrame one row at a time.
records = []
for lr_0, n, random_state in itertools.product(lr_0s, ns, random_states):
    model_name = f'{criterion}_lr_0={lr_0}_n={n}_random_state={random_state}'
    csv_path = f'{experiments_directory}/{model_name}.csv'
    # Configurations without a log file are skipped.
    if not os.path.exists(csv_path):
        continue
    temp_df = pd.read_csv(csv_path)
    # A header-only log has no final epoch to read.
    if temp_df.empty:
        continue
    records.append([
        criterion,
        model_name,
        n,
        random_state,
        temp_df.train_log_marglik.values[-1],
        temp_df.val_or_test_acc.values[-1],
        temp_df.val_or_test_nll.values[-1],
    ])
retrained_df = pd.DataFrame(records, columns=columns)

# Drop runs whose final log marginal likelihood is missing, then keep the run
# with the highest value in each group (one lr_0 per group).
filtered_df = retrained_df[retrained_df['train_log_marglik'].notna()]
best_indices = filtered_df.groupby(['criterion', 'n', 'random_state'])['train_log_marglik'].idxmax()
retrained_df = filtered_df.loc[best_indices]

retrained_df

Unnamed: 0,criterion,model_name,n,random_state,train_log_marglik,test_acc,test_nll
6,l2-zero,l2-zero_lr_0=0.01_n=370_random_state=1001,370,1001,-459.612122,0.884692,0.409536
7,l2-zero,l2-zero_lr_0=0.01_n=370_random_state=2001,370,2001,-460.31366,0.878255,0.420677
8,l2-zero,l2-zero_lr_0=0.01_n=370_random_state=3001,370,3001,-475.961823,0.881983,0.369925
9,l2-zero,l2-zero_lr_0=0.01_n=3441_random_state=1001,3441,1001,-1272.866211,0.932886,0.282283
10,l2-zero,l2-zero_lr_0=0.01_n=3441_random_state=2001,3441,2001,-1324.654175,0.93152,0.291258
11,l2-zero,l2-zero_lr_0=0.01_n=3441_random_state=3001,3441,3001,-1320.142212,0.930102,0.285095


In [16]:
# Collapse the rows into one per (criterion, n); every remaining column
# becomes a tuple of the grouped values.
grouped_df = retrained_df.groupby(['criterion', 'n']).agg(lambda x: tuple(x))
columns = ['test_acc', 'test_nll']
# Statistics computed over each tuple. np.std is called with its default
# ddof=0 (population standard deviation).
summary_stats = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
for column in columns:
    for stat_name, stat_fn in summary_stats.items():
        grouped_df[f'{column}_{stat_name}'] = grouped_df[column].apply(lambda item: stat_fn(item))
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,criterion,n,model_name,random_state,train_log_marglik,test_acc,test_nll,test_acc_mean,test_acc_std,test_acc_min,test_acc_max,test_nll_mean,test_nll_std,test_nll_min,test_nll_max
0,l2-zero,370,"(l2-zero_lr_0=0.01_n=370_random_state=1001, l2...","(1001, 2001, 3001)","(-459.6121215820313, -460.3136596679688, -475....","(0.8846922516822815, 0.8782547116279602, 0.881...","(0.4095357748509493, 0.420676869790799, 0.3699...",0.881643,0.002639,0.878255,0.884692,0.400046,0.021779,0.369925,0.420677
1,l2-zero,3441,"(l2-zero_lr_0=0.01_n=3441_random_state=1001, l...","(1001, 2001, 3001)","(-1272.8662109375, -1324.6541748046875, -1320....","(0.9328855276107788, 0.9315203428268432, 0.930...","(0.282282579809013, 0.291258049339389, 0.28509...",0.931503,0.001136,0.930102,0.932886,0.286212,0.003748,0.282283,0.291258


In [17]:
# Test accuracy summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_acc_mean', 'test_acc_min', 'test_acc_max']]

Unnamed: 0,criterion,n,test_acc_mean,test_acc_min,test_acc_max
0,l2-zero,370,0.881643,0.878255,0.884692
1,l2-zero,3441,0.931503,0.930102,0.932886


In [18]:
# Test negative log-likelihood summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_nll_mean', 'test_nll_min', 'test_nll_max']]

Unnamed: 0,criterion,n,test_nll_mean,test_nll_min,test_nll_max
0,l2-zero,370,0.400046,0.369925,0.420677
1,l2-zero,3441,0.286212,0.282283,0.291258


In [19]:
# Collect the final-epoch metrics of every available 'l2-sp' Oxford-IIIT Pet
# run, then keep the best run per (criterion, n, random_state).
criterion = 'l2-sp'
dataset_directory = '/cluster/tufts/hugheslab/eharve06/Oxford-IIIT_Pet'
experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_Oxford-IIIT_Pet_diagEF'
lr_0s = [0.1, 0.01, 0.001, 0.0001]
ns = [370, 3441]
random_states = [1001, 2001, 3001]

columns = ['criterion', 'model_name', 'n', 'random_state', 'train_log_marglik', 'test_acc', 'test_nll']

# Gather rows in a list and build the frame once, instead of enlarging a
# DataFrame one row at a time.
records = []
for lr_0, n, random_state in itertools.product(lr_0s, ns, random_states):
    model_name = f'{criterion}_lr_0={lr_0}_n={n}_random_state={random_state}'
    csv_path = f'{experiments_directory}/{model_name}.csv'
    # Configurations without a log file are skipped.
    if not os.path.exists(csv_path):
        continue
    temp_df = pd.read_csv(csv_path)
    # A header-only log has no final epoch to read.
    if temp_df.empty:
        continue
    records.append([
        criterion,
        model_name,
        n,
        random_state,
        temp_df.train_log_marglik.values[-1],
        temp_df.val_or_test_acc.values[-1],
        temp_df.val_or_test_nll.values[-1],
    ])
retrained_df = pd.DataFrame(records, columns=columns)

# Drop runs whose final log marginal likelihood is missing, then keep the run
# with the highest value in each group (one lr_0 per group).
filtered_df = retrained_df[retrained_df['train_log_marglik'].notna()]
best_indices = filtered_df.groupby(['criterion', 'n', 'random_state'])['train_log_marglik'].idxmax()
retrained_df = filtered_df.loc[best_indices]

retrained_df

Unnamed: 0,criterion,model_name,n,random_state,train_log_marglik,test_acc,test_nll
6,l2-sp,l2-sp_lr_0=0.01_n=370_random_state=1001,370,1001,-929.215149,0.864195,1.353605
1,l2-sp,l2-sp_lr_0=0.1_n=370_random_state=2001,370,2001,-915.322754,0.863962,1.284406
2,l2-sp,l2-sp_lr_0=0.1_n=370_random_state=3001,370,3001,-917.240967,0.872751,1.369776
3,l2-sp,l2-sp_lr_0=0.1_n=3441_random_state=1001,3441,1001,-2850.339844,0.906682,0.411867
4,l2-sp,l2-sp_lr_0=0.1_n=3441_random_state=2001,3441,2001,-2810.989258,0.906163,0.411842
5,l2-sp,l2-sp_lr_0=0.1_n=3441_random_state=3001,3441,3001,-2839.485107,0.905492,0.411417


In [20]:
# Collapse the rows into one per (criterion, n); every remaining column
# becomes a tuple of the grouped values.
grouped_df = retrained_df.groupby(['criterion', 'n']).agg(lambda x: tuple(x))
columns = ['test_acc', 'test_nll']
# Statistics computed over each tuple. np.std is called with its default
# ddof=0 (population standard deviation).
summary_stats = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
for column in columns:
    for stat_name, stat_fn in summary_stats.items():
        grouped_df[f'{column}_{stat_name}'] = grouped_df[column].apply(lambda item: stat_fn(item))
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,criterion,n,model_name,random_state,train_log_marglik,test_acc,test_nll,test_acc_mean,test_acc_std,test_acc_min,test_acc_max,test_nll_mean,test_nll_std,test_nll_min,test_nll_max
0,l2-sp,370,"(l2-sp_lr_0=0.01_n=370_random_state=1001, l2-s...","(1001, 2001, 3001)","(-929.2151489257812, -915.32275390625, -917.24...","(0.8641949892044067, 0.8639618158340454, 0.872...","(1.35360466836096, 1.2844058398688385, 1.36977...",0.866969,0.004089,0.863962,0.872751,1.335929,0.037026,1.284406,1.369776
1,l2-sp,3441,"(l2-sp_lr_0=0.1_n=3441_random_state=1001, l2-s...","(1001, 2001, 3001)","(-2850.33984375, -2810.9892578125, -2839.48510...","(0.9066816568374634, 0.906163454055786, 0.9054...","(0.4118665885733595, 0.4118419586150059, 0.411...",0.906112,0.000487,0.905492,0.906682,0.411708,0.000206,0.411417,0.411867


In [21]:
# Test accuracy summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_acc_mean', 'test_acc_min', 'test_acc_max']]

Unnamed: 0,criterion,n,test_acc_mean,test_acc_min,test_acc_max
0,l2-sp,370,0.866969,0.863962,0.872751
1,l2-sp,3441,0.906112,0.905492,0.906682


In [22]:
# Test negative log-likelihood summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_nll_mean', 'test_nll_min', 'test_nll_max']]

Unnamed: 0,criterion,n,test_nll_mean,test_nll_min,test_nll_max
0,l2-sp,370,1.335929,1.284406,1.369776
1,l2-sp,3441,0.411708,0.411417,0.411867


In [23]:
# Collect the final-epoch metrics of every available 'l2-zero' Flowers-102
# run, then keep the best run per (criterion, n, random_state).
criterion = 'l2-zero'
dataset_directory = '/cluster/tufts/hugheslab/eharve06/Flowers_102'
experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_Flowers_102_diagEF'
lr_0s = [0.1, 0.01, 0.001, 0.0001]
ns = [510, 1020]
random_states = [1001, 2001, 3001]

columns = ['criterion', 'model_name', 'n', 'random_state', 'train_log_marglik', 'test_acc', 'test_nll']

# Gather rows in a list and build the frame once, instead of enlarging a
# DataFrame one row at a time.
records = []
for lr_0, n, random_state in itertools.product(lr_0s, ns, random_states):
    model_name = f'{criterion}_lr_0={lr_0}_n={n}_random_state={random_state}'
    csv_path = f'{experiments_directory}/{model_name}.csv'
    # Configurations without a log file are skipped.
    if not os.path.exists(csv_path):
        continue
    temp_df = pd.read_csv(csv_path)
    # A header-only log has no final epoch to read.
    if temp_df.empty:
        continue
    records.append([
        criterion,
        model_name,
        n,
        random_state,
        temp_df.train_log_marglik.values[-1],
        temp_df.val_or_test_acc.values[-1],
        temp_df.val_or_test_nll.values[-1],
    ])
retrained_df = pd.DataFrame(records, columns=columns)

# Drop runs whose final log marginal likelihood is missing, then keep the run
# with the highest value in each group (one lr_0 per group).
filtered_df = retrained_df[retrained_df['train_log_marglik'].notna()]
best_indices = filtered_df.groupby(['criterion', 'n', 'random_state'])['train_log_marglik'].idxmax()
retrained_df = filtered_df.loc[best_indices]

retrained_df

Unnamed: 0,criterion,model_name,n,random_state,train_log_marglik,test_acc,test_nll
6,l2-zero,l2-zero_lr_0=0.01_n=510_random_state=1001,510,1001,-784.824951,0.84659,0.730909
7,l2-zero,l2-zero_lr_0=0.01_n=510_random_state=2001,510,2001,-779.792603,0.85345,0.686262
8,l2-zero,l2-zero_lr_0=0.01_n=510_random_state=3001,510,3001,-796.898254,0.858159,0.700889
9,l2-zero,l2-zero_lr_0=0.01_n=1020_random_state=1001,1020,1001,-957.297668,0.926593,0.367216
10,l2-zero,l2-zero_lr_0=0.01_n=1020_random_state=2001,1020,2001,-970.836426,0.923874,0.354911
11,l2-zero,l2-zero_lr_0=0.01_n=1020_random_state=3001,1020,3001,-960.152344,0.924812,0.363436


In [24]:
# Collapse the rows into one per (criterion, n); every remaining column
# becomes a tuple of the grouped values.
grouped_df = retrained_df.groupby(['criterion', 'n']).agg(lambda x: tuple(x))
columns = ['test_acc', 'test_nll']
# Statistics computed over each tuple. np.std is called with its default
# ddof=0 (population standard deviation).
summary_stats = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
for column in columns:
    for stat_name, stat_fn in summary_stats.items():
        grouped_df[f'{column}_{stat_name}'] = grouped_df[column].apply(lambda item: stat_fn(item))
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,criterion,n,model_name,random_state,train_log_marglik,test_acc,test_nll,test_acc_mean,test_acc_std,test_acc_min,test_acc_max,test_nll_mean,test_nll_std,test_nll_min,test_nll_max
0,l2-zero,510,"(l2-zero_lr_0=0.01_n=510_random_state=1001, l2...","(1001, 2001, 3001)","(-784.824951171875, -779.7926025390625, -796.8...","(0.8465895652770996, 0.853449821472168, 0.8581...","(0.730909150323709, 0.6862618204261965, 0.7008...",0.852733,0.00475,0.84659,0.858159,0.70602,0.018585,0.686262,0.730909
1,l2-zero,1020,"(l2-zero_lr_0=0.01_n=1020_random_state=1001, l...","(1001, 2001, 3001)","(-957.2976684570312, -970.83642578125, -960.15...","(0.9265932440757751, 0.9238739013671876, 0.924...","(0.3672155920624524, 0.3549106962886675, 0.363...",0.925093,0.001128,0.923874,0.926593,0.361854,0.005146,0.354911,0.367216


In [25]:
# Test accuracy summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_acc_mean', 'test_acc_min', 'test_acc_max']]

Unnamed: 0,criterion,n,test_acc_mean,test_acc_min,test_acc_max
0,l2-zero,510,0.852733,0.84659,0.858159
1,l2-zero,1020,0.925093,0.923874,0.926593


In [26]:
# Test negative log-likelihood summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_nll_mean', 'test_nll_min', 'test_nll_max']]

Unnamed: 0,criterion,n,test_nll_mean,test_nll_min,test_nll_max
0,l2-zero,510,0.70602,0.686262,0.730909
1,l2-zero,1020,0.361854,0.354911,0.367216


In [27]:
# Collect the final-epoch metrics of every available 'l2-sp' Flowers-102 run,
# then keep the best run per (criterion, n, random_state).
criterion = 'l2-sp'
dataset_directory = '/cluster/tufts/hugheslab/eharve06/Flowers_102'
experiments_directory = '/cluster/tufts/hugheslab/eharve06/data-emphasized-ELBo/experiments/retrained_Flowers_102_diagEF'
lr_0s = [0.1, 0.01, 0.001, 0.0001]
ns = [510, 1020]
random_states = [1001, 2001, 3001]

columns = ['criterion', 'model_name', 'n', 'random_state', 'train_log_marglik', 'test_acc', 'test_nll']

# Gather rows in a list and build the frame once, instead of enlarging a
# DataFrame one row at a time.
records = []
for lr_0, n, random_state in itertools.product(lr_0s, ns, random_states):
    model_name = f'{criterion}_lr_0={lr_0}_n={n}_random_state={random_state}'
    csv_path = f'{experiments_directory}/{model_name}.csv'
    # Configurations without a log file are skipped.
    if not os.path.exists(csv_path):
        continue
    temp_df = pd.read_csv(csv_path)
    # A header-only log has no final epoch to read.
    if temp_df.empty:
        continue
    records.append([
        criterion,
        model_name,
        n,
        random_state,
        temp_df.train_log_marglik.values[-1],
        temp_df.val_or_test_acc.values[-1],
        temp_df.val_or_test_nll.values[-1],
    ])
retrained_df = pd.DataFrame(records, columns=columns)

# Drop runs whose final log marginal likelihood is missing, then keep the run
# with the highest value in each group (one lr_0 per group).
filtered_df = retrained_df[retrained_df['train_log_marglik'].notna()]
best_indices = filtered_df.groupby(['criterion', 'n', 'random_state'])['train_log_marglik'].idxmax()
retrained_df = filtered_df.loc[best_indices]

retrained_df

Unnamed: 0,criterion,model_name,n,random_state,train_log_marglik,test_acc,test_nll
0,l2-sp,l2-sp_lr_0=0.1_n=510_random_state=1001,510,1001,-2358.463867,0.296046,4.624085
1,l2-sp,l2-sp_lr_0=0.1_n=510_random_state=2001,510,2001,-2358.49585,0.203985,4.624209
2,l2-sp,l2-sp_lr_0=0.1_n=510_random_state=3001,510,3001,-2358.385254,0.187693,4.623873
9,l2-sp,l2-sp_lr_0=0.01_n=1020_random_state=1001,1020,1001,-4229.246582,0.82662,2.602177
10,l2-sp,l2-sp_lr_0=0.01_n=1020_random_state=2001,1020,2001,-4264.004883,0.825626,2.599816
11,l2-sp,l2-sp_lr_0=0.01_n=1020_random_state=3001,1020,3001,-4304.702148,0.827271,2.610205


In [28]:
# Collapse the rows into one per (criterion, n); every remaining column
# becomes a tuple of the grouped values.
grouped_df = retrained_df.groupby(['criterion', 'n']).agg(lambda x: tuple(x))
columns = ['test_acc', 'test_nll']
# Statistics computed over each tuple. np.std is called with its default
# ddof=0 (population standard deviation).
summary_stats = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}
for column in columns:
    for stat_name, stat_fn in summary_stats.items():
        grouped_df[f'{column}_{stat_name}'] = grouped_df[column].apply(lambda item: stat_fn(item))
grouped_df = grouped_df.reset_index()
grouped_df

Unnamed: 0,criterion,n,model_name,random_state,train_log_marglik,test_acc,test_nll,test_acc_mean,test_acc_std,test_acc_min,test_acc_max,test_nll_mean,test_nll_std,test_nll_min,test_nll_max
0,l2-sp,510,"(l2-sp_lr_0=0.1_n=510_random_state=1001, l2-sp...","(1001, 2001, 3001)","(-2358.4638671875, -2358.495849609375, -2358.3...","(0.2960460782051086, 0.2039852440357208, 0.187...","(4.624085110481203, 4.624208929403407, 4.62387...",0.229241,0.047704,0.187693,0.296046,4.624056,0.000139,4.623873,4.624209
1,l2-sp,1020,"(l2-sp_lr_0=0.01_n=1020_random_state=1001, l2-...","(1001, 2001, 3001)","(-4229.24658203125, -4264.0048828125, -4304.70...","(0.8266200423240662, 0.8256259560585022, 0.827...","(2.602176932750397, 2.599815851441242, 2.61020...",0.826506,0.000677,0.825626,0.827271,2.604066,0.004447,2.599816,2.610205


In [29]:
# Test accuracy summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_acc_mean', 'test_acc_min', 'test_acc_max']]

Unnamed: 0,criterion,n,test_acc_mean,test_acc_min,test_acc_max
0,l2-sp,510,0.229241,0.187693,0.296046
1,l2-sp,1020,0.826506,0.825626,0.827271


In [30]:
# Test negative log-likelihood summary (mean / min / max) per (criterion, n).
grouped_df.loc[:, ['criterion', 'n', 'test_nll_mean', 'test_nll_min', 'test_nll_max']]

Unnamed: 0,criterion,n,test_nll_mean,test_nll_min,test_nll_max
0,l2-sp,510,4.624056,4.623873,4.624209
1,l2-sp,1020,2.604066,2.599816,2.610205


In [3]:
# Settings handed to utils.get_cifar10_datasets below.
dataset_directory = '/cluster/tufts/hugheslab/eharve06/CIFAR-10'
random_state = 1001
tune = False
n = 1000
(
    augmented_train_dataset,
    train_dataset,
    val_or_test_dataset,
) = utils.get_cifar10_datasets(dataset_directory, n, tune, random_state)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
batch_size = 128
num_workers = 0
# Shuffled loader over the augmented training set; drop_last=True discards a
# trailing partial batch.
augmented_train_loader = torch.utils.data.DataLoader(
    augmented_train_dataset,
    batch_size=min(batch_size, len(augmented_train_dataset)),
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
# NOTE(review): train_loader also wraps the augmented dataset with shuffling;
# the commented-out alternative below uses train_dataset without shuffling -
# confirm which one is intended.
train_loader = torch.utils.data.DataLoader(
    augmented_train_dataset,
    batch_size=min(batch_size, len(augmented_train_dataset)),
    shuffle=True,
    num_workers=num_workers,
)
#train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=min(batch_size, len(train_dataset)), num_workers=num_workers)
val_or_test_loader = torch.utils.data.DataLoader(
    val_or_test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
)

In [8]:
# Convert a fixed budget of optimisation steps into a whole number of epochs.
steps = 6000
num_batches = len(augmented_train_loader)
epochs = steps // num_batches
epochs

857

In [9]:
num_classes = 10
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# ResNet-50 built without a `weights` argument, with its final layer swapped for a
# `num_classes`-way linear head.
model = torchvision.models.resnet50()
model.fc = torch.nn.Linear(in_features=2048, out_features=num_classes, bias=True)
model.to(device)

# Backbone location vector: loaded on CPU first, then moved to the working device.
bb_loc = torch.load(
    '/cluster/tufts/hugheslab/eharve06/resnet50_torchvision/resnet50_torchvision_mean.pt',
    map_location=torch.device('cpu'),
    weights_only=False,
)
bb_loc = bb_loc.to(device)
# One entry per classifier weight and bias.
# NOTE(review): this location is all ones, whereas the later ConvNeXt Laplace cell uses
# zeros for the classifier prior mean - confirm which is intended.
clf_loc = torch.ones((2048 * num_classes) + num_classes)
clf_loc = clf_loc.to(device)

cuda:0


In [11]:
num_classes = 10
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# ViT-B/16 built without a `weights` argument, with its head swapped for a
# `num_classes`-way linear layer.
model = torchvision.models.vit_b_16()
model.heads.head = torch.nn.Linear(in_features=768, out_features=num_classes, bias=True)
model.to(device)

# Backbone location vector: loaded on CPU first, then moved to the working device.
bb_loc = torch.load(
    '/cluster/tufts/hugheslab/eharve06/vit_b_16_torchvision/vit_b_16_torchvision_mean.pt',
    map_location=torch.device('cpu'),
    weights_only=False,
)
bb_loc = bb_loc.to(device)
# One entry per classifier weight and bias.
# NOTE(review): this location is all ones, whereas the later ConvNeXt Laplace cell uses
# zeros for the classifier prior mean - confirm which is intended.
clf_loc = torch.ones((768 * num_classes) + num_classes)
clf_loc = clf_loc.to(device)

cuda:0


In [9]:
num_classes = 10
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# ConvNeXt-Tiny built without a `weights` argument, with the last classifier layer
# swapped for a `num_classes`-way linear layer.
model = torchvision.models.convnext_tiny()
model.classifier[2] = torch.nn.Linear(in_features=768, out_features=num_classes, bias=True)
model.to(device)

# Backbone location vector: loaded on CPU first, then moved to the working device.
bb_loc = torch.load(
    '/cluster/tufts/hugheslab/eharve06/convnext_tiny_torchvision/convnext_tiny_torchvision_mean.pt',
    map_location=torch.device('cpu'),
    weights_only=False,
)
bb_loc = bb_loc.to(device)
# One entry per classifier weight and bias.
# NOTE(review): this location is all ones, whereas the later ConvNeXt Laplace cell uses
# zeros for the classifier prior mean - confirm which is intended.
clf_loc = torch.ones((768 * num_classes) + num_classes)
clf_loc = clf_loc.to(device)

cuda:0


In [3]:
from laplace.baselaplace import DiagLaplace

class L2SPLaplace(DiagLaplace):
    """Diagonal Laplace approximation with two prior precisions.

    ``prior_precision`` is expected to hold ``[alpha, beta]``: ``alpha`` is applied to
    the first ``D`` parameters and ``beta`` to the remaining ``n_params - D``.
    A single-element (or scalar) precision is shared by both groups.

    ``D`` is not set by this class; assign it on the instance (e.g. ``la.D = ...``)
    before ``prior_precision_diag`` is read.
    """

    @property
    def prior_precision(self) -> torch.Tensor:
        """Return the prior precision tensor exactly as stored by the setter."""
        return self._prior_precision

    @prior_precision.setter
    def prior_precision(self, prior_precision: float | torch.Tensor):
        """Store the prior precision on this object's device and dtype.

        Floats and other non-tensor values are converted to a tensor first, and a
        0-d tensor is reshaped to one element so that it can be indexed later.
        Tensors with one or more dimensions are stored as given (only moved/cast),
        so autograd history is kept during prior-precision optimisation.
        """
        # Reset the posterior scale whenever the prior changes.
        # NOTE(review): presumably a cache recomputed lazily by the base class - confirm.
        self._posterior_scale = None
        if not torch.is_tensor(prior_precision):
            # The annotation allows a float, which has no `.to()`; convert it.
            prior_precision = torch.tensor(prior_precision)
        if prior_precision.ndim == 0:
            prior_precision = prior_precision.reshape(-1)
        self._prior_precision = prior_precision.to(
            device=self._device, dtype=self._dtype
        )

    @property
    def prior_precision_diag(self) -> torch.Tensor:
        """Return the per-parameter prior precision vector of length ``n_params``.

        The first ``D`` entries equal ``prior_precision[0]``; the remaining entries
        equal ``prior_precision[1]`` (or ``prior_precision[0]`` when only one value
        was provided).
        """
        precision = self.prior_precision
        alpha = precision[0]
        # Fall back to the shared value when only one precision is available.
        beta = precision[1] if precision.numel() > 1 else precision[0]
        first_group = alpha * torch.ones(self.D, device=self._device, dtype=self._dtype)
        second_group = beta * torch.ones(
            self.n_params - self.D, device=self._device, dtype=self._dtype
        )
        return torch.cat([first_group, second_group])

# Initial [alpha, beta] prior precisions for the backbone and classifier parameter groups.
prior_precision = torch.tensor([1.0, 1.0]).to(device)
# Prior mean: backbone location followed by classifier location.
prior_mean = torch.cat((bb_loc, clf_loc))
la = L2SPLaplace(
    model=model,
    likelihood='classification',
    prior_precision=prior_precision,
    prior_mean=prior_mean,
    # Referenced through the `laplace` package imported at the top of the notebook;
    # the bare name `AsdlEF` is only imported in a later cell.
    backend=laplace.curvature.AsdlEF,
)
# Number of leading parameters that belong to the backbone (read by prior_precision_diag).
la.D = len(bb_loc)
# Precisions scaled by the training-set size.
alpha = la.prior_precision[0].item() / len(train_dataset)
beta = la.prior_precision[1].item() / len(train_dataset)

steps = 6000
num_batches = len(augmented_train_loader)
epochs = int(steps/num_batches)

epoch = 0

# F: run the update every F epochs; K: marginal-likelihood optimisation steps; gamma: its learning rate.
F, K, gamma = 10, 100, 1
if epoch % F == 0:

    print(la.prior_precision)
    la.fit(train_loader)
    la.optimize_prior_precision(
        pred_type='glm',
        method='marglik',
        n_steps=K,
        lr=gamma,
        init_prior_prec=la.prior_precision
    )
    # Refresh the scaled precisions from the optimised values.
    alpha = la.prior_precision[0].item() / len(train_dataset)
    beta = la.prior_precision[1].item() / len(train_dataset)
    print(la.prior_precision)

NameError: name 'model' is not defined

In [10]:
from sklearn.model_selection import train_test_split
# Hugging Face
import datasets
import transformers

def get_news_ag_datasets(dataset_directory, n, tune, random_state):
    """Load AG News and return a label-stratified subset of ``n`` training examples.

    Args:
        dataset_directory: Cache directory passed to ``datasets.load_dataset``.
        n: Number of training examples to keep. When ``n`` equals the size of the
            full training split, every training example is used.
        tune: If True, the ``n`` examples are split 80/20 (stratified by label) into
            'train' and 'val_or_test'. If False, all ``n`` examples form 'train' and
            the official test split is returned as 'val_or_test'.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``datasets.DatasetDict`` with the keys 'train' and 'val_or_test'.
    """
    ag_news_dataset = datasets.load_dataset('ag_news', cache_dir=dataset_directory)
    full_train_dataset = pd.DataFrame(ag_news_dataset['train'])
    full_test_dataset = pd.DataFrame(ag_news_dataset['test'])
    train_labels = np.array(full_train_dataset.label)

    if n == len(full_train_dataset):
        train_and_val_indices = np.arange(0, len(full_train_dataset))
    else:
        train_and_val_indices, _ = train_test_split(
            np.arange(0, len(full_train_dataset)),
            test_size=None,
            train_size=n,
            random_state=random_state,
            shuffle=True,
            stratify=train_labels,
        )

    if not tune:
        # No validation split is needed here. Skipping it avoids an unused stratified
        # split that raises for small `n` (too few validation examples per class).
        return datasets.DatasetDict({
            'train': datasets.Dataset.from_pandas(full_train_dataset.iloc[train_and_val_indices]),
            'val_or_test': datasets.Dataset.from_pandas(full_test_dataset)
        })

    # Hold out one fifth of the selected examples for validation.
    val_size = int((1/5) * n)
    train_indices, val_indices = train_test_split(
        train_and_val_indices,
        test_size=val_size,
        train_size=n-val_size,
        random_state=random_state,
        shuffle=True,
        stratify=train_labels[train_and_val_indices],
    )
    return datasets.DatasetDict({
        'train': datasets.Dataset.from_pandas(full_train_dataset.iloc[train_indices]),
        'val_or_test': datasets.Dataset.from_pandas(full_train_dataset.iloc[val_indices])
    })


In [20]:
# AG News experiment configuration.
dataset_directory = '/cluster/tufts/hugheslab/eharve06/AG_News'
n, tune, random_state = 400, False, 1001
dataset = get_news_ag_datasets(
    dataset_directory=dataset_directory,
    n=n,
    tune=tune,
    random_state=random_state,
)

# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    """Tokenize ``example['text']`` with the module-level ``tokenizer``.

    Inputs are truncated and padded to a fixed length of 512 tokens.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(example['text'], **tokenizer_options)

# Tokenize every split in batches, then rename "label" to "labels".
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Expose only the model inputs and labels, as torch tensors, for both splits.
for split_name in ('train', 'val_or_test'):
    tokenized_datasets[split_name].set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels'],
    )

batch_size = 32
# Shuffled loader over the training split.
augmented_train_loader = torch.utils.data.DataLoader(
    tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True,
)
# Unshuffled loader over the same training split.
train_loader = torch.utils.data.DataLoader(
    tokenized_datasets['train'],
    batch_size=batch_size,
)
val_or_test_loader = torch.utils.data.DataLoader(
    tokenized_datasets['val_or_test'],
    batch_size=batch_size,
)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

Map: 100%|██████████| 400/400 [00:00<00:00, 1645.37 examples/s]
Map: 100%|██████████| 7600/7600 [00:04<00:00, 1692.32 examples/s]

cuda





In [30]:
import transformers
# Load 'bert-base-uncased' with a 4-label sequence-classification head, then display the head.
model = transformers.BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)
model.classifier

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=4, bias=True)

In [18]:
from torchvision.ops.stochastic_depth import StochasticDepth
from torchvision.ops.misc import Permute


In [21]:
import torch
import torch.nn as nn
import typing

class MyBERT(torch.nn.Module):
    """Wrap Hugging Face BERT so that it maps a batch mapping to a logits tensor.

    ``forward`` takes one mapping holding "input_ids" and "attention_mask" and
    returns only the logits.
    """

    def __init__(self, tokenizer: transformers.PreTrainedTokenizer, num_labels: int):
        """Build the 'bert-base-uncased' classifier.

        Args:
            tokenizer: Tokenizer whose ``pad_token_id`` is copied into the model config.
            num_labels: Number of output classes.
        """
        super().__init__()
        checkpoint = "bert-base-uncased"
        bert_config = transformers.BertConfig.from_pretrained(checkpoint)
        bert_config.pad_token_id = tokenizer.pad_token_id
        bert_config.num_labels = num_labels
        self.hf_model = transformers.BertForSequenceClassification.from_pretrained(
            checkpoint, config=bert_config
        )

    def forward(self, data: typing.MutableMapping) -> torch.Tensor:
        """Return the classification logits for one batch.

        The inputs are moved to the device of the model's first parameter before
        the forward pass.
        """
        target_device = next(self.parameters()).device
        model_inputs = {
            "input_ids": data["input_ids"].to(target_device),
            "attention_mask": data["attention_mask"].to(target_device),
        }
        return self.hf_model(**model_inputs).logits
    
num_classes = 4
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = MyBERT(tokenizer, num_classes)

la = DiagLaplace(
    model=model,
    likelihood='classification',
    prior_precision=1.0,
    # Referenced through the `laplace` package imported at the top of the notebook;
    # the bare name `AsdlEF` is only imported in a later cell.
    backend=laplace.curvature.AsdlEF,
)

epoch = 0

# F: run the update every F epochs; K: marginal-likelihood optimisation steps; gamma: its learning rate.
F, K, gamma = 10, 100, 1
if epoch % F == 0:

    # Print the prior precision before and after marginal-likelihood optimisation.
    print(la.prior_precision)
    la.fit(train_loader)
    la.optimize_prior_precision(
        pred_type='glm',
        method='marglik',
        n_steps=K,
        lr=gamma,
        init_prior_prec=la.prior_precision
    )
    print(la.prior_precision)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([1.])


  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


tensor([0.3453])


In [8]:
from laplace.curvature import AsdlEF

num_classes = 10

# ConvNeXt-Tiny built without a `weights` argument.
model = torchvision.models.convnext_tiny()

# Swap LayerNorm2d layers via the project helper, then replace every CNBlock with
# layers.ModifiedCNBlock, reusing the original block's flattened layer scale and
# stochastic-depth probability.
utils.replace_layernorm2d(model)
for stage_idx, stage in enumerate(model.features):
    for block_idx, block in enumerate(stage):
        if isinstance(block, torchvision.models.convnext.CNBlock):
            model.features[stage_idx][block_idx] = layers.ModifiedCNBlock(
                dim=len(block.layer_scale.view(-1)),
                layer_scale=block.layer_scale.view(-1),
                stochastic_depth_prob=block.stochastic_depth.p
            )

# New `num_classes`-way linear head.
model.classifier[2] = torch.nn.Linear(in_features=768, out_features=num_classes, bias=True)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# Prior mean: backbone values loaded from disk, followed by zeros for the classifier
# weights and biases.
backbone_prior_params = torch.load("/cluster/tufts/hugheslab/eharve06/convnext_tiny_torchvision/convnext_tiny_torchvision_mean.pt", map_location=device, weights_only=False)
classifier_prior_params = torch.zeros((768 * num_classes) + num_classes, device=device)

# Two initial prior precisions.
# NOTE(review): presumably one for the backbone and one for the classifier - confirm in models.L2SPLaplace.
prior_precision = torch.tensor([1.0, 1.0], device=device)
prior_mean = torch.cat((backbone_prior_params, classifier_prior_params))
la = models.L2SPLaplace(
    model=model,
    likelihood='classification',
    prior_precision=prior_precision,
    prior_mean=prior_mean,
    backend=laplace.curvature.AsdlEF,
)
# Set after construction; models.L2SPLaplace presumably reads this to split the parameters - confirm.
la.num_backbone_params = len(backbone_prior_params)

epoch = 0

# F: run the update every F epochs; K: marginal-likelihood optimisation steps; gamma: its learning rate.
F, K, gamma = 10, 100, 1
if epoch % F == 0:
    
    # Print the prior precision before and after marginal-likelihood optimisation.
    print(la.prior_precision)
    la.fit(train_loader)
    la.optimize_prior_precision(
        pred_type='glm', 
        method='marglik', 
        n_steps=K, 
        lr=gamma, 
        init_prior_prec=la.prior_precision
    )
    print(la.prior_precision)

cuda:0
tensor([1., 1.], device='cuda:0')
tensor([  1.6434, 405.6746], device='cuda:0')


In [17]:
import torchvision
from asdl.precondition import PreconditioningConfig, KfacGradientMaker

# Only the active architecture is constructed; the alternatives are kept as comments for
# quick switching. Previously a ResNet-50 was built and immediately overwritten.
#model = torchvision.models.resnet50()
#model = torchvision.models.vit_b_16()
model = torchvision.models.convnext_tiny()
# NOTE(review): data_size=128 presumably mirrors the training batch size used earlier - confirm.
config = PreconditioningConfig(data_size=128, damping=0.01)
# K-FAC gradient maker configured with the "fisher_emp" Fisher type.
gm = KfacGradientMaker(model, config, fisher_type="fisher_emp")


KeyboardInterrupt



In [15]:
import torch
import torch.nn as nn
import torchvision

class Permute(nn.Module):
    """Module that reorders the dimensions of its input tensor.

    Args:
        dims: The desired ordering of dimensions, stored unchanged on ``self.dims``.
    """

    def __init__(self, dims):
        super().__init__()
        self.dims = dims

    def forward(self, x):
        """Return ``x`` with its dimensions permuted according to ``self.dims``."""
        return torch.permute(x, tuple(self.dims))

def replace_layernorm2d(module):
    """Recursively replace torchvision ``LayerNorm2d`` layers in ``module``, in place.

    Each ``LayerNorm2d`` child becomes ``Permute -> nn.LayerNorm -> Permute``: the
    input is permuted with ``[0, 2, 3, 1]``, normalised over the last dimension and
    permuted back with ``[0, 3, 1, 2]``.

    The new ``nn.LayerNorm`` takes over the original layer's ``normalized_shape``,
    ``eps``, ``elementwise_affine`` setting, parameter values, device and dtype, so
    a model with trained weights produces the same output after the replacement.

    Args:
        module: The ``nn.Module`` whose children are modified.
    """
    for name, child in list(module.named_children()):
        if isinstance(child, torchvision.models.convnext.LayerNorm2d):
            layer_norm = nn.LayerNorm(
                child.normalized_shape,
                eps=child.eps,
                elementwise_affine=child.elementwise_affine,
            )
            if child.elementwise_affine:
                # Match device/dtype first, then copy the learned scale and shift.
                layer_norm.to(device=child.weight.device, dtype=child.weight.dtype)
                layer_norm.load_state_dict(child.state_dict())
            replacement = nn.Sequential(
                Permute([0, 2, 3, 1]),
                layer_norm,
                Permute([0, 3, 1, 2]),
            )
            # Keep the train/eval flag of the layer being replaced.
            replacement.train(child.training)
            setattr(module, name, replacement)
        else:
            replace_layernorm2d(child)

# Build ConvNeXt-Tiny without a `weights` argument and replace its LayerNorm2d layers in place.
model = torchvision.models.convnext_tiny()
replace_layernorm2d(model)

In [17]:
# Smoke test: run one random 224x224 RGB input through the modified model and display the output.
x = torch.randn(1, 3, 224, 224)
model(x)

tensor([[-4.0289e-01,  3.1997e-01,  4.1087e-01,  4.2722e-01,  1.3026e+00,
         -9.0285e-01,  5.4405e-01,  1.5292e-01, -9.5151e-02,  3.8550e-01,
          4.6962e-01, -3.4777e-01,  1.7104e-01,  9.1534e-02,  1.0586e+00,
          2.9093e-01, -1.5888e-01,  1.7146e-01,  5.8896e-01, -1.1887e+00,
         -4.6582e-01,  8.2971e-02,  5.2676e-01,  1.7226e-01,  3.3745e-02,
          1.0056e+00, -3.9954e-01, -1.4819e-01, -5.0637e-01,  1.0192e+00,
          6.9969e-01, -3.0734e-02,  1.0523e+00,  1.1852e-01, -7.7666e-01,
          6.6544e-01, -1.8420e-01,  8.5591e-01, -5.7208e-01,  6.3850e-01,
         -7.2485e-01, -8.3990e-01, -5.7616e-01,  3.8271e-01,  3.2617e-01,
         -3.7054e-03,  1.0169e-01, -1.9292e-01, -7.7529e-02, -5.1761e-01,
         -8.8081e-02, -7.7610e-02, -5.4441e-01, -4.6589e-01, -6.3042e-01,
         -8.2017e-01,  1.9869e-01, -8.3109e-02,  2.8478e-01, -8.9875e-02,
         -1.7726e-01, -2.0488e-01, -4.2953e-01,  3.4150e-01,  4.6771e-01,
         -2.2265e-01,  8.6803e-01,  9.

In [5]:
# Only the active architecture is constructed; the alternatives are kept as comments for
# quick switching. Previously a ResNet-50 was built and immediately overwritten.
#model = torchvision.models.resnet50()
#model = torchvision.models.vit_b_16()
model = torchvision.models.convnext_tiny()

# Layer types treated as supported; any other module that directly owns trainable
# parameters is reported below.
supported_layers = ['Linear', 'Conv2d', 'BatchNorm1d', 'BatchNorm2d', 'LayerNorm', 'Embedding']

for module in model.modules():
    # Only look at parameters owned directly by this module, not by its children.
    module_requires_grad = any(p.requires_grad for p in module.parameters(recurse=False))
    # Class name taken from the type rather than parsed out of the full recursive repr.
    module_name = type(module).__name__
    if module_requires_grad and (module_name not in supported_layers):
        print(module_name)
        print([name for name, param in module.named_parameters(recurse=False)])

LayerNorm2d
['weight', 'bias']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
LayerNorm2d
['weight', 'bias']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
LayerNorm2d
['weight', 'bias']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
LayerNorm2d
['weight', 'bias']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
CNBlock
['layer_scale']
LayerNorm2d
['weight', 'bias']
