In [1]:
import os

In [2]:
os.chdir('../')

In [3]:
os.getcwd()


'/mfs1/u/viet/bayesian_dpddm'

In [4]:
os.environ['HYDRA_FULL_ERROR'] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import wandb
import hydra
import pandas as pd
from omegaconf import DictConfig, OmegaConf
from tqdm import tqdm
import sys
from hydra import compose, initialize

import fcntl

from bayesian_dpddm.monitors import DPDDMBayesianMonitor, DPDDMFullInformationMonitor, DPDDMBERTMonitor
from bayesian_dpddm.models import ConvModel, MLPModel, ResNetModel, BERTModel

import torch
import torch.nn as nn
#import torch.multiprocessing as mp
import numpy as np
from experiments.utils import get_datasets, get_configs
from pprint import pprint

torch.backends.cudnn.benchmark = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# base models
base_models = {
    'cifar10': ConvModel,
    'uci': MLPModel,
    'synthetic': MLPModel,
    'camelyon17': ResNetModel,
    'civilcomments': BERTModel,
}


monitors = {
    'bayesian': DPDDMBayesianMonitor,
    'fi': DPDDMFullInformationMonitor,
    'bert': DPDDMBERTMonitor,
}

In [136]:
config_path = 'configs/'
config_name = 'cifar10_best_'
with initialize(config_path=config_path, version_base='1.2'):
    args = compose(config_name=config_name)

# cfg is now a DictConfig
pprint(args)
args.dpddm.data_sample_size = 100

{'train': {'disagreement_epochs': 5, 'disagreement_optimizer': 'torch.optim.AdamW', 'disagreement_wd': 0.0001, 'disagreement_lr': 0.01, 'disagreement_batch_size': 64, 'disagreement_alpha': 0.8, 'num_epochs': 10, 'batch_size': 64, 'lr': 0.001, 'wd': 0.0001, 'optimizer': 'torch.optim.AdamW', 'clip_val': 1, 'val_freq': 1, 'num_workers': 4, 'pin_memory': True}, 'dataset': {'name': 'cifar10', 'num_classes': 10, 'data_dir': 'data/cifar10_data'}, 'dpddm': {'Phi_size': 1000, 'n_post_samples': 5000, 'data_sample_size': 10, 'temp': 1, 'n_repeats': 100}, 'model': {'name': 'conv_model', 'in_channels': 3, 'mid_channels': 128, 'kernel_size': 7, 'mid_layers': 3, 'pool_dims': 2, 'hidden_dim': 256, 'dropout': 0.0, 'reg_weight_factor': 10, 'param': 'diagonal', 'prior_scale': 1.0, 'wishart_scale': 1.0, 'return_ood': False}, 'wandb_cfg': {'project': 'bayesian_dpddm', 'entity': 'viet', 'job_type': 'train', 'log_artifacts': True}, 'from_pretrained': False, 'seed': 57, 'monitor_type': 'bayesian', 'self_log':

In [137]:
dataset = get_datasets(args)
model_config, train_config = get_configs(args)
trainloader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=train_config.batch_size,
    shuffle=True,
    num_workers=train_config.num_workers,
    #pin_memory=train_config.pin_memory,
    #persistent_workers=True,
)

valloader = torch.utils.data.DataLoader(
    dataset['valid'],
    batch_size=train_config.batch_size,
    shuffle=False,
    num_workers=train_config.num_workers,
    #pin_memory=train_config.pin_memory,
    #persistent_workers=True,
)

oodloader = torch.utils.data.DataLoader(
    dataset['dpddm_ood'],
    batch_size = train_config.batch_size,
    shuffle=False,
    num_workers=train_config.num_workers,
    #pin_memory=True, 
    #persistent_workers=True
)

In [138]:
''' Build model and monitor '''
base_model = base_models[args.dataset.name](model_config, train_size=len(dataset['train']))
monitor = monitors[args.monitor_type](
    model=base_model,
    trainset=dataset['train'],
    valset=dataset['valid'],
    train_cfg=train_config,
    device=device,
)

In [139]:
output_metrics = monitor.train_model(tqdm_enabled=True)

  0%|          | 0/10 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 10%|█         | 1/10 [00:12<01:51, 12.40s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 20%|██        | 2/10 [00:20<01:20, 10.01s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [140]:
for keys in ['train_acc', 'val_acc']:
    print(keys)
    print(monitor.output_metrics[keys][-1])

train_acc
0.698
val_acc
0.6938694267515924


In [10]:
args.dpddm.data_sample_size = 100
monitor.Phi = []
monitor.pretrain_disagreement_distribution(dataset=dataset['dpddm_train'],
                                        n_post_samples=args.dpddm.n_post_samples,
                                        data_sample_size=args.dpddm.data_sample_size,
                                        Phi_size=args.dpddm.Phi_size, 
                                        temperature=args.dpddm.temp,
                                        )

100%|██████████| 1000/1000 [00:02<00:00, 336.78it/s]


In [142]:
L_train, L_dpddm = len(dataset['train']), len(dataset['dpddm_train'])
L_train, L_dpddm

(40000, 10000)

In [19]:
from bayesian_dpddm.monitors.utils import sample_from_dataset
X, _ = sample_from_dataset(
    n_samples=args.dpddm.data_sample_size,
    dataset=dataset['dpddm_train'],
    replace=True
)
y_pseudo = monitor.get_pseudolabels(X).cpu()
X = X.cpu()

In [143]:
class JointDataset(torch.utils.data.Dataset):
    ''' Temporary dataset to batch for data that does not fit on GPU '''
    def __init__(self, X, y, train_size=0):
        self.X = X
        self.y = y
        self.train_size = train_size
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]  

In [21]:
X_cat = torch.concatenate([dataset['train'].X, X], dim=0)
y_cat = torch.concatenate([dataset['train'].y, y_pseudo], dim=0)

In [25]:
catset = JointDataset(X_cat, y_cat, train_size=len(dataset['train']))

In [27]:
X_cat, y_cat = X_cat.cuda(), y_cat.cuda()
monitor.model.eval()

MLPModel(
  (init_fc): Linear(in_features=9, out_features=16, bias=True)
  (mid_fc): ModuleList(
    (0-3): 4 x Linear(in_features=16, out_features=16, bias=True)
  )
  (out_layer): DiscClassification()
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
def compute_acc_train(monitor, trainset, d3mset, n_post_samples=5000):
    model = monitor.model
    X, _ = sample_from_dataset(
        n_samples=args.dpddm.data_sample_size,
        dataset=d3mset,
        replace=True
    )
    y_pseudo = monitor.get_pseudolabels(X).cpu()
    X = X.cpu()
    X_cat = torch.concatenate([trainset.dataset.data, X], dim=0)
    y_cat = torch.concatenate([trainset.dataset.targets, y_pseudo], dim=0)
    catset = JointDataset(X, y_pseudo, train_size=len(trainset))
    catset = torch.utils.data.ConcatDataset([dataset['train'].dataset, catset])
    catloader = torch.utils.data.DataLoader(catset, batch_size=32, shuffle=False)
    model.eval()
    all_preds = [] 
    all_labels = []
    with torch.no_grad():
        for features, labels in catloader:
            features, labels = features.cuda(), labels.cuda()
            out = monitor.model.get_features(features)
            ll_dist = monitor.model.out_layer.logit_predictive(out)
            logits_samples = ll_dist.rsample(sample_shape=torch.Size([n_post_samples]))
            logits_samples = logits_samples / 1
            dist = torch.distributions.Categorical(logits=logits_samples)
            y_hat = dist.sample()
            all_preds.append(y_hat.cpu())
            all_labels.append(labels.cpu())
    y_hat = torch.concatenate(all_preds, dim=1)
    y = torch.concatenate(all_labels, dim=0)
    y_tile = torch.tile(y, (n_post_samples, 1))
    dis_mat = (y_hat[:, L_train:] != y_tile[:, L_train:])
    dis_rate = dis_mat.sum(dim=-1)/len(dis_mat[0])
    max_dis_rate = torch.max(dis_rate).item()
    idx = torch.argmax(dis_rate).item()
    y_preds = y_hat[idx, :L_train]
    acc = (y_preds == y[:L_train]).float().mean()
    
    return dis_rate, acc, max_dis_rate

In [155]:
all_accs = []
for i in tqdm(range(1000)):
    dis_rate, acc, max_dis_rate = compute_acc_train(monitor, dataset['train'], dataset['dpddm_train'])
    all_accs.append(acc)

  0%|          | 0/1000 [00:00<?, ?it/s]


TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray

In [134]:
all_accs = all_accs.mean(axis=1)
np.mean(all_accs), np.std(all_accs)

(np.float64(0.6487234636871508), np.float64(0.022995452896299586))

In [106]:
n_post_samples = 5000
catloader = torch.utils.data.DataLoader(catset, batch_size=32, shuffle=False)
all_preds = [] 
all_labels = []
monitor.model.eval()
with torch.no_grad():
    for features, labels in catloader:
        features, labels = features.cuda(), labels.cuda()
        out = monitor.model.get_features(features)
        ll_dist = monitor.model.out_layer.logit_predictive(out)
        logits_samples = ll_dist.rsample(sample_shape=torch.Size([n_post_samples]))
        logits_samples = logits_samples / 1
        dist = torch.distributions.Categorical(logits=logits_samples)
        y_hat = dist.sample()
        all_preds.append(y_hat.cpu())
        all_labels.append(labels.cpu())
y_hat = torch.concatenate(all_preds, dim=1)
y = torch.concatenate(all_labels, dim=0)
y_tile = torch.tile(y, (n_post_samples, 1))
y_tile.shape
dis_mat = (y_hat[:, L_train:] != y_tile[:, L_train:])
dis_rate = dis_mat.sum(dim=-1)/len(dis_mat[0])